summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTony Gutierrez <anthony.gutierrez@amd.com>2016-01-19 14:28:22 -0500
committerTony Gutierrez <anthony.gutierrez@amd.com>2016-01-19 14:28:22 -0500
commit1a7d3f9fcb76a68540dd948f91413533a383bfde (patch)
tree867510a147cd095f19499d26b7c02d27de4cae9d
parent28e353e0403ea379d244a418e8dc8ee0b48187cf (diff)
downloadgem5-1a7d3f9fcb76a68540dd948f91413533a383bfde.tar.xz
gpu-compute: AMD's baseline GPU model
-rwxr-xr-xSConstruct41
-rw-r--r--build_opts/HSAIL_X865
-rw-r--r--build_opts/X86_MOESI_AMD_Base3
-rw-r--r--configs/common/GPUTLBConfig.py203
-rw-r--r--configs/common/GPUTLBOptions.py109
-rw-r--r--configs/example/apu_se.py499
-rw-r--r--configs/example/ruby_gpu_random_test.py187
-rw-r--r--configs/ruby/AMD_Base_Constructor.py134
-rw-r--r--configs/ruby/GPU_RfO.py751
-rw-r--r--configs/ruby/GPU_VIPER.py674
-rw-r--r--configs/ruby/GPU_VIPER_Baseline.py588
-rw-r--r--configs/ruby/GPU_VIPER_Region.py758
-rw-r--r--configs/ruby/MOESI_AMD_Base.py326
-rwxr-xr-xsrc/SConscript55
-rw-r--r--src/arch/SConscript8
-rw-r--r--src/arch/hsail/Brig.h67
-rw-r--r--src/arch/hsail/Brig_new.hpp1587
-rw-r--r--src/arch/hsail/SConscript54
-rw-r--r--src/arch/hsail/SConsopts40
-rwxr-xr-xsrc/arch/hsail/gen.py806
-rw-r--r--src/arch/hsail/generic_types.cc47
-rw-r--r--src/arch/hsail/generic_types.hh16
-rw-r--r--src/arch/hsail/gpu_decoder.hh77
-rw-r--r--src/arch/hsail/gpu_types.hh69
-rw-r--r--src/arch/hsail/insts/branch.cc86
-rw-r--r--src/arch/hsail/insts/branch.hh442
-rw-r--r--src/arch/hsail/insts/decl.hh1106
-rw-r--r--src/arch/hsail/insts/gpu_static_inst.cc64
-rw-r--r--src/arch/hsail/insts/gpu_static_inst.hh65
-rw-r--r--src/arch/hsail/insts/main.cc208
-rw-r--r--src/arch/hsail/insts/mem.cc139
-rw-r--r--src/arch/hsail/insts/mem.hh1629
-rw-r--r--src/arch/hsail/insts/mem_impl.hh660
-rw-r--r--src/arch/hsail/insts/pseudo_inst.cc787
-rw-r--r--src/arch/hsail/operand.cc449
-rw-r--r--src/arch/hsail/operand.hh768
-rw-r--r--src/gpu-compute/GPU.py310
-rw-r--r--src/gpu-compute/LdsState.py51
-rw-r--r--src/gpu-compute/SConscript99
-rw-r--r--src/gpu-compute/X86GPUTLB.py77
-rw-r--r--src/gpu-compute/brig_object.cc474
-rw-r--r--src/gpu-compute/brig_object.hh134
-rw-r--r--src/gpu-compute/cl_driver.cc272
-rw-r--r--src/gpu-compute/cl_driver.hh77
-rw-r--r--src/gpu-compute/cl_event.hh51
-rw-r--r--src/gpu-compute/code_enums.hh116
-rw-r--r--src/gpu-compute/compute_unit.cc1817
-rw-r--r--src/gpu-compute/compute_unit.hh767
-rw-r--r--src/gpu-compute/condition_register_state.cc83
-rw-r--r--src/gpu-compute/condition_register_state.hh101
-rw-r--r--src/gpu-compute/dispatcher.cc394
-rw-r--r--src/gpu-compute/dispatcher.hh163
-rw-r--r--src/gpu-compute/exec_stage.cc203
-rw-r--r--src/gpu-compute/exec_stage.hh129
-rw-r--r--src/gpu-compute/fetch_stage.cc106
-rw-r--r--src/gpu-compute/fetch_stage.hh78
-rw-r--r--src/gpu-compute/fetch_unit.cc293
-rw-r--r--src/gpu-compute/fetch_unit.hh89
-rw-r--r--src/gpu-compute/global_memory_pipeline.cc242
-rw-r--r--src/gpu-compute/global_memory_pipeline.hh123
-rw-r--r--src/gpu-compute/gpu_dyn_inst.cc198
-rw-r--r--src/gpu-compute/gpu_dyn_inst.hh464
-rw-r--r--src/gpu-compute/gpu_exec_context.cc53
-rw-r--r--src/gpu-compute/gpu_exec_context.hh54
-rw-r--r--src/gpu-compute/gpu_static_inst.cc42
-rw-r--r--src/gpu-compute/gpu_static_inst.hh166
-rw-r--r--src/gpu-compute/gpu_tlb.cc1801
-rw-r--r--src/gpu-compute/gpu_tlb.hh465
-rw-r--r--src/gpu-compute/hsa_code.hh101
-rw-r--r--src/gpu-compute/hsa_kernel_info.hh79
-rw-r--r--src/gpu-compute/hsa_object.cc76
-rw-r--r--src/gpu-compute/hsa_object.hh74
-rw-r--r--src/gpu-compute/hsail_code.cc453
-rw-r--r--src/gpu-compute/hsail_code.hh447
-rw-r--r--src/gpu-compute/kernel_cfg.cc296
-rw-r--r--src/gpu-compute/kernel_cfg.hh133
-rw-r--r--src/gpu-compute/lds_state.cc341
-rw-r--r--src/gpu-compute/lds_state.hh512
-rw-r--r--src/gpu-compute/local_memory_pipeline.cc200
-rw-r--r--src/gpu-compute/local_memory_pipeline.hh98
-rw-r--r--src/gpu-compute/misc.hh162
-rw-r--r--src/gpu-compute/ndrange.hh70
-rw-r--r--src/gpu-compute/of_scheduling_policy.cc76
-rw-r--r--src/gpu-compute/of_scheduling_policy.hh61
-rw-r--r--src/gpu-compute/pool_manager.cc42
-rw-r--r--src/gpu-compute/pool_manager.hh66
-rw-r--r--src/gpu-compute/qstruct.hh201
-rw-r--r--src/gpu-compute/rr_scheduling_policy.cc67
-rw-r--r--src/gpu-compute/rr_scheduling_policy.hh65
-rw-r--r--src/gpu-compute/schedule_stage.cc151
-rw-r--r--src/gpu-compute/schedule_stage.hh95
-rw-r--r--src/gpu-compute/scheduler.cc71
-rw-r--r--src/gpu-compute/scheduler.hh63
-rw-r--r--src/gpu-compute/scheduling_policy.hh57
-rw-r--r--src/gpu-compute/scoreboard_check_stage.cc173
-rw-r--r--src/gpu-compute/scoreboard_check_stage.hh106
-rw-r--r--src/gpu-compute/shader.cc412
-rw-r--r--src/gpu-compute/shader.hh212
-rw-r--r--src/gpu-compute/simple_pool_manager.cc108
-rw-r--r--src/gpu-compute/simple_pool_manager.hh72
-rw-r--r--src/gpu-compute/tlb_coalescer.cc583
-rw-r--r--src/gpu-compute/tlb_coalescer.hh252
-rw-r--r--src/gpu-compute/vector_register_file.cc251
-rw-r--r--src/gpu-compute/vector_register_file.hh142
-rw-r--r--src/gpu-compute/vector_register_state.cc58
-rw-r--r--src/gpu-compute/vector_register_state.hh101
-rw-r--r--src/gpu-compute/wavefront.cc925
-rw-r--r--src/gpu-compute/wavefront.hh368
-rw-r--r--src/mem/protocol/GPU_RfO-SQC.sm667
-rw-r--r--src/mem/protocol/GPU_RfO-TCC.sm1199
-rw-r--r--src/mem/protocol/GPU_RfO-TCCdir.sm2672
-rw-r--r--src/mem/protocol/GPU_RfO-TCP.sm1009
-rw-r--r--src/mem/protocol/GPU_RfO.slicc11
-rw-r--r--src/mem/protocol/GPU_VIPER-SQC.sm322
-rw-r--r--src/mem/protocol/GPU_VIPER-TCC.sm739
-rw-r--r--src/mem/protocol/GPU_VIPER-TCP.sm747
-rw-r--r--src/mem/protocol/GPU_VIPER.slicc9
-rw-r--r--src/mem/protocol/GPU_VIPER_Baseline.slicc9
-rw-r--r--src/mem/protocol/GPU_VIPER_Region-TCC.sm773
-rw-r--r--src/mem/protocol/GPU_VIPER_Region.slicc11
-rw-r--r--src/mem/protocol/MOESI_AMD_Base-CorePair.sm2904
-rw-r--r--src/mem/protocol/MOESI_AMD_Base-L3cache.sm1130
-rw-r--r--src/mem/protocol/MOESI_AMD_Base-Region-CorePair.sm3009
-rw-r--r--src/mem/protocol/MOESI_AMD_Base-Region-dir.sm2038
-rw-r--r--src/mem/protocol/MOESI_AMD_Base-Region-msg.sm291
-rw-r--r--src/mem/protocol/MOESI_AMD_Base-RegionBuffer.sm1368
-rw-r--r--src/mem/protocol/MOESI_AMD_Base-RegionDir.sm1187
-rw-r--r--src/mem/protocol/MOESI_AMD_Base-dir.sm1137
-rw-r--r--src/mem/protocol/MOESI_AMD_Base-msg.sm362
-rw-r--r--src/mem/protocol/MOESI_AMD_Base-probeFilter.sm1408
-rw-r--r--src/mem/protocol/MOESI_AMD_Base.slicc6
-rw-r--r--src/mem/protocol/RubySlicc_ComponentMapping.sm3
-rw-r--r--src/mem/protocol/RubySlicc_Exports.sm11
-rw-r--r--src/mem/protocol/RubySlicc_Types.sm45
-rw-r--r--src/mem/protocol/SConsopts5
-rw-r--r--src/mem/ruby/SConscript15
-rw-r--r--src/mem/ruby/profiler/Profiler.cc4
-rw-r--r--src/mem/ruby/slicc_interface/AbstractCacheEntry.hh6
-rw-r--r--src/mem/ruby/slicc_interface/AbstractController.cc6
-rw-r--r--src/mem/ruby/slicc_interface/AbstractController.hh3
-rw-r--r--src/mem/ruby/slicc_interface/RubySlicc_ComponentMapping.hh29
-rw-r--r--src/mem/ruby/structures/CacheMemory.cc50
-rw-r--r--src/mem/ruby/structures/CacheMemory.hh5
-rw-r--r--src/mem/ruby/structures/RubyCache.py1
-rw-r--r--src/mem/ruby/system/GPUCoalescer.cc1397
-rw-r--r--src/mem/ruby/system/GPUCoalescer.hh368
-rw-r--r--src/mem/ruby/system/GPUCoalescer.py48
-rw-r--r--src/mem/ruby/system/RubyPort.cc3
-rw-r--r--src/mem/ruby/system/RubyPort.hh4
-rw-r--r--src/mem/ruby/system/RubySystem.cc2
-rw-r--r--src/mem/ruby/system/SConscript10
-rw-r--r--src/mem/ruby/system/Sequencer.cc5
-rw-r--r--src/mem/ruby/system/Sequencer.hh3
-rw-r--r--src/mem/ruby/system/Sequencer.py86
-rw-r--r--src/mem/ruby/system/VIPERCoalescer.cc287
-rw-r--r--src/mem/ruby/system/VIPERCoalescer.hh75
-rw-r--r--src/mem/ruby/system/VIPERCoalescer.py45
-rw-r--r--src/mem/ruby/system/WeightedLRUPolicy.cc113
-rw-r--r--src/mem/ruby/system/WeightedLRUPolicy.hh62
-rw-r--r--src/mem/ruby/system/WeightedLRUReplacementPolicy.py45
-rw-r--r--src/mem/slicc/symbols/StateMachine.py44
-rw-r--r--tests/SConscript24
-rw-r--r--tests/configs/gpu-randomtest-ruby.py151
-rw-r--r--tests/configs/gpu-ruby.py353
-rw-r--r--tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_RfO/config.ini4423
-rwxr-xr-xtests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_RfO/simerr5
-rwxr-xr-xtests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_RfO/simout21
-rw-r--r--tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_RfO/stats.txt3202
-rw-r--r--tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER/config.ini4063
-rwxr-xr-xtests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER/simerr5
-rwxr-xr-xtests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER/simout21
-rw-r--r--tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER/stats.txt3201
-rw-r--r--tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Baseline/config.ini4089
-rwxr-xr-xtests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Baseline/simerr5
-rwxr-xr-xtests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Baseline/simout21
-rw-r--r--tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Baseline/stats.txt3200
-rw-r--r--tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Region/config.ini5094
-rwxr-xr-xtests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Region/simerr5
-rwxr-xr-xtests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Region/simout21
-rw-r--r--tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Region/stats.txt3418
-rw-r--r--tests/quick/se/04.gpu/test.py48
-rw-r--r--tests/quick/se/60.gpu-randomtest/ref/x86/linux/gpu-randomtest-ruby-GPU_RfO/config.ini5862
-rwxr-xr-xtests/quick/se/60.gpu-randomtest/ref/x86/linux/gpu-randomtest-ruby-GPU_RfO/simerr10
-rwxr-xr-xtests/quick/se/60.gpu-randomtest/ref/x86/linux/gpu-randomtest-ruby-GPU_RfO/simout11
-rw-r--r--tests/quick/se/60.gpu-randomtest/ref/x86/linux/gpu-randomtest-ruby-GPU_RfO/stats.txt1072
-rw-r--r--tests/quick/se/60.gpu-randomtest/test.py35
-rwxr-xr-xtests/test-progs/gpu-hello/bin/x86/linux/gpu-hellobin0 -> 1679704 bytes
-rw-r--r--tests/test-progs/gpu-hello/bin/x86/linux/gpu-hello-kernel.asmbin0 -> 5632 bytes
-rwxr-xr-xtests/test-progs/gpu-hello/src/gpu-hello-kernel.cl78
-rwxr-xr-xtests/test-progs/gpu-hello/src/gpu-hello.cpp332
-rwxr-xr-xutil/regress3
191 files changed, 95287 insertions, 93 deletions
diff --git a/SConstruct b/SConstruct
index eadf5d9d2..c291265fc 100755
--- a/SConstruct
+++ b/SConstruct
@@ -1065,7 +1065,9 @@ main = conf.Finish()
# Define the universe of supported ISAs
all_isa_list = [ ]
+all_gpu_isa_list = [ ]
Export('all_isa_list')
+Export('all_gpu_isa_list')
class CpuModel(object):
'''The CpuModel class encapsulates everything the ISA parser needs to
@@ -1121,9 +1123,11 @@ for bdir in [ base_dir ] + extras_dir_list:
SConscript(joinpath(root, 'SConsopts'))
all_isa_list.sort()
+all_gpu_isa_list.sort()
sticky_vars.AddVariables(
EnumVariable('TARGET_ISA', 'Target ISA', 'alpha', all_isa_list),
+ EnumVariable('TARGET_GPU_ISA', 'Target GPU ISA', 'hsail', all_gpu_isa_list),
ListVariable('CPU_MODELS', 'CPU models',
sorted(n for n,m in CpuModel.dict.iteritems() if m.default),
sorted(CpuModel.dict.keys())),
@@ -1139,6 +1143,7 @@ sticky_vars.AddVariables(
BoolVariable('USE_FENV', 'Use <fenv.h> IEEE mode control', have_fenv),
BoolVariable('CP_ANNOTATE', 'Enable critical path annotation capability', False),
BoolVariable('USE_KVM', 'Enable hardware virtualized (KVM) CPU models', have_kvm),
+ BoolVariable('BUILD_GPU', 'Build the compute-GPU model', False),
EnumVariable('PROTOCOL', 'Coherence protocol for Ruby', 'None',
all_protocols),
EnumVariable('BACKTRACE_IMPL', 'Post-mortem dump implementation',
@@ -1146,9 +1151,9 @@ sticky_vars.AddVariables(
)
# These variables get exported to #defines in config/*.hh (see src/SConscript).
-export_vars += ['USE_FENV', 'SS_COMPATIBLE_FP', 'TARGET_ISA', 'CP_ANNOTATE',
- 'USE_POSIX_CLOCK', 'USE_KVM', 'PROTOCOL', 'HAVE_PROTOBUF',
- 'HAVE_PERF_ATTR_EXCLUDE_HOST']
+export_vars += ['USE_FENV', 'SS_COMPATIBLE_FP', 'TARGET_ISA', 'TARGET_GPU_ISA',
+ 'CP_ANNOTATE', 'USE_POSIX_CLOCK', 'USE_KVM', 'PROTOCOL',
+ 'HAVE_PROTOBUF', 'HAVE_PERF_ATTR_EXCLUDE_HOST']
###################################################
#
@@ -1226,6 +1231,7 @@ main.SConscript('ext/nomali/SConscript',
###################################################
main['ALL_ISA_LIST'] = all_isa_list
+main['ALL_GPU_ISA_LIST'] = all_gpu_isa_list
all_isa_deps = {}
def make_switching_dir(dname, switch_headers, env):
# Generate the header. target[0] is the full path of the output
@@ -1258,6 +1264,35 @@ def make_switching_dir(dname, switch_headers, env):
Export('make_switching_dir')
+def make_gpu_switching_dir(dname, switch_headers, env):
+ # Generate the header. target[0] is the full path of the output
+ # header to generate. 'source' is a dummy variable, since we get the
+ # list of ISAs from env['ALL_ISA_LIST'].
+ def gen_switch_hdr(target, source, env):
+ fname = str(target[0])
+
+ isa = env['TARGET_GPU_ISA'].lower()
+
+ try:
+ f = open(fname, 'w')
+ print >>f, '#include "%s/%s/%s"' % (dname, isa, basename(fname))
+ f.close()
+ except IOError:
+ print "Failed to create %s" % fname
+ raise
+
+ # Build SCons Action object. 'varlist' specifies env vars that this
+ # action depends on; when env['ALL_ISA_LIST'] changes these actions
+ # should get re-executed.
+ switch_hdr_action = MakeAction(gen_switch_hdr,
+ Transform("GENERATE"), varlist=['ALL_ISA_GPU_LIST'])
+
+ # Instantiate actions for each header
+ for hdr in switch_headers:
+ env.Command(hdr, [], switch_hdr_action)
+
+Export('make_gpu_switching_dir')
+
# all-isas -> all-deps -> all-environs -> all_targets
main.Alias('#all-isas', [])
main.Alias('#all-deps', '#all-isas')
diff --git a/build_opts/HSAIL_X86 b/build_opts/HSAIL_X86
new file mode 100644
index 000000000..105f82cbd
--- /dev/null
+++ b/build_opts/HSAIL_X86
@@ -0,0 +1,5 @@
+PROTOCOL = 'GPU_RfO'
+TARGET_ISA = 'x86'
+TARGET_GPU_ISA = 'hsail'
+BUILD_GPU = True
+CPU_MODELS = 'AtomicSimpleCPU,O3CPU,TimingSimpleCPU'
diff --git a/build_opts/X86_MOESI_AMD_Base b/build_opts/X86_MOESI_AMD_Base
new file mode 100644
index 000000000..e85f36d82
--- /dev/null
+++ b/build_opts/X86_MOESI_AMD_Base
@@ -0,0 +1,3 @@
+PROTOCOL = 'MOESI_AMD_Base'
+TARGET_ISA = 'x86'
+CPU_MODELS = 'AtomicSimpleCPU,O3CPU,TimingSimpleCPU' \ No newline at end of file
diff --git a/configs/common/GPUTLBConfig.py b/configs/common/GPUTLBConfig.py
new file mode 100644
index 000000000..b7ea6dcf1
--- /dev/null
+++ b/configs/common/GPUTLBConfig.py
@@ -0,0 +1,203 @@
+#
+# Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Author: Lisa Hsu
+#
+
+# Configure the TLB hierarchy
+# Places which would probably need to be modified if you
+# want a different hierarchy are specified by a <Modify here .. >'
+# comment
+import m5
+from m5.objects import *
+
+def TLB_constructor(level):
+
+ constructor_call = "X86GPUTLB(size = options.L%(level)dTLBentries, \
+ assoc = options.L%(level)dTLBassoc, \
+ hitLatency = options.L%(level)dAccessLatency,\
+ missLatency2 = options.L%(level)dMissLatency,\
+ maxOutstandingReqs = options.L%(level)dMaxOutstandingReqs,\
+ accessDistance = options.L%(level)dAccessDistanceStat,\
+ clk_domain = SrcClockDomain(\
+ clock = options.GPUClock,\
+ voltage_domain = VoltageDomain(\
+ voltage = options.gpu_voltage)))" % locals()
+ return constructor_call
+
+def Coalescer_constructor(level):
+
+ constructor_call = "TLBCoalescer(probesPerCycle = \
+ options.L%(level)dProbesPerCycle, \
+ coalescingWindow = options.L%(level)dCoalescingWindow,\
+ disableCoalescing = options.L%(level)dDisableCoalescing,\
+ clk_domain = SrcClockDomain(\
+ clock = options.GPUClock,\
+ voltage_domain = VoltageDomain(\
+ voltage = options.gpu_voltage)))" % locals()
+ return constructor_call
+
+def create_TLB_Coalescer(options, my_level, my_index, TLB_name, Coalescer_name):
+ # arguments: options, TLB level, number of private structures for this Level,
+ # TLB name and Coalescer name
+ for i in xrange(my_index):
+ TLB_name.append(eval(TLB_constructor(my_level)))
+ Coalescer_name.append(eval(Coalescer_constructor(my_level)))
+
+def config_tlb_hierarchy(options, system, shader_idx):
+ n_cu = options.num_compute_units
+ # Make this configurable now, instead of the hard coded val. The dispatcher
+ # is always the last item in the system.cpu list.
+ dispatcher_idx = len(system.cpu) - 1
+
+ if options.TLB_config == "perLane":
+ num_TLBs = 64 * n_cu
+ elif options.TLB_config == "mono":
+ num_TLBs = 1
+ elif options.TLB_config == "perCU":
+ num_TLBs = n_cu
+ elif options.TLB_config == "2CU":
+ num_TLBs = n_cu >> 1
+ else:
+ print "Bad option for TLB Configuration."
+ sys.exit(1)
+
+ #----------------------------------------------------------------------------------------
+ # A visual representation of the TLB hierarchy
+ # for ease of configuration
+ # < Modify here the width and the number of levels if you want a different configuration >
+ # width is the number of TLBs of the given type (i.e., D-TLB, I-TLB etc) for this level
+ L1 = [{'name': 'sqc', 'width': options.num_sqc, 'TLBarray': [], 'CoalescerArray': []},
+ {'name': 'dispatcher', 'width': 1, 'TLBarray': [], 'CoalescerArray': []},
+ {'name': 'l1', 'width': num_TLBs, 'TLBarray': [], 'CoalescerArray': []}]
+
+ L2 = [{'name': 'l2', 'width': 1, 'TLBarray': [], 'CoalescerArray': []}]
+ L3 = [{'name': 'l3', 'width': 1, 'TLBarray': [], 'CoalescerArray': []}]
+
+ TLB_hierarchy = [L1, L2, L3]
+
+ #----------------------------------------------------------------------------------------
+ # Create the hiearchy
+ # Call the appropriate constructors and add objects to the system
+
+ for i in xrange(len(TLB_hierarchy)):
+ hierarchy_level = TLB_hierarchy[i]
+ level = i+1
+ for TLB_type in hierarchy_level:
+ TLB_index = TLB_type['width']
+ TLB_array = TLB_type['TLBarray']
+ Coalescer_array = TLB_type['CoalescerArray']
+ # If the sim calls for a fixed L1 TLB size across CUs,
+ # override the TLB entries option
+ if options.tot_L1TLB_size:
+ options.L1TLBentries = options.tot_L1TLB_size / num_TLBs
+ if options.L1TLBassoc > options.L1TLBentries:
+ options.L1TLBassoc = options.L1TLBentries
+ # call the constructors for the TLB and the Coalescer
+ create_TLB_Coalescer(options, level, TLB_index,\
+ TLB_array, Coalescer_array)
+
+ system_TLB_name = TLB_type['name'] + '_tlb'
+ system_Coalescer_name = TLB_type['name'] + '_coalescer'
+
+ # add the different TLB levels to the system
+ # Modify here if you want to make the TLB hierarchy a child of
+ # the shader.
+ exec('system.%s = TLB_array' % system_TLB_name)
+ exec('system.%s = Coalescer_array' % system_Coalescer_name)
+
+ #===========================================================
+ # Specify the TLB hierarchy (i.e., port connections)
+ # All TLBs but the last level TLB need to have a memSidePort (master)
+ #===========================================================
+
+ # Each TLB is connected with its Coalescer through a single port.
+ # There is a one-to-one mapping of TLBs to Coalescers at a given level
+ # This won't be modified no matter what the hierarchy looks like.
+ for i in xrange(len(TLB_hierarchy)):
+ hierarchy_level = TLB_hierarchy[i]
+ level = i+1
+ for TLB_type in hierarchy_level:
+ name = TLB_type['name']
+ for index in range(TLB_type['width']):
+ exec('system.%s_coalescer[%d].master[0] = \
+ system.%s_tlb[%d].slave[0]' % \
+ (name, index, name, index))
+
+ # Connect the cpuSidePort (slave) of all the coalescers in level 1
+ # < Modify here if you want a different configuration >
+ for TLB_type in L1:
+ name = TLB_type['name']
+ num_TLBs = TLB_type['width']
+ if name == 'l1': # L1 D-TLBs
+ tlb_per_cu = num_TLBs / n_cu
+ for cu_idx in range(n_cu):
+ if tlb_per_cu:
+ for tlb in range(tlb_per_cu):
+ exec('system.cpu[%d].CUs[%d].translation_port[%d] = \
+ system.l1_coalescer[%d].slave[%d]' % \
+ (shader_idx, cu_idx, tlb, cu_idx*tlb_per_cu+tlb, 0))
+ else:
+ exec('system.cpu[%d].CUs[%d].translation_port[%d] = \
+ system.l1_coalescer[%d].slave[%d]' % \
+ (shader_idx, cu_idx, tlb_per_cu, cu_idx / (n_cu / num_TLBs), cu_idx % (n_cu / num_TLBs)))
+
+ elif name == 'dispatcher': # Dispatcher TLB
+ for index in range(TLB_type['width']):
+ exec('system.cpu[%d].translation_port = \
+ system.dispatcher_coalescer[%d].slave[0]' % \
+ (dispatcher_idx, index))
+ elif name == 'sqc': # I-TLB
+ for index in range(n_cu):
+ sqc_tlb_index = index / options.cu_per_sqc
+ sqc_tlb_port_id = index % options.cu_per_sqc
+ exec('system.cpu[%d].CUs[%d].sqc_tlb_port = \
+ system.sqc_coalescer[%d].slave[%d]' % \
+ (shader_idx, index, sqc_tlb_index, sqc_tlb_port_id))
+
+
+ # Connect the memSidePorts (masters) of all the TLBs with the
+ # cpuSidePorts (slaves) of the Coalescers of the next level
+ # < Modify here if you want a different configuration >
+ # L1 <-> L2
+ l2_coalescer_index = 0
+ for TLB_type in L1:
+ name = TLB_type['name']
+ for index in range(TLB_type['width']):
+ exec('system.%s_tlb[%d].master[0] = \
+ system.l2_coalescer[0].slave[%d]' % \
+ (name, index, l2_coalescer_index))
+ l2_coalescer_index += 1
+ # L2 <-> L3
+ system.l2_tlb[0].master[0] = system.l3_coalescer[0].slave[0]
+
+ return system
diff --git a/configs/common/GPUTLBOptions.py b/configs/common/GPUTLBOptions.py
new file mode 100644
index 000000000..40a46d560
--- /dev/null
+++ b/configs/common/GPUTLBOptions.py
@@ -0,0 +1,109 @@
+#
+# Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Author: Myrto Papadopoulou
+#
+
+def tlb_options(parser):
+
+ #===================================================================
+ # TLB Configuration
+ #===================================================================
+
+ parser.add_option("--TLB-config", type="string", default="perCU",
+ help="Options are: perCU (default), mono, 2CU, or perLane")
+
+ #===================================================================
+ # L1 TLB Options (D-TLB, I-TLB, Dispatcher-TLB)
+ #===================================================================
+
+ parser.add_option("--L1TLBentries", type='int', default="32")
+ parser.add_option("--L1TLBassoc", type='int', default="32")
+ parser.add_option("--L1AccessLatency", type='int', default="1",
+ help="latency in gpu cycles")
+ parser.add_option("--L1MissLatency", type='int', default="750",
+ help="latency (in gpu cycles) of a page walk, "
+ "if this is a last level TLB")
+ parser.add_option("--L1MaxOutstandingReqs", type='int', default="64")
+ parser.add_option("--L1AccessDistanceStat", action="store_true")
+ parser.add_option("--tot-L1TLB-size", type="int", default="0")
+
+ #===================================================================
+ # L2 TLB Options
+ #===================================================================
+
+ parser.add_option("--L2TLBentries", type='int', default="4096")
+ parser.add_option("--L2TLBassoc", type='int', default="32")
+ parser.add_option("--L2AccessLatency", type='int', default="69",
+ help="latency in gpu cycles")
+ parser.add_option("--L2MissLatency", type='int', default="750",
+ help="latency (in gpu cycles) of a page walk, "
+ "if this is a last level TLB")
+ parser.add_option("--L2MaxOutstandingReqs", type='int', default="64")
+ parser.add_option("--L2AccessDistanceStat", action="store_true")
+
+ #===================================================================
+ # L3 TLB Options
+ #===================================================================
+
+ parser.add_option("--L3TLBentries", type='int', default="8192")
+ parser.add_option("--L3TLBassoc", type='int', default="32")
+ parser.add_option("--L3AccessLatency", type='int', default="150",
+ help="latency in gpu cycles")
+ parser.add_option("--L3MissLatency", type='int', default="750",
+ help="latency (in gpu cycles) of a page walk")
+ parser.add_option("--L3MaxOutstandingReqs", type='int', default="64")
+ parser.add_option("--L3AccessDistanceStat", action="store_true")
+
+ #===================================================================
+ # L1 TLBCoalescer Options
+ #===================================================================
+
+ parser.add_option("--L1ProbesPerCycle", type='int', default="2")
+ parser.add_option("--L1CoalescingWindow", type='int', default="1")
+ parser.add_option("--L1DisableCoalescing", action="store_true")
+
+ #===================================================================
+ # L2 TLBCoalescer Options
+ #===================================================================
+
+ parser.add_option("--L2ProbesPerCycle", type='int', default="2")
+ parser.add_option("--L2CoalescingWindow", type='int', default="1")
+ parser.add_option("--L2DisableCoalescing", action="store_true")
+
+ #===================================================================
+ # L3 TLBCoalescer Options
+ #===================================================================
+
+ parser.add_option("--L3ProbesPerCycle", type='int', default="2")
+ parser.add_option("--L3CoalescingWindow", type='int', default="1")
+ parser.add_option("--L3DisableCoalescing", action="store_true")
diff --git a/configs/example/apu_se.py b/configs/example/apu_se.py
new file mode 100644
index 000000000..75819b505
--- /dev/null
+++ b/configs/example/apu_se.py
@@ -0,0 +1,499 @@
+#
+# Copyright (c) 2015 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Author: Sooraj Puthoor
+#
+
import glob
import inspect
import math
import optparse
import os
import re
import sys
+
+import m5
+from m5.objects import *
+from m5.util import addToPath
+
+addToPath('../ruby')
+addToPath('../common')
+addToPath('../topologies')
+
+import Options
+import Ruby
+import Simulation
+import GPUTLBOptions, GPUTLBConfig
+
+########################## Script Options ########################
def setOption(parser, opt_str, value = 1):
    """Set an option's value on the parser by its option string (e.g.
    "--access-backing-store").

    Raises an Exception if the option has not been registered.

    Fixed: the original used exec("parser.values.%s = %s"), which
    evaluated 'value' as Python source -- a string value would be
    treated as an identifier (NameError) and arbitrary code could be
    injected.  setattr stores the value as-is.
    """
    # check to make sure the option actually exists
    if not parser.has_option(opt_str):
        raise Exception("cannot find %s in list of possible options" % opt_str)

    opt = parser.get_option(opt_str)
    # set the value directly on the parser's Values object
    setattr(parser.values, opt.dest, value)
+
def getOption(parser, opt_str):
    """Return the current value of an option, looked up by its option
    string (e.g. "--num-compute-units").

    Raises an Exception if the option has not been registered.

    Fixed: the original used exec("return_value = parser.values.%s")
    and then read the local -- that relies on CPython-2-specific local
    namespace mutation and fails outright under Python 3.  getattr is
    the direct, portable equivalent.
    """
    # check to make sure the option actually exists
    if not parser.has_option(opt_str):
        raise Exception("cannot find %s in list of possible options" % opt_str)

    opt = parser.get_option(opt_str)
    # read the value straight off the parser's Values object
    return getattr(parser.values, opt.dest)
+
+# Adding script options
+parser = optparse.OptionParser()
+Options.addCommonOptions(parser)
+Options.addSEOptions(parser)
+
+parser.add_option("--cpu-only-mode", action="store_true", default=False,
+ help="APU mode. Used to take care of problems in "\
+ "Ruby.py while running APU protocols")
+parser.add_option("-k", "--kernel-files",
+ help="file(s) containing GPU kernel code (colon separated)")
+parser.add_option("-u", "--num-compute-units", type="int", default=1,
+ help="number of GPU compute units"),
+parser.add_option("--num-cp", type="int", default=0,
+ help="Number of GPU Command Processors (CP)")
+parser.add_option("--benchmark-root", help="Root of benchmark directory tree")
+
# not super important now, but to avoid putting the number 4 everywhere, make
# it an option/knob
# Fixed: adjacent string literals were concatenated without a separating
# space, so --help printed "number of CUssharing..." and "SIMD unitsper CU".
parser.add_option("--cu-per-sqc", type="int", default=4,
                  help="number of CUs sharing an SQC "
                       "(icache, and thus icache TLB)")
parser.add_option("--simds-per-cu", type="int", default=4,
                  help="SIMD units per CU")
+parser.add_option("--wf-size", type="int", default=64,
+ help="Wavefront size(in workitems)")
+parser.add_option("--sp-bypass-path-length", type="int", default=4, \
+ help="Number of stages of bypass path in vector ALU for Single Precision ops")
+parser.add_option("--dp-bypass-path-length", type="int", default=4, \
+ help="Number of stages of bypass path in vector ALU for Double Precision ops")
+# issue period per SIMD unit: number of cycles before issuing another vector
+parser.add_option("--issue-period", type="int", default=4, \
+ help="Number of cycles per vector instruction issue period")
+parser.add_option("--glbmem-wr-bus-width", type="int", default=32, \
+ help="VGPR to Coalescer (Global Memory) data bus width in bytes")
+parser.add_option("--glbmem-rd-bus-width", type="int", default=32, \
+ help="Coalescer to VGPR (Global Memory) data bus width in bytes")
+# Currently we only support 1 local memory pipe
+parser.add_option("--shr-mem-pipes-per-cu", type="int", default=1, \
+ help="Number of Shared Memory pipelines per CU")
+# Currently we only support 1 global memory pipe
+parser.add_option("--glb-mem-pipes-per-cu", type="int", default=1, \
+ help="Number of Global Memory pipelines per CU")
+parser.add_option("--wfs-per-simd", type="int", default=10, help="Number of " \
+ "WF slots per SIMD")
+
+parser.add_option("--vreg-file-size", type="int", default=2048,
+ help="number of physical vector registers per SIMD")
+parser.add_option("--bw-scalor", type="int", default=0,
+ help="bandwidth scalor for scalability analysis")
+parser.add_option("--CPUClock", type="string", default="2GHz",
+ help="CPU clock")
+parser.add_option("--GPUClock", type="string", default="1GHz",
+ help="GPU clock")
parser.add_option("--cpu-voltage", action="store", type="string",
                  default='1.0V',
                  help="""CPU voltage domain""")
# Fixed: help text was copy-pasted from --cpu-voltage and said "CPU".
parser.add_option("--gpu-voltage", action="store", type="string",
                  default='1.0V',
                  help="""GPU voltage domain""")
+parser.add_option("--CUExecPolicy", type="string", default="OLDEST-FIRST",
+ help="WF exec policy (OLDEST-FIRST, ROUND-ROBIN)")
+parser.add_option("--xact-cas-mode", action="store_true",
+ help="enable load_compare mode (transactional CAS)")
+parser.add_option("--SegFaultDebug",action="store_true",
+ help="checks for GPU seg fault before TLB access")
+parser.add_option("--FunctionalTLB",action="store_true",
+ help="Assumes TLB has no latency")
+parser.add_option("--LocalMemBarrier",action="store_true",
+ help="Barrier does not wait for writethroughs to complete")
+parser.add_option("--countPages", action="store_true",
+ help="Count Page Accesses and output in per-CU output files")
+parser.add_option("--TLB-prefetch", type="int", help = "prefetch depth for"\
+ "TLBs")
+parser.add_option("--pf-type", type="string", help="type of prefetch: "\
+ "PF_CU, PF_WF, PF_PHASE, PF_STRIDE")
+parser.add_option("--pf-stride", type="int", help="set prefetch stride")
+parser.add_option("--numLdsBanks", type="int", default=32,
+ help="number of physical banks per LDS module")
+parser.add_option("--ldsBankConflictPenalty", type="int", default=1,
+ help="number of cycles per LDS bank conflict")
+
+
+Ruby.define_options(parser)
+
+#add TLB options to the parser
+GPUTLBOptions.tlb_options(parser)
+
+(options, args) = parser.parse_args()
+
+# The GPU cache coherence protocols only work with the backing store
+setOption(parser, "--access-backing-store")
+
+# if benchmark root is specified explicitly, that overrides the search path
+if options.benchmark_root:
+ benchmark_path = [options.benchmark_root]
+else:
+ # Set default benchmark search path to current dir
+ benchmark_path = ['.']
+
+########################## Sanity Check ########################
+
+# Currently the gpu model requires ruby
+if buildEnv['PROTOCOL'] == 'None':
+ fatal("GPU model requires ruby")
+
+# Currently the gpu model requires only timing or detailed CPU
+if not (options.cpu_type == "timing" or
+ options.cpu_type == "detailed"):
+ fatal("GPU model requires timing or detailed CPU")
+
+# This file can support multiple compute units
+assert(options.num_compute_units >= 1)
+
+# Currently, the sqc (I-Cache of GPU) is shared by
+# multiple compute units(CUs). The protocol works just fine
+# even if sqc is not shared. Overriding this option here
+# so that the user need not explicitly set this (assuming
+# sharing sqc is the common usage)
+n_cu = options.num_compute_units
+num_sqc = int(math.ceil(float(n_cu) / options.cu_per_sqc))
+options.num_sqc = num_sqc # pass this to Ruby
+
+########################## Creating the GPU system ########################
+# shader is the GPU
+shader = Shader(n_wf = options.wfs_per_simd,
+ clk_domain = SrcClockDomain(
+ clock = options.GPUClock,
+ voltage_domain = VoltageDomain(
+ voltage = options.gpu_voltage)))
+
+# GPU_RfO(Read For Ownership) implements SC/TSO memory model.
+# Other GPU protocols implement release consistency at GPU side.
+# So, all GPU protocols other than GPU_RfO should make their writes
+# visible to the global memory and should read from global memory
+# during kernal boundary. The pipeline initiates(or do not initiate)
+# the acquire/release operation depending on this impl_kern_boundary_sync
+# flag. This flag=true means pipeline initiates a acquire/release operation
+# at kernel boundary.
+if buildEnv['PROTOCOL'] == 'GPU_RfO':
+ shader.impl_kern_boundary_sync = False
+else:
+ shader.impl_kern_boundary_sync = True
+
+# Switching off per-lane TLB by default
+per_lane = False
+if options.TLB_config == "perLane":
+ per_lane = True
+
+# List of compute units; one GPU can have multiple compute units
+compute_units = []
+for i in xrange(n_cu):
+ compute_units.append(ComputeUnit(cu_id = i, perLaneTLB = per_lane,
+ num_SIMDs = options.simds_per_cu,
+ wfSize = options.wf_size,
+ spbypass_pipe_length = options.sp_bypass_path_length,
+ dpbypass_pipe_length = options.dp_bypass_path_length,
+ issue_period = options.issue_period,
+ coalescer_to_vrf_bus_width = \
+ options.glbmem_rd_bus_width,
+ vrf_to_coalescer_bus_width = \
+ options.glbmem_wr_bus_width,
+ num_global_mem_pipes = \
+ options.glb_mem_pipes_per_cu,
+ num_shared_mem_pipes = \
+ options.shr_mem_pipes_per_cu,
+ n_wf = options.wfs_per_simd,
+ execPolicy = options.CUExecPolicy,
+ xactCasMode = options.xact_cas_mode,
+ debugSegFault = options.SegFaultDebug,
+ functionalTLB = options.FunctionalTLB,
+ localMemBarrier = options.LocalMemBarrier,
+ countPages = options.countPages,
+ localDataStore = \
+ LdsState(banks = options.numLdsBanks,
+ bankConflictPenalty = \
+ options.ldsBankConflictPenalty)))
+ wavefronts = []
+ vrfs = []
+ for j in xrange(options.simds_per_cu):
+ for k in xrange(shader.n_wf):
+ wavefronts.append(Wavefront(simdId = j, wf_slot_id = k))
+ vrfs.append(VectorRegisterFile(simd_id=j,
+ num_regs_per_simd=options.vreg_file_size))
+ compute_units[-1].wavefronts = wavefronts
+ compute_units[-1].vector_register_file = vrfs
+ if options.TLB_prefetch:
+ compute_units[-1].prefetch_depth = options.TLB_prefetch
+ compute_units[-1].prefetch_prev_type = options.pf_type
+
+ # attach the LDS and the CU to the bus (actually a Bridge)
+ compute_units[-1].ldsPort = compute_units[-1].ldsBus.slave
+ compute_units[-1].ldsBus.master = compute_units[-1].localDataStore.cuPort
+
+# Attach compute units to GPU
+shader.CUs = compute_units
+
########################## Creating the CPU system ########################
# (removed a no-op self-assignment: options.num_cpus = options.num_cpus)

# The shader core will be whatever is after the CPU cores are accounted for
shader_idx = options.num_cpus
+
+# The command processor will be whatever is after the shader is accounted for
+cp_idx = shader_idx + 1
+cp_list = []
+
+# List of CPUs
+cpu_list = []
+
+# We only support timing mode for shader and memory
+shader.timing = True
+mem_mode = 'timing'
+
+# create the cpus
+for i in range(options.num_cpus):
+ cpu = None
+ if options.cpu_type == "detailed":
+ cpu = DerivO3CPU(cpu_id=i,
+ clk_domain = SrcClockDomain(
+ clock = options.CPUClock,
+ voltage_domain = VoltageDomain(
+ voltage = options.cpu_voltage)))
+ elif options.cpu_type == "timing":
+ cpu = TimingSimpleCPU(cpu_id=i,
+ clk_domain = SrcClockDomain(
+ clock = options.CPUClock,
+ voltage_domain = VoltageDomain(
+ voltage = options.cpu_voltage)))
+ else:
+ fatal("Atomic CPU not supported/tested")
+ cpu_list.append(cpu)
+
+# create the command processors
+for i in xrange(options.num_cp):
+ cp = None
+ if options.cpu_type == "detailed":
+ cp = DerivO3CPU(cpu_id = options.num_cpus + i,
+ clk_domain = SrcClockDomain(
+ clock = options.CPUClock,
+ voltage_domain = VoltageDomain(
+ voltage = options.cpu_voltage)))
+ elif options.cpu_type == 'timing':
+ cp = TimingSimpleCPU(cpu_id=options.num_cpus + i,
+ clk_domain = SrcClockDomain(
+ clock = options.CPUClock,
+ voltage_domain = VoltageDomain(
+ voltage = options.cpu_voltage)))
+ else:
+ fatal("Atomic CPU not supported/tested")
+ cp_list = cp_list + [cp]
+
+########################## Creating the GPU dispatcher ########################
+# Dispatcher dispatches work from host CPU to GPU
+host_cpu = cpu_list[0]
+dispatcher = GpuDispatcher()
+
+########################## Create and assign the workload ########################
+# Check for rel_path in elements of base_list using test, returning
+# the first full path that satisfies test
def find_path(base_list, rel_path, test):
    """Search each entry of base_list for rel_path, returning the first
    joined path that satisfies the 'test' predicate; calls fatal() when
    no candidate matches."""
    for prefix in base_list:
        # an entry may be None, e.g. when an environment variable is unset
        if not prefix:
            continue
        candidate = os.path.join(prefix, rel_path)
        if test(candidate):
            return candidate
    fatal("%s not found in %s" % (rel_path, base_list))
+
def find_file(base_list, rel_path):
    # Convenience wrapper over find_path that requires a regular file.
    file_test = os.path.isfile
    return find_path(base_list, rel_path, file_test)
+
+executable = find_path(benchmark_path, options.cmd, os.path.exists)
+# it's common for a benchmark to be in a directory with the same
+# name as the executable, so we handle that automatically
+if os.path.isdir(executable):
+ benchmark_path = [executable]
+ executable = find_file(benchmark_path, options.cmd)
+if options.kernel_files:
+ kernel_files = [find_file(benchmark_path, f)
+ for f in options.kernel_files.split(':')]
+else:
+ # if kernel_files is not set, see if there's a unique .asm file
+ # in the same directory as the executable
+ kernel_path = os.path.dirname(executable)
+ kernel_files = glob.glob(os.path.join(kernel_path, '*.asm'))
+ if kernel_files:
+ print "Using GPU kernel code file(s)", ",".join(kernel_files)
+ else:
+ fatal("Can't locate kernel code (.asm) in " + kernel_path)
+
+# OpenCL driver
+driver = ClDriver(filename="hsa", codefile=kernel_files)
+for cpu in cpu_list:
+ cpu.workload = LiveProcess(executable = executable,
+ cmd = [options.cmd] + options.options.split(),
+ drivers = [driver])
+for cp in cp_list:
+ cp.workload = host_cpu.workload
+
+########################## Create the overall system ########################
+# Full list of processing cores in the system. Note that
+# dispatcher is also added to cpu_list although it is
+# not a processing element
+cpu_list = cpu_list + [shader] + cp_list + [dispatcher]
+
+# creating the overall system
+# notice the cpu list is explicitly added as a parameter to System
+system = System(cpu = cpu_list,
+ mem_ranges = [AddrRange(options.mem_size)],
+ cache_line_size = options.cacheline_size,
+ mem_mode = mem_mode)
+system.voltage_domain = VoltageDomain(voltage = options.sys_voltage)
+system.clk_domain = SrcClockDomain(clock = options.sys_clock,
+ voltage_domain = system.voltage_domain)
+
+# configure the TLB hierarchy
+GPUTLBConfig.config_tlb_hierarchy(options, system, shader_idx)
+
+# create Ruby system
+system.piobus = IOXBar(width=32, response_latency=0,
+ frontend_latency=0, forward_latency=0)
+Ruby.create_system(options, None, system)
+system.ruby.clk_domain = SrcClockDomain(clock = options.ruby_clock,
+ voltage_domain = system.voltage_domain)
+
+# attach the CPU ports to Ruby
+for i in range(options.num_cpus):
+ ruby_port = system.ruby._cpu_ports[i]
+
+ # Create interrupt controller
+ system.cpu[i].createInterruptController()
+
+ # Connect cache port's to ruby
+ system.cpu[i].icache_port = ruby_port.slave
+ system.cpu[i].dcache_port = ruby_port.slave
+
+ ruby_port.mem_master_port = system.piobus.slave
+ if buildEnv['TARGET_ISA'] == "x86":
+ system.cpu[i].interrupts[0].pio = system.piobus.master
+ system.cpu[i].interrupts[0].int_master = system.piobus.slave
+ system.cpu[i].interrupts[0].int_slave = system.piobus.master
+
+# attach CU ports to Ruby
+# Because of the peculiarities of the CP core, you may have 1 CPU but 2
+# sequencers and thus 2 _cpu_ports created. Your GPUs shouldn't be
+# hooked up until after the CP. To make this script generic, figure out
+# the index as below, but note that this assumes there is one sequencer
+# per compute unit and one sequencer per SQC for the math to work out
+# correctly.
+gpu_port_idx = len(system.ruby._cpu_ports) \
+ - options.num_compute_units - options.num_sqc
+gpu_port_idx = gpu_port_idx - options.num_cp * 2
+
+wavefront_size = options.wf_size
+for i in xrange(n_cu):
+ # The pipeline issues wavefront_size number of uncoalesced requests
+ # in one GPU issue cycle. Hence wavefront_size mem ports.
+ for j in xrange(wavefront_size):
+ system.cpu[shader_idx].CUs[i].memory_port[j] = \
+ system.ruby._cpu_ports[gpu_port_idx].slave[j]
+ gpu_port_idx += 1
+
+for i in xrange(n_cu):
+ if i > 0 and not i % options.cu_per_sqc:
+ print "incrementing idx on ", i
+ gpu_port_idx += 1
+ system.cpu[shader_idx].CUs[i].sqc_port = \
+ system.ruby._cpu_ports[gpu_port_idx].slave
+gpu_port_idx = gpu_port_idx + 1
+
+# attach CP ports to Ruby
+for i in xrange(options.num_cp):
+ system.cpu[cp_idx].createInterruptController()
+ system.cpu[cp_idx].dcache_port = \
+ system.ruby._cpu_ports[gpu_port_idx + i * 2].slave
+ system.cpu[cp_idx].icache_port = \
+ system.ruby._cpu_ports[gpu_port_idx + i * 2 + 1].slave
+ system.cpu[cp_idx].interrupts[0].pio = system.piobus.master
+ system.cpu[cp_idx].interrupts[0].int_master = system.piobus.slave
+ system.cpu[cp_idx].interrupts[0].int_slave = system.piobus.master
+ cp_idx = cp_idx + 1
+
+# connect dispatcher to the system.piobus
+dispatcher.pio = system.piobus.master
+dispatcher.dma = system.piobus.slave
+
+################# Connect the CPU and GPU via GPU Dispatcher ###################
+# CPU rings the GPU doorbell to notify a pending task
+# using this interface.
+# And GPU uses this interface to notify the CPU of task completion
+# The communcation happens through emulated driver.
+
+# Note this implicit setting of the cpu_pointer, shader_pointer and tlb array
+# parameters must be after the explicit setting of the System cpu list
+shader.cpu_pointer = host_cpu
+dispatcher.cpu = host_cpu
+dispatcher.shader_pointer = shader
+dispatcher.cl_driver = driver
+
+########################## Start simulation ########################
+
+root = Root(system=system, full_system=False)
+m5.ticks.setGlobalFrequency('1THz')
+if options.abs_max_tick:
+ maxtick = options.abs_max_tick
+else:
+ maxtick = m5.MaxTick
+
+# Benchmarks support work item annotations
+Simulation.setWorkCountOptions(system, options)
+
+# Checkpointing is not supported by APU model
+if (options.checkpoint_dir != None or
+ options.checkpoint_restore != None):
+ fatal("Checkpointing not supported by apu model")
+
+checkpoint_dir = None
+m5.instantiate(checkpoint_dir)
+
+# Map workload to this address space
+host_cpu.workload[0].map(0x10000000, 0x200000000, 4096)
+
+exit_event = m5.simulate(maxtick)
+print "Ticks:", m5.curTick()
+print 'Exiting because ', exit_event.getCause()
+sys.exit(exit_event.getCode())
diff --git a/configs/example/ruby_gpu_random_test.py b/configs/example/ruby_gpu_random_test.py
new file mode 100644
index 000000000..66ee4675f
--- /dev/null
+++ b/configs/example/ruby_gpu_random_test.py
@@ -0,0 +1,187 @@
+#
+# Copyright (c) 2010-2015 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Author: Brad Beckmann
+#
+
+import m5
+from m5.objects import *
+from m5.defines import buildEnv
+from m5.util import addToPath
+import os, optparse, sys
+addToPath('../common')
+addToPath('../ruby')
+addToPath('../topologies')
+
+import Options
+import Ruby
+
+# Get paths we might need.
+config_path = os.path.dirname(os.path.abspath(__file__))
+config_root = os.path.dirname(config_path)
+m5_root = os.path.dirname(config_root)
+
+parser = optparse.OptionParser()
+Options.addCommonOptions(parser)
+
+parser.add_option("--maxloads", metavar="N", default=100,
+ help="Stop after N loads")
+parser.add_option("-f", "--wakeup_freq", metavar="N", default=10,
+ help="Wakeup every N cycles")
+parser.add_option("-u", "--num-compute-units", type="int", default=1,
+ help="number of compute units in the GPU")
+parser.add_option("--numCPs", type="int", default=0,
+ help="Number of GPU Command Processors (CP)")
+# not super important now, but to avoid putting the number 4 everywhere, make
+# it an option/knob
# Fixed: the simds-per-cu help string concatenated "SIMD units" and
# "per CU" without a space; the cu-per-sqc string embedded a line
# continuation's leading indentation into the help text.
parser.add_option("--cu-per-sqc", type="int", default=4,
                  help="number of CUs sharing an SQC "
                       "(icache, and thus icache TLB)")
parser.add_option("--simds-per-cu", type="int", default=4,
                  help="SIMD units per CU")
+parser.add_option("--wf-size", type="int", default=64,
+ help="Wavefront size(in workitems)")
+parser.add_option("--wfs-per-simd", type="int", default=10, help="Number of " \
+ "WF slots per SIMD")
+
+#
+# Add the ruby specific and protocol specific options
+#
+Ruby.define_options(parser)
+
+execfile(os.path.join(config_root, "common", "Options.py"))
+
+(options, args) = parser.parse_args()
+
+#
+# Set the default cache size and associativity to be very small to encourage
+# races between requests and writebacks.
+#
+options.l1d_size="256B"
+options.l1i_size="256B"
+options.l2_size="512B"
+options.l3_size="1kB"
+options.l1d_assoc=2
+options.l1i_assoc=2
+options.l2_assoc=2
+options.l3_assoc=2
+
+# This file can support multiple compute units
+assert(options.num_compute_units >= 1)
+n_cu = options.num_compute_units
+
+options.num_sqc = int((n_cu + options.cu_per_sqc - 1) / options.cu_per_sqc)
+
+if args:
+ print "Error: script doesn't take any positional arguments"
+ sys.exit(1)
+
+#
+# Create the ruby random tester
+#
+
+# Check to for the GPU_RfO protocol. Other GPU protocols are non-SC and will
+# not work with the Ruby random tester.
+assert(buildEnv['PROTOCOL'] == 'GPU_RfO')
+
+# The GPU_RfO protocol does not support cache flushes
+check_flush = False
+
+tester = RubyTester(check_flush=check_flush,
+ checks_to_complete=options.maxloads,
+ wakeup_frequency=options.wakeup_freq,
+ deadlock_threshold=1000000)
+
+#
+# Create the M5 system. Note that the Memory Object isn't
+# actually used by the rubytester, but is included to support the
+# M5 memory size == Ruby memory size checks
+#
+system = System(cpu=tester, mem_ranges=[AddrRange(options.mem_size)])
+
+# Create a top-level voltage domain and clock domain
+system.voltage_domain = VoltageDomain(voltage=options.sys_voltage)
+
+system.clk_domain = SrcClockDomain(clock=options.sys_clock,
+ voltage_domain=system.voltage_domain)
+
+Ruby.create_system(options, False, system)
+
+# Create a seperate clock domain for Ruby
+system.ruby.clk_domain = SrcClockDomain(clock=options.ruby_clock,
+ voltage_domain=system.voltage_domain)
+
+tester.num_cpus = len(system.ruby._cpu_ports)
+
+#
+# The tester is most effective when randomization is turned on and
+# artifical delay is randomly inserted on messages
+#
+system.ruby.randomization = True
+
+for ruby_port in system.ruby._cpu_ports:
+
+ #
+ # Tie the ruby tester ports to the ruby cpu read and write ports
+ #
+ if ruby_port.support_data_reqs and ruby_port.support_inst_reqs:
+ tester.cpuInstDataPort = ruby_port.slave
+ elif ruby_port.support_data_reqs:
+ tester.cpuDataPort = ruby_port.slave
+ elif ruby_port.support_inst_reqs:
+ tester.cpuInstPort = ruby_port.slave
+
+ # Do not automatically retry stalled Ruby requests
+ ruby_port.no_retry_on_stall = True
+
+ #
+ # Tell each sequencer this is the ruby tester so that it
+ # copies the subblock back to the checker
+ #
+ ruby_port.using_ruby_tester = True
+
+# -----------------------
+# run simulation
+# -----------------------
+
+root = Root( full_system = False, system = system )
+root.system.mem_mode = 'timing'
+
+# Not much point in this being higher than the L1 latency
+m5.ticks.setGlobalFrequency('1ns')
+
+# instantiate configuration
+m5.instantiate()
+
+# simulate until program terminates
+exit_event = m5.simulate(options.abs_max_tick)
+
+print 'Exiting @ tick', m5.curTick(), 'because', exit_event.getCause()
diff --git a/configs/ruby/AMD_Base_Constructor.py b/configs/ruby/AMD_Base_Constructor.py
new file mode 100644
index 000000000..d13153e9a
--- /dev/null
+++ b/configs/ruby/AMD_Base_Constructor.py
@@ -0,0 +1,134 @@
+#
+# Copyright (c) 2015 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Author: Sooraj Puthoor, Lisa Hsu
+#
+
+import math
+import m5
+from m5.objects import *
+from m5.defines import buildEnv
+from m5.util import convert
+from CntrlBase import *
+from Cluster import Cluster
+
+#
+# Note: the L1 Cache latency is only used by the sequencer on fast path hits
+#
class L1Cache(RubyCache):
    """L1 Ruby cache for the CP controller.  The latency here is only
    consulted by the sequencer on fast-path hits (see file comment)."""
    latency = 1
    resourceStalls = False
    def create(self, size, assoc, options):
        # Geometry is taken from the command-line cache options
        # (--l1i_size/--l1i_assoc or --l1d_size/--l1d_assoc).
        self.size = MemorySize(size)
        self.assoc = assoc
        self.replacement_policy = PseudoLRUReplacementPolicy()
+
+#
+# Note: the L2 Cache latency is not currently used
+#
class L2Cache(RubyCache):
    """L2 Ruby cache for the CP controller.  Per the file comment, this
    latency value is not currently used by the protocol."""
    latency = 10
    resourceStalls = False
    def create(self, size, assoc, options):
        # Geometry is taken from the --l2_size/--l2_assoc options.
        self.size = MemorySize(size)
        self.assoc = assoc
        self.replacement_policy = PseudoLRUReplacementPolicy()
class CPCntrl(AMD_Base_Controller, CntrlBase):
    """Command Processor (CP) cache controller.

    Models a two-core pairing: two sequencers (coreid 0 and 1) share one
    L1 I-cache and one L2, each with a private L1 D-cache.
    """

    def create(self, options, ruby_system, system):
        # Globally-unique version/controller ids from CntrlBase counters.
        self.version = self.versionCount()
        self.cntrl_id = self.cntrlCount()

        # One shared L1I, two private L1Ds, one shared L2.
        self.L1Icache = L1Cache()
        self.L1Icache.create(options.l1i_size, options.l1i_assoc, options)
        self.L1D0cache = L1Cache()
        self.L1D0cache.create(options.l1d_size, options.l1d_assoc, options)
        self.L1D1cache = L1Cache()
        self.L1D1cache.create(options.l1d_size, options.l1d_assoc, options)
        self.L2cache = L2Cache()
        self.L2cache.create(options.l2_size, options.l2_assoc, options)

        # Sequencer for core 0: shared L1I, private L1D0.
        self.sequencer = RubySequencer()
        self.sequencer.version = self.seqCount()
        self.sequencer.icache = self.L1Icache
        self.sequencer.dcache = self.L1D0cache
        self.sequencer.ruby_system = ruby_system
        self.sequencer.coreid = 0
        self.sequencer.is_cpu_sequencer = True

        # Sequencer for core 1: shared L1I, private L1D1.
        self.sequencer1 = RubySequencer()
        self.sequencer1.version = self.seqCount()
        self.sequencer1.icache = self.L1Icache
        self.sequencer1.dcache = self.L1D1cache
        self.sequencer1.ruby_system = ruby_system
        self.sequencer1.coreid = 1
        self.sequencer1.is_cpu_sequencer = True

        # Latency to the directory comes from --cpu-to-dir-latency.
        self.issue_latency = options.cpu_to_dir_latency
        # send_evicts presumably decides whether L1 evictions are
        # forwarded to the core -- defined in Ruby.py, confirm there.
        self.send_evictions = send_evicts(options)

        self.ruby_system = ruby_system

        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency
+
def define_options(parser):
    """Register the command-line options this constructor consumes."""
    # Latency, in cycles, from a CP core to the directory.
    parser.add_option("--cpu-to-dir-latency", type="int", default=15)
+
def construct(options, system, ruby_system):
    """Build the CPU-side (CP) portion of the system: one CPCntrl per
    pair of CPU cores, wired to the Ruby network.

    Returns (cpu_sequencers, cpuCluster).

    Fixed: the protocol guard chained '!=' tests with 'or', which is a
    tautology (a single value cannot equal all three protocols), so the
    script panicked even when a VIPER protocol WAS built.  Use 'not in'
    to reject only non-VIPER protocols.
    """
    if buildEnv['PROTOCOL'] not in ('GPU_VIPER',
                                    'GPU_VIPER_Region',
                                    'GPU_VIPER_Baseline'):
        panic("This script requires VIPER based protocols \
        to be built.")
    cpu_sequencers = []
    cpuCluster = Cluster(name="CPU Cluster", extBW = 8, intBW=8) # 16 GB/s
    # Two CPU cores share each CP controller; round up for odd counts.
    for i in xrange((options.num_cpus + 1) / 2):

        cp_cntrl = CPCntrl()
        cp_cntrl.create(options, ruby_system, system)

        # Connect the CP controllers to the ruby network
        cp_cntrl.requestFromCore = ruby_system.network.slave
        cp_cntrl.responseFromCore = ruby_system.network.slave
        cp_cntrl.unblockFromCore = ruby_system.network.slave
        cp_cntrl.probeToCore = ruby_system.network.master
        cp_cntrl.responseToCore = ruby_system.network.master

        # Bind the controller as a child of system under a unique name
        # (setattr replaces the original exec-based assignment).
        setattr(system, 'cp_cntrl%d' % i, cp_cntrl)
        #
        # Add controllers and sequencers to the appropriate lists
        #
        cpu_sequencers.extend([cp_cntrl.sequencer, cp_cntrl.sequencer1])
        cpuCluster.add(cp_cntrl)
    return cpu_sequencers, cpuCluster
diff --git a/configs/ruby/GPU_RfO.py b/configs/ruby/GPU_RfO.py
new file mode 100644
index 000000000..bb14252f3
--- /dev/null
+++ b/configs/ruby/GPU_RfO.py
@@ -0,0 +1,751 @@
+#
+# Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Author: Lisa Hsu
+#
+
+import math
+import m5
+from m5.objects import *
+from m5.defines import buildEnv
+from Ruby import create_topology
+from Ruby import send_evicts
+
+from Cluster import Cluster
+from Crossbar import Crossbar
+
+# Mixin providing monotonically increasing id counters for controllers.
+# seqCount/cntrlCount are global across all subclasses (stored on
+# CntrlBase itself); versionCount is per concrete subclass.
+class CntrlBase:
+    _seqs = 0
+    @classmethod
+    def seqCount(cls):
+        # Use SeqCount not class since we need global count
+        CntrlBase._seqs += 1
+        return CntrlBase._seqs - 1
+
+    _cntrls = 0
+    @classmethod
+    def cntrlCount(cls):
+        # Use CntlCount not class since we need global count
+        CntrlBase._cntrls += 1
+        return CntrlBase._cntrls - 1
+
+    _version = 0
+    @classmethod
+    def versionCount(cls):
+        cls._version += 1 # Use count for this particular type
+        return cls._version - 1
+
+# Directory cache for the TCC: sized to cover the TCC bank plus all TCP
+# capacity (scaled by tcc_dir_factor), split across num_tccs banks.
+class TccDirCache(RubyCache):
+    size = "512kB"
+    assoc = 16
+    resourceStalls = False
+    def create(self, options):
+        self.size = MemorySize(options.tcc_size)
+        self.size.value += (options.num_compute_units *
+                            (MemorySize(options.tcp_size).value) *
+                            options.tcc_dir_factor) / long(options.num_tccs)
+        # Skip the bank-interleaving bits when indexing this cache.
+        self.start_index_bit = math.log(options.cacheline_size, 2) + \
+                               math.log(options.num_tccs, 2)
+        self.replacement_policy = PseudoLRUReplacementPolicy()
+
+# CPU core-pair L1 data cache, sized from the standard l1d options.
+class L1DCache(RubyCache):
+    resourceStalls = False
+    def create(self, options):
+        self.size = MemorySize(options.l1d_size)
+        self.assoc = options.l1d_assoc
+        self.replacement_policy = PseudoLRUReplacementPolicy()
+
+# CPU core-pair L1 instruction cache, sized from the standard l1i options.
+class L1ICache(RubyCache):
+    resourceStalls = False
+    def create(self, options):
+        self.size = MemorySize(options.l1i_size)
+        self.assoc = options.l1i_assoc
+        self.replacement_policy = PseudoLRUReplacementPolicy()
+
+# Shared L2 cache for a CPU core pair, sized from the standard l2 options.
+class L2Cache(RubyCache):
+    resourceStalls = False
+    def create(self, options):
+        self.size = MemorySize(options.l2_size)
+        self.assoc = options.l2_assoc
+        self.replacement_policy = PseudoLRUReplacementPolicy()
+
+
+# CorePair controller: one L1I + two L1D caches + shared L2 serving two
+# CPU cores, each core getting its own RubySequencer (coreid 0/1).
+class CPCntrl(CorePair_Controller, CntrlBase):
+
+    def create(self, options, ruby_system, system):
+        self.version = self.versionCount()
+
+        self.L1Icache = L1ICache()
+        self.L1Icache.create(options)
+        self.L1D0cache = L1DCache()
+        self.L1D0cache.create(options)
+        self.L1D1cache = L1DCache()
+        self.L1D1cache.create(options)
+        self.L2cache = L2Cache()
+        self.L2cache.create(options)
+
+        # Sequencer for core 0: shares the L1I, owns L1D0.
+        self.sequencer = RubySequencer()
+        self.sequencer.icache_hit_latency = 2
+        self.sequencer.dcache_hit_latency = 2
+        self.sequencer.version = self.seqCount()
+        self.sequencer.icache = self.L1Icache
+        self.sequencer.dcache = self.L1D0cache
+        self.sequencer.ruby_system = ruby_system
+        self.sequencer.coreid = 0
+        self.sequencer.is_cpu_sequencer = True
+
+        # Sequencer for core 1: shares the L1I, owns L1D1.
+        self.sequencer1 = RubySequencer()
+        self.sequencer1.version = self.seqCount()
+        self.sequencer1.icache = self.L1Icache
+        self.sequencer1.dcache = self.L1D1cache
+        self.sequencer1.icache_hit_latency = 2
+        self.sequencer1.dcache_hit_latency = 2
+        self.sequencer1.ruby_system = ruby_system
+        self.sequencer1.coreid = 1
+        self.sequencer1.is_cpu_sequencer = True
+
+        self.issue_latency = options.cpu_to_dir_latency
+        self.send_evictions = send_evicts(options)
+
+        self.ruby_system = ruby_system
+
+        if options.recycle_latency:
+            self.recycle_latency = options.recycle_latency
+
+# GPU per-CU L1 (Texture Cache per Pipe); size comes from --tcp-size.
+class TCPCache(RubyCache):
+    assoc = 8
+    dataArrayBanks = 16
+    tagArrayBanks = 4
+    dataAccessLatency = 4
+    tagAccessLatency = 1
+    def create(self, options):
+        self.size = MemorySize(options.tcp_size)
+        self.replacement_policy = PseudoLRUReplacementPolicy()
+
+# GPU TCP (L1) controller.  create() builds a compute-unit TCP that
+# issues through a GPU coalescer; createCP() builds a command-processor
+# variant that issues through a plain sequencer instead
+# (use_seq_not_coal = True) and sets no coalescer request limit.
+class TCPCntrl(TCP_Controller, CntrlBase):
+
+    def create(self, options, ruby_system, system):
+        self.version = self.versionCount()
+
+        self.L1cache = TCPCache(tagAccessLatency = options.TCP_latency)
+        self.L1cache.resourceStalls = options.no_resource_stalls
+        self.L1cache.create(options)
+
+        self.coalescer = RubyGPUCoalescer()
+        self.coalescer.version = self.seqCount()
+        self.coalescer.icache = self.L1cache
+        self.coalescer.dcache = self.L1cache
+        self.coalescer.ruby_system = ruby_system
+        self.coalescer.support_inst_reqs = False
+        self.coalescer.is_cpu_sequencer = False
+        # One outstanding request per work-item the CU can have in flight.
+        self.coalescer.max_outstanding_requests = options.simds_per_cu * \
+                                                  options.wfs_per_simd * \
+                                                  options.wf_size
+
+        self.sequencer = RubySequencer()
+        self.sequencer.version = self.seqCount()
+        self.sequencer.icache = self.L1cache
+        self.sequencer.dcache = self.L1cache
+        self.sequencer.ruby_system = ruby_system
+        self.sequencer.is_cpu_sequencer = True
+
+        self.use_seq_not_coal = False
+
+        self.ruby_system = ruby_system
+
+        if options.recycle_latency:
+            self.recycle_latency = options.recycle_latency
+
+    # Command-processor flavor: same caches, but memory requests go
+    # through self.sequencer rather than the coalescer.
+    def createCP(self, options, ruby_system, system):
+        self.version = self.versionCount()
+
+        self.L1cache = TCPCache(tagAccessLatency = options.TCP_latency)
+        self.L1cache.resourceStalls = options.no_resource_stalls
+        self.L1cache.create(options)
+
+        self.coalescer = RubyGPUCoalescer()
+        self.coalescer.version = self.seqCount()
+        self.coalescer.icache = self.L1cache
+        self.coalescer.dcache = self.L1cache
+        self.coalescer.ruby_system = ruby_system
+        self.coalescer.support_inst_reqs = False
+        self.coalescer.is_cpu_sequencer = False
+
+        self.sequencer = RubySequencer()
+        self.sequencer.version = self.seqCount()
+        self.sequencer.icache = self.L1cache
+        self.sequencer.dcache = self.L1cache
+        self.sequencer.ruby_system = ruby_system
+        self.sequencer.is_cpu_sequencer = True
+
+        self.use_seq_not_coal = True
+
+        self.ruby_system = ruby_system
+
+        if options.recycle_latency:
+            self.recycle_latency = options.recycle_latency
+
+# GPU Sequencer Cache (instruction fetch L1).
+# NOTE(review): create() only sets the replacement policy; size/assoc
+# always come from the class defaults (32kB/8) and ignore any command
+# line options -- confirm this is intentional.
+class SQCCache(RubyCache):
+    size = "32kB"
+    assoc = 8
+    dataArrayBanks = 16
+    tagArrayBanks = 4
+    dataAccessLatency = 4
+    tagAccessLatency = 1
+    def create(self, options):
+        self.replacement_policy = PseudoLRUReplacementPolicy()
+
+# GPU SQC (instruction-fetch) controller: instruction-only sequencer
+# (support_data_reqs = False).  createCP() is the command-processor
+# variant.
+class SQCCntrl(SQC_Controller, CntrlBase):
+
+    def create(self, options, ruby_system, system):
+        self.version = self.versionCount()
+
+        self.L1cache = SQCCache()
+        self.L1cache.create(options)
+        self.L1cache.resourceStalls = options.no_resource_stalls
+
+        self.sequencer = RubySequencer()
+
+        self.sequencer.version = self.seqCount()
+        self.sequencer.icache = self.L1cache
+        self.sequencer.dcache = self.L1cache
+        self.sequencer.ruby_system = ruby_system
+        self.sequencer.support_data_reqs = False
+        self.sequencer.is_cpu_sequencer = False
+
+        self.ruby_system = ruby_system
+
+        if options.recycle_latency:
+            self.recycle_latency = options.recycle_latency
+
+    def createCP(self, options, ruby_system, system):
+        self.version = self.versionCount()
+
+        self.L1cache = SQCCache()
+        self.L1cache.create(options)
+        self.L1cache.resourceStalls = options.no_resource_stalls
+
+        self.sequencer = RubySequencer()
+
+        self.sequencer.version = self.seqCount()
+        self.sequencer.icache = self.L1cache
+        self.sequencer.dcache = self.L1cache
+        self.sequencer.ruby_system = ruby_system
+        self.sequencer.support_data_reqs = False
+        # NOTE(review): unlike create(), is_cpu_sequencer is left at its
+        # default here -- confirm whether the omission is intentional.
+
+        self.ruby_system = ruby_system
+
+        if options.recycle_latency:
+            self.recycle_latency = options.recycle_latency
+
+
+# GPU L2 (Texture Cache per Channel).  The aggregate --tcc-size is split
+# across num_tccs banks; a floor of 128 sets per bank is enforced.
+class TCC(RubyCache):
+    assoc = 16
+    dataAccessLatency = 8
+    tagAccessLatency = 2
+    resourceStalls = True
+    def create(self, options):
+        self.size = MemorySize(options.tcc_size)
+        self.size = self.size / options.num_tccs
+        self.dataArrayBanks = 256 / options.num_tccs #number of data banks
+        self.tagArrayBanks = 256 / options.num_tccs #number of tag banks
+        # Keep at least 128 sets per bank (sets = size / assoc).
+        if ((self.size.value / long(self.assoc)) < 128):
+            self.size.value = long(128 * self.assoc)
+        # Skip the bank-interleaving bits when indexing this cache.
+        self.start_index_bit = math.log(options.cacheline_size, 2) + \
+                               math.log(options.num_tccs, 2)
+        self.replacement_policy = PseudoLRUReplacementPolicy()
+
+# GPU TCC (L2) controller; communicates with its TCC directory over
+# wire buffers rather than the general network.
+class TCCCntrl(TCC_Controller, CntrlBase):
+    def create(self, options, ruby_system, system):
+        self.version = self.versionCount()
+        self.L2cache = TCC()
+        self.L2cache.create(options)
+        self.l2_response_latency = options.TCC_latency
+
+        self.number_of_TBEs = 2048
+
+        self.ruby_system = ruby_system
+
+        if options.recycle_latency:
+            self.recycle_latency = options.recycle_latency
+
+    # Attach the six point-to-point wire buffers shared with TCCDirCntrl.
+    def connectWireBuffers(self, req_to_tccdir, resp_to_tccdir,
+                           tcc_unblock_to_tccdir, req_to_tcc,
+                           probe_to_tcc, resp_to_tcc):
+        self.w_reqToTCCDir = req_to_tccdir
+        self.w_respToTCCDir = resp_to_tccdir
+        self.w_TCCUnblockToTCCDir = tcc_unblock_to_tccdir
+        self.w_reqToTCC = req_to_tcc
+        self.w_probeToTCC = probe_to_tcc
+        self.w_respToTCC = resp_to_tcc
+
+# TCC directory controller; pairs with a TCCCntrl via the same six wire
+# buffers (see connectWireBuffers) and tracks state in a TccDirCache.
+class TCCDirCntrl(TCCdir_Controller, CntrlBase):
+    def create(self, options, ruby_system, system):
+        self.version = self.versionCount()
+
+        self.directory = TccDirCache()
+        self.directory.create(options)
+
+        self.number_of_TBEs = 1024
+
+        self.ruby_system = ruby_system
+
+        if options.recycle_latency:
+            self.recycle_latency = options.recycle_latency
+
+    # Attach the six point-to-point wire buffers shared with TCCCntrl.
+    def connectWireBuffers(self, req_to_tccdir, resp_to_tccdir,
+                           tcc_unblock_to_tccdir, req_to_tcc,
+                           probe_to_tcc, resp_to_tcc):
+        self.w_reqToTCCDir = req_to_tccdir
+        self.w_respToTCCDir = resp_to_tccdir
+        self.w_TCCUnblockToTCCDir = tcc_unblock_to_tccdir
+        self.w_reqToTCC = req_to_tcc
+        self.w_probeToTCC = probe_to_tcc
+        self.w_respToTCC = resp_to_tcc
+
+# System L3 cache slice, one per directory; capacity is split across
+# num_dirs slices.
+class L3Cache(RubyCache):
+    assoc = 8
+    dataArrayBanks = 256
+    tagArrayBanks = 256
+
+    def create(self, options, ruby_system, system):
+        self.size = MemorySize(options.l3_size)
+        self.size.value /= options.num_dirs
+        # NOTE(review): the bank counts are divided by num_dirs twice
+        # below (net effect: banks / num_dirs**2).  Looks like a
+        # copy-paste duplication -- confirm whether the quadratic
+        # scaling is intended before changing it.
+        self.dataArrayBanks /= options.num_dirs
+        self.tagArrayBanks /= options.num_dirs
+        self.dataArrayBanks /= options.num_dirs
+        self.tagArrayBanks /= options.num_dirs
+        self.dataAccessLatency = options.l3_data_latency
+        self.tagAccessLatency = options.l3_tag_latency
+        self.resourceStalls = options.no_resource_stalls
+        self.replacement_policy = PseudoLRUReplacementPolicy()
+
+# L3 cache controller; wired to its directory through six wire buffers.
+class L3Cntrl(L3Cache_Controller, CntrlBase):
+    def create(self, options, ruby_system, system):
+        self.version = self.versionCount()
+        self.L3cache = L3Cache()
+        self.L3cache.create(options, ruby_system, system)
+
+        # Response latency is bounded by the slower of the two arrays.
+        self.l3_response_latency = max(self.L3cache.dataAccessLatency,
+                                       self.L3cache.tagAccessLatency)
+        self.ruby_system = ruby_system
+
+        if options.recycle_latency:
+            self.recycle_latency = options.recycle_latency
+
+    # Attach the six point-to-point wire buffers shared with DirCntrl.
+    def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir,
+                           req_to_l3, probe_to_l3, resp_to_l3):
+        self.reqToDir = req_to_dir
+        self.respToDir = resp_to_dir
+        self.l3UnblockToDir = l3_unblock_to_dir
+        self.reqToL3 = req_to_l3
+        self.probeToL3 = probe_to_l3
+        self.respToL3 = resp_to_l3
+
+# Directory memory slice: an equal share of physical memory per directory.
+class DirMem(RubyDirectoryMemory, CntrlBase):
+    def create(self, options, ruby_system, system):
+        self.version = self.versionCount()
+
+        phys_mem_size = AddrRange(options.mem_size).size()
+        mem_module_size = phys_mem_size / options.num_dirs
+        dir_size = MemorySize('0B')
+        dir_size.value = mem_module_size
+        self.size = dir_size
+
+# Directory controller: owns a DirMem slice plus an L3 slice, and talks
+# to the L3 controller over six wire buffers.
+class DirCntrl(Directory_Controller, CntrlBase):
+    def create(self, options, ruby_system, system):
+        self.version = self.versionCount()
+
+        self.response_latency = 30
+
+        self.directory = DirMem()
+        self.directory.create(options, ruby_system, system)
+
+        self.L3CacheMemory = L3Cache()
+        self.L3CacheMemory.create(options, ruby_system, system)
+
+        # Hit latency is bounded by the slower of the two L3 arrays.
+        self.l3_hit_latency = max(self.L3CacheMemory.dataAccessLatency,
+                                  self.L3CacheMemory.tagAccessLatency)
+
+        self.number_of_TBEs = options.num_tbes
+
+        self.ruby_system = ruby_system
+
+        if options.recycle_latency:
+            self.recycle_latency = options.recycle_latency
+
+    # Attach the six point-to-point wire buffers shared with L3Cntrl.
+    def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir,
+                           req_to_l3, probe_to_l3, resp_to_l3):
+        self.reqToDir = req_to_dir
+        self.respToDir = resp_to_dir
+        self.l3UnblockToDir = l3_unblock_to_dir
+        self.reqToL3 = req_to_l3
+        self.probeToL3 = probe_to_l3
+        self.respToL3 = resp_to_l3
+
+
+
+# Register the GPU_RfO protocol's command-line options (cache sizes,
+# latencies, bank counts) on the option parser.
+def define_options(parser):
+    parser.add_option("--num-subcaches", type="int", default=4)
+    parser.add_option("--l3-data-latency", type="int", default=20)
+    parser.add_option("--l3-tag-latency", type="int", default=15)
+    parser.add_option("--cpu-to-dir-latency", type="int", default=15)
+    parser.add_option("--gpu-to-dir-latency", type="int", default=160)
+    # store_false with default True: passing the flag DISABLES resource
+    # stalls (the option name reads inverted relative to its value).
+    parser.add_option("--no-resource-stalls", action="store_false",
+                      default=True)
+    parser.add_option("--num-tbes", type="int", default=256)
+    parser.add_option("--l2-latency", type="int", default=50) # load to use
+    parser.add_option("--num-tccs", type="int", default=1,
+                      help="number of TCC directories and banks in the GPU")
+    parser.add_option("--TCP_latency", type="int", default=4,
+                      help="TCP latency")
+    parser.add_option("--TCC_latency", type="int", default=16,
+                      help="TCC latency")
+    parser.add_option("--tcc-size", type='string', default='256kB',
+                      help="agregate tcc size")
+    parser.add_option("--tcp-size", type='string', default='16kB',
+                      help="tcp size")
+    parser.add_option("--tcc-dir-factor", type='int', default=4,
+                      help="TCCdir size = factor *(TCPs + TCC)")
+
+# Instantiate every controller for the GPU_RfO protocol, wire each one
+# to the Ruby network, and group them into a cluster hierarchy:
+# mainCluster { directories, cpuCluster { CP pairs }, gpuCluster { TCPs,
+# SQCs, TCC/TCCdir pairs } }.  Returns the sequencer list, the directory
+# list, and the top-level cluster for the topology builder.
+def create_system(options, full_system, system, dma_devices, ruby_system):
+    if buildEnv['PROTOCOL'] != 'GPU_RfO':
+        panic("This script requires the GPU_RfO protocol to be built.")
+
+    cpu_sequencers = []
+
+    #
+    # The ruby network creation expects the list of nodes in the system to be
+    # consistent with the NetDest list. Therefore the l1 controller nodes
+    # must be listed before the directory nodes and directory nodes before
+    # dma nodes, etc.
+    #
+    cp_cntrl_nodes = []
+    tcp_cntrl_nodes = []
+    sqc_cntrl_nodes = []
+    tcc_cntrl_nodes = []
+    tccdir_cntrl_nodes = []
+    dir_cntrl_nodes = []
+    l3_cntrl_nodes = []
+
+    #
+    # Must create the individual controllers before the network to ensure the
+    # controller constructors are called before the network constructor
+    #
+
+    # Number of address bits used to select among the TCC banks.
+    TCC_bits = int(math.log(options.num_tccs, 2))
+
+    # This is the base crossbar that connects the L3s, Dirs, and cpu/gpu
+    # Clusters
+    mainCluster = Cluster(extBW = 512, intBW = 512) # 1 TB/s
+    for i in xrange(options.num_dirs):
+
+        dir_cntrl = DirCntrl(TCC_select_num_bits = TCC_bits)
+        dir_cntrl.create(options, ruby_system, system)
+        dir_cntrl.number_of_TBEs = 2560 * options.num_compute_units
+        #Enough TBEs for all TCP TBEs
+
+        # Connect the Directory controller to the ruby network
+        dir_cntrl.requestFromCores = MessageBuffer(ordered = True)
+        dir_cntrl.requestFromCores.slave = ruby_system.network.master
+
+        dir_cntrl.responseFromCores = MessageBuffer()
+        dir_cntrl.responseFromCores.slave = ruby_system.network.master
+
+        dir_cntrl.unblockFromCores = MessageBuffer()
+        dir_cntrl.unblockFromCores.slave = ruby_system.network.master
+
+        dir_cntrl.probeToCore = MessageBuffer()
+        dir_cntrl.probeToCore.master = ruby_system.network.slave
+
+        dir_cntrl.responseToCore = MessageBuffer()
+        dir_cntrl.responseToCore.master = ruby_system.network.slave
+
+        dir_cntrl.triggerQueue = MessageBuffer(ordered = True)
+        dir_cntrl.L3triggerQueue = MessageBuffer(ordered = True)
+        dir_cntrl.responseFromMemory = MessageBuffer()
+
+        exec("system.dir_cntrl%d = dir_cntrl" % i)
+        dir_cntrl_nodes.append(dir_cntrl)
+
+        mainCluster.add(dir_cntrl)
+
+    # For an odd number of CPUs, still create the right number of controllers
+    cpuCluster = Cluster(extBW = 512, intBW = 512) # 1 TB/s
+    for i in xrange((options.num_cpus + 1) / 2):
+
+        cp_cntrl = CPCntrl()
+        cp_cntrl.create(options, ruby_system, system)
+
+        exec("system.cp_cntrl%d = cp_cntrl" % i)
+        #
+        # Add controllers and sequencers to the appropriate lists
+        #
+        cpu_sequencers.extend([cp_cntrl.sequencer, cp_cntrl.sequencer1])
+
+        # Connect the CP controllers and the network
+        cp_cntrl.requestFromCore = MessageBuffer()
+        cp_cntrl.requestFromCore.master = ruby_system.network.slave
+
+        cp_cntrl.responseFromCore = MessageBuffer()
+        cp_cntrl.responseFromCore.master = ruby_system.network.slave
+
+        cp_cntrl.unblockFromCore = MessageBuffer()
+        cp_cntrl.unblockFromCore.master = ruby_system.network.slave
+
+        cp_cntrl.probeToCore = MessageBuffer()
+        cp_cntrl.probeToCore.slave = ruby_system.network.master
+
+        cp_cntrl.responseToCore = MessageBuffer()
+        cp_cntrl.responseToCore.slave = ruby_system.network.master
+
+        cp_cntrl.mandatoryQueue = MessageBuffer()
+        cp_cntrl.triggerQueue = MessageBuffer(ordered = True)
+
+        cpuCluster.add(cp_cntrl)
+
+    gpuCluster = Cluster(extBW = 512, intBW = 512) # 1 TB/s
+
+    # One TCP (GPU L1) controller per compute unit; GPU memory requests
+    # enter Ruby through the coalescer, so that is what joins the
+    # sequencer list.
+    for i in xrange(options.num_compute_units):
+
+        tcp_cntrl = TCPCntrl(TCC_select_num_bits = TCC_bits,
+                             number_of_TBEs = 2560) # max outstanding requests
+        tcp_cntrl.create(options, ruby_system, system)
+
+        exec("system.tcp_cntrl%d = tcp_cntrl" % i)
+        #
+        # Add controllers and sequencers to the appropriate lists
+        #
+        cpu_sequencers.append(tcp_cntrl.coalescer)
+        tcp_cntrl_nodes.append(tcp_cntrl)
+
+        # Connect the TCP controller to the ruby network
+        tcp_cntrl.requestFromTCP = MessageBuffer(ordered = True)
+        tcp_cntrl.requestFromTCP.master = ruby_system.network.slave
+
+        tcp_cntrl.responseFromTCP = MessageBuffer(ordered = True)
+        tcp_cntrl.responseFromTCP.master = ruby_system.network.slave
+
+        tcp_cntrl.unblockFromCore = MessageBuffer(ordered = True)
+        tcp_cntrl.unblockFromCore.master = ruby_system.network.slave
+
+        tcp_cntrl.probeToTCP = MessageBuffer(ordered = True)
+        tcp_cntrl.probeToTCP.slave = ruby_system.network.master
+
+        tcp_cntrl.responseToTCP = MessageBuffer(ordered = True)
+        tcp_cntrl.responseToTCP.slave = ruby_system.network.master
+
+        tcp_cntrl.mandatoryQueue = MessageBuffer()
+
+        gpuCluster.add(tcp_cntrl)
+
+    # Instruction-fetch (SQC) controllers for the compute units.
+    for i in xrange(options.num_sqc):
+
+        sqc_cntrl = SQCCntrl(TCC_select_num_bits = TCC_bits)
+        sqc_cntrl.create(options, ruby_system, system)
+
+        exec("system.sqc_cntrl%d = sqc_cntrl" % i)
+        #
+        # Add controllers and sequencers to the appropriate lists
+        #
+        cpu_sequencers.append(sqc_cntrl.sequencer)
+
+        # Connect the SQC controller to the ruby network
+        sqc_cntrl.requestFromSQC = MessageBuffer(ordered = True)
+        sqc_cntrl.requestFromSQC.master = ruby_system.network.slave
+
+        sqc_cntrl.responseFromSQC = MessageBuffer(ordered = True)
+        sqc_cntrl.responseFromSQC.master = ruby_system.network.slave
+
+        sqc_cntrl.unblockFromCore = MessageBuffer(ordered = True)
+        sqc_cntrl.unblockFromCore.master = ruby_system.network.slave
+
+        sqc_cntrl.probeToSQC = MessageBuffer(ordered = True)
+        sqc_cntrl.probeToSQC.slave = ruby_system.network.master
+
+        sqc_cntrl.responseToSQC = MessageBuffer(ordered = True)
+        sqc_cntrl.responseToSQC.slave = ruby_system.network.master
+
+        sqc_cntrl.mandatoryQueue = MessageBuffer()
+
+        # SQC also in GPU cluster
+        gpuCluster.add(sqc_cntrl)
+
+    # Command processors: each gets its own CP-flavored TCP and SQC pair,
+    # numbered after the compute-unit controllers.
+    for i in xrange(options.numCPs):
+
+        tcp_cntrl = TCPCntrl(TCC_select_num_bits = TCC_bits,
+                             number_of_TBEs = 2560) # max outstanding requests
+        tcp_cntrl.createCP(options, ruby_system, system)
+
+        exec("system.tcp_cntrl%d = tcp_cntrl" % (options.num_compute_units + i))
+        #
+        # Add controllers and sequencers to the appropriate lists
+        #
+        cpu_sequencers.append(tcp_cntrl.sequencer)
+        tcp_cntrl_nodes.append(tcp_cntrl)
+
+        # Connect the TCP controller to the ruby network
+        tcp_cntrl.requestFromTCP = MessageBuffer(ordered = True)
+        tcp_cntrl.requestFromTCP.master = ruby_system.network.slave
+
+        tcp_cntrl.responseFromTCP = MessageBuffer(ordered = True)
+        tcp_cntrl.responseFromTCP.master = ruby_system.network.slave
+
+        tcp_cntrl.unblockFromCore = MessageBuffer(ordered = True)
+        tcp_cntrl.unblockFromCore.master = ruby_system.network.slave
+
+        tcp_cntrl.probeToTCP = MessageBuffer(ordered = True)
+        tcp_cntrl.probeToTCP.slave = ruby_system.network.master
+
+        tcp_cntrl.responseToTCP = MessageBuffer(ordered = True)
+        tcp_cntrl.responseToTCP.slave = ruby_system.network.master
+
+        tcp_cntrl.mandatoryQueue = MessageBuffer()
+
+        gpuCluster.add(tcp_cntrl)
+
+        sqc_cntrl = SQCCntrl(TCC_select_num_bits = TCC_bits)
+        sqc_cntrl.createCP(options, ruby_system, system)
+
+        exec("system.sqc_cntrl%d = sqc_cntrl" % (options.num_compute_units + i))
+        #
+        # Add controllers and sequencers to the appropriate lists
+        #
+        cpu_sequencers.append(sqc_cntrl.sequencer)
+
+        # Connect the SQC controller to the ruby network
+        sqc_cntrl.requestFromSQC = MessageBuffer(ordered = True)
+        sqc_cntrl.requestFromSQC.master = ruby_system.network.slave
+
+        sqc_cntrl.responseFromSQC = MessageBuffer(ordered = True)
+        sqc_cntrl.responseFromSQC.master = ruby_system.network.slave
+
+        sqc_cntrl.unblockFromCore = MessageBuffer(ordered = True)
+        sqc_cntrl.unblockFromCore.master = ruby_system.network.slave
+
+        sqc_cntrl.probeToSQC = MessageBuffer(ordered = True)
+        sqc_cntrl.probeToSQC.slave = ruby_system.network.master
+
+        sqc_cntrl.responseToSQC = MessageBuffer(ordered = True)
+        sqc_cntrl.responseToSQC.slave = ruby_system.network.master
+
+        sqc_cntrl.mandatoryQueue = MessageBuffer()
+
+        # SQC also in GPU cluster
+        gpuCluster.add(sqc_cntrl)
+
+    # One TCC/TCCdir pair per TCC bank, linked by private wire buffers.
+    for i in xrange(options.num_tccs):
+
+        tcc_cntrl = TCCCntrl(TCC_select_num_bits = TCC_bits,
+                             number_of_TBEs = options.num_compute_units * 2560)
+        #Enough TBEs for all TCP TBEs
+        tcc_cntrl.create(options, ruby_system, system)
+        tcc_cntrl_nodes.append(tcc_cntrl)
+
+        tccdir_cntrl = TCCDirCntrl(TCC_select_num_bits = TCC_bits,
+                              number_of_TBEs = options.num_compute_units * 2560)
+        #Enough TBEs for all TCP TBEs
+        tccdir_cntrl.create(options, ruby_system, system)
+        tccdir_cntrl_nodes.append(tccdir_cntrl)
+
+        exec("system.tcc_cntrl%d = tcc_cntrl" % i)
+        exec("system.tccdir_cntrl%d = tccdir_cntrl" % i)
+
+        # connect all of the wire buffers between L3 and dirs up
+        req_to_tccdir = RubyWireBuffer()
+        resp_to_tccdir = RubyWireBuffer()
+        tcc_unblock_to_tccdir = RubyWireBuffer()
+        req_to_tcc = RubyWireBuffer()
+        probe_to_tcc = RubyWireBuffer()
+        resp_to_tcc = RubyWireBuffer()
+
+        tcc_cntrl.connectWireBuffers(req_to_tccdir, resp_to_tccdir,
+                                     tcc_unblock_to_tccdir, req_to_tcc,
+                                     probe_to_tcc, resp_to_tcc)
+        tccdir_cntrl.connectWireBuffers(req_to_tccdir, resp_to_tccdir,
+                                        tcc_unblock_to_tccdir, req_to_tcc,
+                                        probe_to_tcc, resp_to_tcc)
+
+        # Connect the TCC controller to the ruby network
+        tcc_cntrl.responseFromTCC = MessageBuffer(ordered = True)
+        tcc_cntrl.responseFromTCC.master = ruby_system.network.slave
+
+        tcc_cntrl.responseToTCC = MessageBuffer(ordered = True)
+        tcc_cntrl.responseToTCC.slave = ruby_system.network.master
+
+        # Connect the TCC Dir controller to the ruby network
+        tccdir_cntrl.requestFromTCP = MessageBuffer(ordered = True)
+        tccdir_cntrl.requestFromTCP.slave = ruby_system.network.master
+
+        tccdir_cntrl.responseFromTCP = MessageBuffer(ordered = True)
+        tccdir_cntrl.responseFromTCP.slave = ruby_system.network.master
+
+        tccdir_cntrl.unblockFromTCP = MessageBuffer(ordered = True)
+        tccdir_cntrl.unblockFromTCP.slave = ruby_system.network.master
+
+        tccdir_cntrl.probeToCore = MessageBuffer(ordered = True)
+        tccdir_cntrl.probeToCore.master = ruby_system.network.slave
+
+        tccdir_cntrl.responseToCore = MessageBuffer(ordered = True)
+        tccdir_cntrl.responseToCore.master = ruby_system.network.slave
+
+        tccdir_cntrl.probeFromNB = MessageBuffer()
+        tccdir_cntrl.probeFromNB.slave = ruby_system.network.master
+
+        tccdir_cntrl.responseFromNB = MessageBuffer()
+        tccdir_cntrl.responseFromNB.slave = ruby_system.network.master
+
+        tccdir_cntrl.requestToNB = MessageBuffer()
+        tccdir_cntrl.requestToNB.master = ruby_system.network.slave
+
+        tccdir_cntrl.responseToNB = MessageBuffer()
+        tccdir_cntrl.responseToNB.master = ruby_system.network.slave
+
+        tccdir_cntrl.unblockToNB = MessageBuffer()
+        tccdir_cntrl.unblockToNB.master = ruby_system.network.slave
+
+        tccdir_cntrl.triggerQueue = MessageBuffer(ordered = True)
+
+        # TCC cntrls added to the GPU cluster
+        gpuCluster.add(tcc_cntrl)
+        gpuCluster.add(tccdir_cntrl)
+
+    # Assuming no DMA devices
+    assert(len(dma_devices) == 0)
+
+    # Add cpu/gpu clusters to main cluster
+    mainCluster.add(cpuCluster)
+    mainCluster.add(gpuCluster)
+
+    ruby_system.network.number_of_virtual_networks = 10
+
+    return (cpu_sequencers, dir_cntrl_nodes, mainCluster)
diff --git a/configs/ruby/GPU_VIPER.py b/configs/ruby/GPU_VIPER.py
new file mode 100644
index 000000000..f1384c404
--- /dev/null
+++ b/configs/ruby/GPU_VIPER.py
@@ -0,0 +1,674 @@
+#
+# Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Author: Lisa Hsu
+#
+
+import math
+import m5
+from m5.objects import *
+from m5.defines import buildEnv
+from Ruby import create_topology
+from Ruby import send_evicts
+
+from Cluster import Cluster
+from Crossbar import Crossbar
+
+# Mixin providing monotonically increasing id counters for controllers.
+# seqCount/cntrlCount are global across all subclasses (stored on
+# CntrlBase itself); versionCount is per concrete subclass.
+class CntrlBase:
+    _seqs = 0
+    @classmethod
+    def seqCount(cls):
+        # Use SeqCount not class since we need global count
+        CntrlBase._seqs += 1
+        return CntrlBase._seqs - 1
+
+    _cntrls = 0
+    @classmethod
+    def cntrlCount(cls):
+        # Use CntlCount not class since we need global count
+        CntrlBase._cntrls += 1
+        return CntrlBase._cntrls - 1
+
+    _version = 0
+    @classmethod
+    def versionCount(cls):
+        cls._version += 1 # Use count for this particular type
+        return cls._version - 1
+
+# Generic CPU L1 cache (used for both I and D sides); caller supplies
+# size/assoc at create() time.
+class L1Cache(RubyCache):
+    resourceStalls = False
+    dataArrayBanks = 2
+    tagArrayBanks = 2
+    dataAccessLatency = 1
+    tagAccessLatency = 1
+    def create(self, size, assoc, options):
+        self.size = MemorySize(size)
+        self.assoc = assoc
+        self.replacement_policy = PseudoLRUReplacementPolicy()
+
+# Shared CPU L2 cache for a core pair; caller supplies size/assoc.
+class L2Cache(RubyCache):
+    resourceStalls = False
+    assoc = 16
+    dataArrayBanks = 16
+    tagArrayBanks = 16
+    def create(self, size, assoc, options):
+        self.size = MemorySize(size)
+        self.assoc = assoc
+        self.replacement_policy = PseudoLRUReplacementPolicy()
+
+# CorePair controller for VIPER: one L1I + two L1D caches + shared L2
+# serving two CPU cores, each core with its own sequencer (coreid 0/1).
+class CPCntrl(CorePair_Controller, CntrlBase):
+
+    def create(self, options, ruby_system, system):
+        self.version = self.versionCount()
+
+        self.L1Icache = L1Cache()
+        self.L1Icache.create(options.l1i_size, options.l1i_assoc, options)
+        self.L1D0cache = L1Cache()
+        self.L1D0cache.create(options.l1d_size, options.l1d_assoc, options)
+        self.L1D1cache = L1Cache()
+        self.L1D1cache.create(options.l1d_size, options.l1d_assoc, options)
+        self.L2cache = L2Cache()
+        self.L2cache.create(options.l2_size, options.l2_assoc, options)
+
+        # Sequencer for core 0: shares the L1I, owns L1D0.
+        self.sequencer = RubySequencer()
+        self.sequencer.version = self.seqCount()
+        self.sequencer.icache = self.L1Icache
+        self.sequencer.dcache = self.L1D0cache
+        self.sequencer.ruby_system = ruby_system
+        self.sequencer.coreid = 0
+        self.sequencer.is_cpu_sequencer = True
+
+        # Sequencer for core 1: shares the L1I, owns L1D1.
+        self.sequencer1 = RubySequencer()
+        self.sequencer1.version = self.seqCount()
+        self.sequencer1.icache = self.L1Icache
+        self.sequencer1.dcache = self.L1D1cache
+        self.sequencer1.ruby_system = ruby_system
+        self.sequencer1.coreid = 1
+        self.sequencer1.is_cpu_sequencer = True
+
+        self.issue_latency = options.cpu_to_dir_latency
+        self.send_evictions = send_evicts(options)
+
+        self.ruby_system = ruby_system
+
+        if options.recycle_latency:
+            self.recycle_latency = options.recycle_latency
+
+# GPU per-CU L1 (TCP) for VIPER; size/assoc come from --tcp-size /
+# --tcp-assoc, and resourceStalls is taken from no_tcc_resource_stalls.
+class TCPCache(RubyCache):
+    size = "16kB"
+    assoc = 16
+    dataArrayBanks = 16 #number of data banks
+    tagArrayBanks = 16 #number of tag banks
+    dataAccessLatency = 4
+    tagAccessLatency = 1
+    def create(self, options):
+        self.size = MemorySize(options.tcp_size)
+        self.assoc = options.tcp_assoc
+        self.resourceStalls = options.no_tcc_resource_stalls
+        self.replacement_policy = PseudoLRUReplacementPolicy()
+
+# VIPER TCP (GPU L1) controller.  create() builds the compute-unit
+# variant (requests issue via the VIPER coalescer); createCP() builds
+# the command-processor variant (use_seq_not_coal = True).
+class TCPCntrl(TCP_Controller, CntrlBase):
+
+    def create(self, options, ruby_system, system):
+        self.version = self.versionCount()
+
+        self.L1cache = TCPCache(tagAccessLatency = options.TCP_latency,
+                                dataAccessLatency = options.TCP_latency)
+        # NOTE(review): this resourceStalls assignment is dead -- the
+        # create() call on the next line overwrites it with
+        # options.no_tcc_resource_stalls (see TCPCache.create).  Confirm
+        # which option should win.
+        self.L1cache.resourceStalls = options.no_resource_stalls
+        self.L1cache.create(options)
+        self.issue_latency = 1
+
+        self.coalescer = VIPERCoalescer()
+        self.coalescer.version = self.seqCount()
+        self.coalescer.icache = self.L1cache
+        self.coalescer.dcache = self.L1cache
+        self.coalescer.ruby_system = ruby_system
+        self.coalescer.support_inst_reqs = False
+        self.coalescer.is_cpu_sequencer = False
+
+        self.sequencer = RubySequencer()
+        self.sequencer.version = self.seqCount()
+        self.sequencer.icache = self.L1cache
+        self.sequencer.dcache = self.L1cache
+        self.sequencer.ruby_system = ruby_system
+        self.sequencer.is_cpu_sequencer = True
+
+        self.use_seq_not_coal = False
+
+        self.ruby_system = ruby_system
+
+        if options.recycle_latency:
+            self.recycle_latency = options.recycle_latency
+
+    # Command-processor flavor: identical wiring, but memory requests go
+    # through self.sequencer rather than the coalescer.
+    def createCP(self, options, ruby_system, system):
+        self.version = self.versionCount()
+
+        self.L1cache = TCPCache(tagAccessLatency = options.TCP_latency,
+                                dataAccessLatency = options.TCP_latency)
+        # NOTE(review): dead assignment, overwritten by create() below
+        # (same as in create() above).
+        self.L1cache.resourceStalls = options.no_resource_stalls
+        self.L1cache.create(options)
+        self.issue_latency = 1
+
+        self.coalescer = VIPERCoalescer()
+        self.coalescer.version = self.seqCount()
+        self.coalescer.icache = self.L1cache
+        self.coalescer.dcache = self.L1cache
+        self.coalescer.ruby_system = ruby_system
+        self.coalescer.support_inst_reqs = False
+        self.coalescer.is_cpu_sequencer = False
+
+        self.sequencer = RubySequencer()
+        self.sequencer.version = self.seqCount()
+        self.sequencer.icache = self.L1cache
+        self.sequencer.dcache = self.L1cache
+        self.sequencer.ruby_system = ruby_system
+        self.sequencer.is_cpu_sequencer = True
+
+        self.use_seq_not_coal = True
+
+        self.ruby_system = ruby_system
+
+        if options.recycle_latency:
+            self.recycle_latency = options.recycle_latency
+
+class SQCCache(RubyCache):
+    # SQC (Sequencer Cache): the GPU's shared L1 instruction cache.
+    dataArrayBanks = 8
+    tagArrayBanks = 8
+    dataAccessLatency = 1
+    tagAccessLatency = 1
+
+    def create(self, options):
+        # Size/assoc come from the --sqc-size / --sqc-assoc options.
+        self.size = MemorySize(options.sqc_size)
+        self.assoc = options.sqc_assoc
+        self.replacement_policy = PseudoLRUReplacementPolicy()
+
+
+class SQCCntrl(SQC_Controller, CntrlBase):
+    # Controller for the GPU instruction (SQC) cache: instruction fetches
+    # only, so the sequencer disables data requests.
+
+    def create(self, options, ruby_system, system):
+        self.version = self.versionCount()
+
+        self.L1cache = SQCCache()
+        self.L1cache.create(options)
+        self.L1cache.resourceStalls = options.no_resource_stalls
+
+        self.sequencer = RubySequencer()
+
+        self.sequencer.version = self.seqCount()
+        self.sequencer.icache = self.L1cache
+        self.sequencer.dcache = self.L1cache
+        self.sequencer.ruby_system = ruby_system
+        self.sequencer.support_data_reqs = False
+        self.sequencer.is_cpu_sequencer = False
+
+        self.ruby_system = ruby_system
+
+        if options.recycle_latency:
+            self.recycle_latency = options.recycle_latency
+
+
+class TCC(RubyCache):
+    # TCC (Texture Cache per Channel): the GPU's shared L2, banked across
+    # options.num_tccs controllers.  Class defaults below are overridden in
+    # create() from the command-line options.
+    size = MemorySize("256kB")
+    assoc = 16
+    dataAccessLatency = 8
+    tagAccessLatency = 2
+    resourceStalls = True
+    def create(self, options):
+        self.assoc = options.tcc_assoc
+        if hasattr(options, 'bw_scalor') and options.bw_scalor > 0:
+            # Bandwidth-scaled configs size the TCC at 128kB per CU.
+            s = options.num_compute_units
+            tcc_size = s * 128
+            tcc_size = str(tcc_size)+'kB'
+            self.size = MemorySize(tcc_size)
+            self.dataArrayBanks = 64
+            self.tagArrayBanks = 64
+        else:
+            self.size = MemorySize(options.tcc_size)
+            self.dataArrayBanks = 256 / options.num_tccs #number of data banks
+            self.tagArrayBanks = 256 / options.num_tccs #number of tag banks
+        # Each TCC bank holds 1/num_tccs of the aggregate size, floored so a
+        # bank never drops below 128 bytes per way.
+        self.size.value = self.size.value / options.num_tccs
+        if ((self.size.value / long(self.assoc)) < 128):
+            self.size.value = long(128 * self.assoc)
+        # Interleave banks above the cacheline and TCC-select bits.
+        self.start_index_bit = math.log(options.cacheline_size, 2) + \
+                               math.log(options.num_tccs, 2)
+        self.replacement_policy = PseudoLRUReplacementPolicy()
+
+
+
+
+class TCCCntrl(TCC_Controller, CntrlBase):
+    # One controller per TCC (GPU L2) bank.
+    def create(self, options, ruby_system, system):
+        self.version = self.versionCount()
+        self.L2cache = TCC()
+        self.L2cache.create(options)
+        self.L2cache.resourceStalls = options.no_tcc_resource_stalls
+
+        self.ruby_system = ruby_system
+
+        if options.recycle_latency:
+            self.recycle_latency = options.recycle_latency
+
+
+class L3Cache(RubyCache):
+    # Directory-side L3, one slice per directory controller.
+    dataArrayBanks = 16
+    tagArrayBanks = 16
+
+    def create(self, options, ruby_system, system):
+        self.size = MemorySize(options.l3_size)
+        self.size.value /= options.num_dirs
+        self.assoc = options.l3_assoc
+        # NOTE(review): the bank counts are divided by num_dirs twice (net
+        # effect /num_dirs^2) while size is divided once — looks like a
+        # copy/paste duplication; confirm against the intended banking.
+        self.dataArrayBanks /= options.num_dirs
+        self.tagArrayBanks /= options.num_dirs
+        self.dataArrayBanks /= options.num_dirs
+        self.tagArrayBanks /= options.num_dirs
+        self.dataAccessLatency = options.l3_data_latency
+        self.tagAccessLatency = options.l3_tag_latency
+        self.resourceStalls = False
+        self.replacement_policy = PseudoLRUReplacementPolicy()
+
+
+class L3Cntrl(L3Cache_Controller, CntrlBase):
+    # Standalone L3 slice controller; wire buffers connect it to its
+    # companion directory controller.
+    def create(self, options, ruby_system, system):
+        self.version = self.versionCount()
+        self.L3cache = L3Cache()
+        self.L3cache.create(options, ruby_system, system)
+
+        # Response latency is bounded by the slower of the two arrays.
+        self.l3_response_latency = max(self.L3cache.dataAccessLatency, self.L3cache.tagAccessLatency)
+        self.ruby_system = ruby_system
+
+        if options.recycle_latency:
+            self.recycle_latency = options.recycle_latency
+
+    def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir,
+                           req_to_l3, probe_to_l3, resp_to_l3):
+        # Point-to-point wire buffers between this L3 slice and its
+        # directory (both directions).
+        self.reqToDir = req_to_dir
+        self.respToDir = resp_to_dir
+        self.l3UnblockToDir = l3_unblock_to_dir
+        self.reqToL3 = req_to_l3
+        self.probeToL3 = probe_to_l3
+        self.respToL3 = resp_to_l3
+
+
+class DirMem(RubyDirectoryMemory, CntrlBase):
+    # Directory memory sized as an equal share of physical memory per
+    # directory controller.
+    def create(self, options, ruby_system, system):
+        self.version = self.versionCount()
+
+        phys_mem_size = AddrRange(options.mem_size).size()
+        mem_module_size = phys_mem_size / options.num_dirs
+        dir_size = MemorySize('0B')
+        dir_size.value = mem_module_size
+        self.size = dir_size
+
+
+class DirCntrl(Directory_Controller, CntrlBase):
+    # Directory controller that also owns an L3 slice (L3CacheMemory).
+    def create(self, options, ruby_system, system):
+        self.version = self.versionCount()
+
+        self.response_latency = 30
+
+        self.directory = DirMem()
+        self.directory.create(options, ruby_system, system)
+
+        self.L3CacheMemory = L3Cache()
+        self.L3CacheMemory.create(options, ruby_system, system)
+
+        # Hit latency is bounded by the slower of the two L3 arrays.
+        self.l3_hit_latency = max(self.L3CacheMemory.dataAccessLatency,
+                                  self.L3CacheMemory.tagAccessLatency)
+
+        self.number_of_TBEs = options.num_tbes
+
+        self.ruby_system = ruby_system
+
+        if options.recycle_latency:
+            self.recycle_latency = options.recycle_latency
+
+    def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir,
+                           req_to_l3, probe_to_l3, resp_to_l3):
+        # Point-to-point wire buffers between this directory and its L3.
+        self.reqToDir = req_to_dir
+        self.respToDir = resp_to_dir
+        self.l3UnblockToDir = l3_unblock_to_dir
+        self.reqToL3 = req_to_l3
+        self.probeToL3 = probe_to_l3
+        self.respToL3 = resp_to_l3
+
+
+def define_options(parser):
+ parser.add_option("--num-subcaches", type = "int", default = 4)
+ parser.add_option("--l3-data-latency", type = "int", default = 20)
+ parser.add_option("--l3-tag-latency", type = "int", default = 15)
+ parser.add_option("--cpu-to-dir-latency", type = "int", default = 120)
+ parser.add_option("--gpu-to-dir-latency", type = "int", default = 120)
+ parser.add_option("--no-resource-stalls", action = "store_false",
+ default = True)
+ parser.add_option("--no-tcc-resource-stalls", action = "store_false",
+ default = True)
+ parser.add_option("--use-L3-on-WT", action = "store_true", default = False)
+ parser.add_option("--num-tbes", type = "int", default = 256)
+ parser.add_option("--l2-latency", type = "int", default = 50) # load to use
+ parser.add_option("--num-tccs", type = "int", default = 1,
+ help = "number of TCC banks in the GPU")
+ parser.add_option("--sqc-size", type = 'string', default = '32kB',
+ help = "SQC cache size")
+ parser.add_option("--sqc-assoc", type = 'int', default = 8,
+ help = "SQC cache assoc")
+ parser.add_option("--WB_L1", action = "store_true", default = False,
+ help = "writeback L1")
+ parser.add_option("--WB_L2", action = "store_true", default = False,
+ help = "writeback L2")
+ parser.add_option("--TCP_latency", type = "int", default = 4,
+ help = "TCP latency")
+ parser.add_option("--TCC_latency", type = "int", default = 16,
+ help = "TCC latency")
+ parser.add_option("--tcc-size", type = 'string', default = '256kB',
+ help = "agregate tcc size")
+ parser.add_option("--tcc-assoc", type = 'int', default = 16,
+ help = "tcc assoc")
+ parser.add_option("--tcp-size", type = 'string', default = '16kB',
+ help = "tcp size")
+ parser.add_option("--tcp-assoc", type = 'int', default = 16,
+ help = "tcp assoc")
+ parser.add_option("--noL1", action = "store_true", default = False,
+ help = "bypassL1")
+
+def create_system(options, full_system, system, dma_devices, ruby_system):
+    """Build the GPU_VIPER Ruby system: directory controllers in a main
+    cluster, CorePair (CPU) controllers in a CPU cluster, and TCP/SQC/TCC
+    controllers in a GPU cluster.  Returns (cpu_sequencers,
+    dir_cntrl_nodes, mainCluster) for the caller's topology construction."""
+    if buildEnv['PROTOCOL'] != 'GPU_VIPER':
+        panic("This script requires the GPU_VIPER protocol to be built.")
+
+    cpu_sequencers = []
+
+    #
+    # The ruby network creation expects the list of nodes in the system to be
+    # consistent with the NetDest list.  Therefore the l1 controller nodes
+    # must be listed before the directory nodes and directory nodes before
+    # dma nodes, etc.
+    #
+    cp_cntrl_nodes = []
+    tcp_cntrl_nodes = []
+    sqc_cntrl_nodes = []
+    tcc_cntrl_nodes = []
+    dir_cntrl_nodes = []
+    l3_cntrl_nodes = []
+
+    #
+    # Must create the individual controllers before the network to ensure the
+    # controller constructors are called before the network constructor
+    #
+
+    # For an odd number of CPUs, still create the right number of controllers
+    TCC_bits = int(math.log(options.num_tccs, 2))
+
+    # This is the base crossbar that connects the L3s, Dirs, and cpu/gpu
+    # Clusters
+    crossbar_bw = None
+    mainCluster = None
+    if hasattr(options, 'bw_scalor') and options.bw_scalor > 0:
+        #Assuming a 2GHz clock
+        crossbar_bw = 16 * options.num_compute_units * options.bw_scalor
+        mainCluster = Cluster(intBW=crossbar_bw)
+    else:
+        mainCluster = Cluster(intBW=8) # 16 GB/s
+    for i in xrange(options.num_dirs):
+
+        dir_cntrl = DirCntrl(noTCCdir = True, TCC_select_num_bits = TCC_bits)
+        dir_cntrl.create(options, ruby_system, system)
+        dir_cntrl.number_of_TBEs = options.num_tbes
+        dir_cntrl.useL3OnWT = options.use_L3_on_WT
+        # the number_of_TBEs is inclusive of TBEs below
+
+        # Connect the Directory controller to the ruby network
+        dir_cntrl.requestFromCores = MessageBuffer(ordered = True)
+        dir_cntrl.requestFromCores.slave = ruby_system.network.master
+
+        dir_cntrl.responseFromCores = MessageBuffer()
+        dir_cntrl.responseFromCores.slave = ruby_system.network.master
+
+        dir_cntrl.unblockFromCores = MessageBuffer()
+        dir_cntrl.unblockFromCores.slave = ruby_system.network.master
+
+        dir_cntrl.probeToCore = MessageBuffer()
+        dir_cntrl.probeToCore.master = ruby_system.network.slave
+
+        dir_cntrl.responseToCore = MessageBuffer()
+        dir_cntrl.responseToCore.master = ruby_system.network.slave
+
+        dir_cntrl.triggerQueue = MessageBuffer(ordered = True)
+        dir_cntrl.L3triggerQueue = MessageBuffer(ordered = True)
+        dir_cntrl.responseFromMemory = MessageBuffer()
+
+        # exec gives each controller a uniquely-named attribute on the
+        # parent SimObject so it is instantiated by m5.
+        exec("ruby_system.dir_cntrl%d = dir_cntrl" % i)
+        dir_cntrl_nodes.append(dir_cntrl)
+
+        mainCluster.add(dir_cntrl)
+
+    cpuCluster = None
+    if hasattr(options, 'bw_scalor') and options.bw_scalor > 0:
+        cpuCluster = Cluster(extBW = crossbar_bw, intBW = crossbar_bw)
+    else:
+        cpuCluster = Cluster(extBW = 8, intBW = 8) # 16 GB/s
+    # One CorePair controller serves two CPUs, hence (num_cpus + 1) / 2.
+    for i in xrange((options.num_cpus + 1) / 2):
+
+        cp_cntrl = CPCntrl()
+        cp_cntrl.create(options, ruby_system, system)
+
+        exec("ruby_system.cp_cntrl%d = cp_cntrl" % i)
+        #
+        # Add controllers and sequencers to the appropriate lists
+        #
+        cpu_sequencers.extend([cp_cntrl.sequencer, cp_cntrl.sequencer1])
+
+        # Connect the CP controllers and the network
+        cp_cntrl.requestFromCore = MessageBuffer()
+        cp_cntrl.requestFromCore.master = ruby_system.network.slave
+
+        cp_cntrl.responseFromCore = MessageBuffer()
+        cp_cntrl.responseFromCore.master = ruby_system.network.slave
+
+        cp_cntrl.unblockFromCore = MessageBuffer()
+        cp_cntrl.unblockFromCore.master = ruby_system.network.slave
+
+        cp_cntrl.probeToCore = MessageBuffer()
+        cp_cntrl.probeToCore.slave = ruby_system.network.master
+
+        cp_cntrl.responseToCore = MessageBuffer()
+        cp_cntrl.responseToCore.slave = ruby_system.network.master
+
+        cp_cntrl.mandatoryQueue = MessageBuffer()
+        cp_cntrl.triggerQueue = MessageBuffer(ordered = True)
+
+        cpuCluster.add(cp_cntrl)
+
+    gpuCluster = None
+    if hasattr(options, 'bw_scalor') and options.bw_scalor > 0:
+        gpuCluster = Cluster(extBW = crossbar_bw, intBW = crossbar_bw)
+    else:
+        gpuCluster = Cluster(extBW = 8, intBW = 8) # 16 GB/s
+    for i in xrange(options.num_compute_units):
+
+        tcp_cntrl = TCPCntrl(TCC_select_num_bits = TCC_bits,
+                             issue_latency = 1,
+                             number_of_TBEs = 2560)
+        # TBEs set to max outstanding requests
+        tcp_cntrl.create(options, ruby_system, system)
+        tcp_cntrl.WB = options.WB_L1
+        tcp_cntrl.disableL1 = options.noL1
+        tcp_cntrl.L1cache.tagAccessLatency = options.TCP_latency
+        tcp_cntrl.L1cache.dataAccessLatency = options.TCP_latency
+
+        exec("ruby_system.tcp_cntrl%d = tcp_cntrl" % i)
+        #
+        # Add controllers and sequencers to the appropriate lists
+        #
+        # GPU TCPs contribute their coalescer (not the sequencer).
+        cpu_sequencers.append(tcp_cntrl.coalescer)
+        tcp_cntrl_nodes.append(tcp_cntrl)
+
+        # Connect the TCP controller to the ruby network
+        tcp_cntrl.requestFromTCP = MessageBuffer(ordered = True)
+        tcp_cntrl.requestFromTCP.master = ruby_system.network.slave
+
+        tcp_cntrl.responseFromTCP = MessageBuffer(ordered = True)
+        tcp_cntrl.responseFromTCP.master = ruby_system.network.slave
+
+        tcp_cntrl.unblockFromCore = MessageBuffer()
+        tcp_cntrl.unblockFromCore.master = ruby_system.network.slave
+
+        tcp_cntrl.probeToTCP = MessageBuffer(ordered = True)
+        tcp_cntrl.probeToTCP.slave = ruby_system.network.master
+
+        tcp_cntrl.responseToTCP = MessageBuffer(ordered = True)
+        tcp_cntrl.responseToTCP.slave = ruby_system.network.master
+
+        tcp_cntrl.mandatoryQueue = MessageBuffer()
+
+        gpuCluster.add(tcp_cntrl)
+
+    for i in xrange(options.num_sqc):
+
+        sqc_cntrl = SQCCntrl(TCC_select_num_bits = TCC_bits)
+        sqc_cntrl.create(options, ruby_system, system)
+
+        exec("ruby_system.sqc_cntrl%d = sqc_cntrl" % i)
+        #
+        # Add controllers and sequencers to the appropriate lists
+        #
+        cpu_sequencers.append(sqc_cntrl.sequencer)
+
+        # Connect the SQC controller to the ruby network
+        sqc_cntrl.requestFromSQC = MessageBuffer(ordered = True)
+        sqc_cntrl.requestFromSQC.master = ruby_system.network.slave
+
+        sqc_cntrl.probeToSQC = MessageBuffer(ordered = True)
+        sqc_cntrl.probeToSQC.slave = ruby_system.network.master
+
+        sqc_cntrl.responseToSQC = MessageBuffer(ordered = True)
+        sqc_cntrl.responseToSQC.slave = ruby_system.network.master
+
+        sqc_cntrl.mandatoryQueue = MessageBuffer()
+
+        # SQC also in GPU cluster
+        gpuCluster.add(sqc_cntrl)
+
+    # Command processors: one TCP-style and one SQC-style controller each,
+    # numbered after the compute-unit controllers.
+    for i in xrange(options.numCPs):
+
+        tcp_ID = options.num_compute_units + i
+        sqc_ID = options.num_sqc + i
+
+        tcp_cntrl = TCPCntrl(TCC_select_num_bits = TCC_bits,
+                             issue_latency = 1,
+                             number_of_TBEs = 2560)
+        # TBEs set to max outstanding requests
+        tcp_cntrl.createCP(options, ruby_system, system)
+        tcp_cntrl.WB = options.WB_L1
+        tcp_cntrl.disableL1 = options.noL1
+        tcp_cntrl.L1cache.tagAccessLatency = options.TCP_latency
+        tcp_cntrl.L1cache.dataAccessLatency = options.TCP_latency
+
+        exec("ruby_system.tcp_cntrl%d = tcp_cntrl" % tcp_ID)
+        #
+        # Add controllers and sequencers to the appropriate lists
+        #
+        # CP variant uses the plain sequencer (see TCPCntrl.createCP).
+        cpu_sequencers.append(tcp_cntrl.sequencer)
+        tcp_cntrl_nodes.append(tcp_cntrl)
+
+        # Connect the CP (TCP) controllers to the ruby network
+        tcp_cntrl.requestFromTCP = MessageBuffer(ordered = True)
+        tcp_cntrl.requestFromTCP.master = ruby_system.network.slave
+
+        tcp_cntrl.responseFromTCP = MessageBuffer(ordered = True)
+        tcp_cntrl.responseFromTCP.master = ruby_system.network.slave
+
+        tcp_cntrl.unblockFromCore = MessageBuffer(ordered = True)
+        tcp_cntrl.unblockFromCore.master = ruby_system.network.slave
+
+        tcp_cntrl.probeToTCP = MessageBuffer(ordered = True)
+        tcp_cntrl.probeToTCP.slave = ruby_system.network.master
+
+        tcp_cntrl.responseToTCP = MessageBuffer(ordered = True)
+        tcp_cntrl.responseToTCP.slave = ruby_system.network.master
+
+        tcp_cntrl.mandatoryQueue = MessageBuffer()
+
+        gpuCluster.add(tcp_cntrl)
+
+        sqc_cntrl = SQCCntrl(TCC_select_num_bits = TCC_bits)
+        sqc_cntrl.create(options, ruby_system, system)
+
+        exec("ruby_system.sqc_cntrl%d = sqc_cntrl" % sqc_ID)
+        #
+        # Add controllers and sequencers to the appropriate lists
+        #
+        cpu_sequencers.append(sqc_cntrl.sequencer)
+
+        # SQC also in GPU cluster
+        gpuCluster.add(sqc_cntrl)
+
+    for i in xrange(options.num_tccs):
+
+        tcc_cntrl = TCCCntrl(l2_response_latency = options.TCC_latency)
+        tcc_cntrl.create(options, ruby_system, system)
+        tcc_cntrl.l2_request_latency = options.gpu_to_dir_latency
+        tcc_cntrl.l2_response_latency = options.TCC_latency
+        tcc_cntrl_nodes.append(tcc_cntrl)
+        tcc_cntrl.WB = options.WB_L2
+        tcc_cntrl.number_of_TBEs = 2560 * options.num_compute_units
+        # the number_of_TBEs is inclusive of TBEs below
+
+        # Connect the TCC controllers to the ruby network
+        tcc_cntrl.requestFromTCP = MessageBuffer(ordered = True)
+        tcc_cntrl.requestFromTCP.slave = ruby_system.network.master
+
+        tcc_cntrl.responseToCore = MessageBuffer(ordered = True)
+        tcc_cntrl.responseToCore.master = ruby_system.network.slave
+
+        tcc_cntrl.probeFromNB = MessageBuffer()
+        tcc_cntrl.probeFromNB.slave = ruby_system.network.master
+
+        tcc_cntrl.responseFromNB = MessageBuffer()
+        tcc_cntrl.responseFromNB.slave = ruby_system.network.master
+
+        tcc_cntrl.requestToNB = MessageBuffer(ordered = True)
+        tcc_cntrl.requestToNB.master = ruby_system.network.slave
+
+        tcc_cntrl.responseToNB = MessageBuffer()
+        tcc_cntrl.responseToNB.master = ruby_system.network.slave
+
+        tcc_cntrl.unblockToNB = MessageBuffer()
+        tcc_cntrl.unblockToNB.master = ruby_system.network.slave
+
+        tcc_cntrl.triggerQueue = MessageBuffer(ordered = True)
+
+        exec("ruby_system.tcc_cntrl%d = tcc_cntrl" % i)
+
+        # connect all of the wire buffers between L3 and dirs up
+        # TCC cntrls added to the GPU cluster
+        gpuCluster.add(tcc_cntrl)
+
+    # Assuming no DMA devices
+    assert(len(dma_devices) == 0)
+
+    # Add cpu/gpu clusters to main cluster
+    mainCluster.add(cpuCluster)
+    mainCluster.add(gpuCluster)
+
+    ruby_system.network.number_of_virtual_networks = 10
+
+    return (cpu_sequencers, dir_cntrl_nodes, mainCluster)
diff --git a/configs/ruby/GPU_VIPER_Baseline.py b/configs/ruby/GPU_VIPER_Baseline.py
new file mode 100644
index 000000000..879b34e88
--- /dev/null
+++ b/configs/ruby/GPU_VIPER_Baseline.py
@@ -0,0 +1,588 @@
+#
+# Copyright (c) 2015 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Author: Sooraj Puthoor
+#
+
+import math
+import m5
+from m5.objects import *
+from m5.defines import buildEnv
+from Ruby import create_topology
+from Ruby import send_evicts
+
+from Cluster import Cluster
+from Crossbar import Crossbar
+
+class CntrlBase:
+    # Mixin providing monotonic ID counters for Ruby controllers.
+    # Sequencer and controller counts are global (stored on CntrlBase);
+    # version counts are per subclass (stored on cls).
+    _seqs = 0
+    @classmethod
+    def seqCount(cls):
+        # Use SeqCount not class since we need global count
+        CntrlBase._seqs += 1
+        return CntrlBase._seqs - 1
+
+    _cntrls = 0
+    @classmethod
+    def cntrlCount(cls):
+        # Use CntlCount not class since we need global count
+        CntrlBase._cntrls += 1
+        return CntrlBase._cntrls - 1
+
+    _version = 0
+    @classmethod
+    def versionCount(cls):
+        cls._version += 1 # Use count for this particular type
+        return cls._version - 1
+
+
+class L1Cache(RubyCache):
+    # CPU-side L1 (shared definition for I, D0 and D1 caches).
+    resourceStalls = False
+    dataArrayBanks = 2
+    tagArrayBanks = 2
+    dataAccessLatency = 1
+    tagAccessLatency = 1
+    def create(self, size, assoc, options):
+        # size/assoc are passed explicitly so one class serves I and D.
+        self.size = MemorySize(size)
+        self.assoc = assoc
+        self.replacement_policy = PseudoLRUReplacementPolicy()
+
+
+class L2Cache(RubyCache):
+    # CPU-side L2, shared by a CorePair.
+    resourceStalls = False
+    assoc = 16
+    dataArrayBanks = 16
+    tagArrayBanks = 16
+    def create(self, size, assoc, options):
+        self.size = MemorySize(size)
+        self.assoc = assoc
+        self.replacement_policy = PseudoLRUReplacementPolicy()
+
+
+class CPCntrl(CorePair_Controller, CntrlBase):
+    # CorePair controller: two CPU cores sharing an L1I and an L2, each
+    # with a private L1D and its own sequencer.
+
+    def create(self, options, ruby_system, system):
+        self.version = self.versionCount()
+
+        self.L1Icache = L1Cache()
+        self.L1Icache.create(options.l1i_size, options.l1i_assoc, options)
+        self.L1D0cache = L1Cache()
+        self.L1D0cache.create(options.l1d_size, options.l1d_assoc, options)
+        self.L1D1cache = L1Cache()
+        self.L1D1cache.create(options.l1d_size, options.l1d_assoc, options)
+        self.L2cache = L2Cache()
+        self.L2cache.create(options.l2_size, options.l2_assoc, options)
+
+        # Sequencer for core 0 (shared icache, private L1D0).
+        self.sequencer = RubySequencer()
+        self.sequencer.version = self.seqCount()
+        self.sequencer.icache = self.L1Icache
+        self.sequencer.dcache = self.L1D0cache
+        self.sequencer.ruby_system = ruby_system
+        self.sequencer.coreid = 0
+        self.sequencer.is_cpu_sequencer = True
+
+        # Sequencer for core 1 (shared icache, private L1D1).
+        self.sequencer1 = RubySequencer()
+        self.sequencer1.version = self.seqCount()
+        self.sequencer1.icache = self.L1Icache
+        self.sequencer1.dcache = self.L1D1cache
+        self.sequencer1.ruby_system = ruby_system
+        self.sequencer1.coreid = 1
+        self.sequencer1.is_cpu_sequencer = True
+
+        self.issue_latency = options.cpu_to_dir_latency
+        self.send_evictions = send_evicts(options)
+
+        self.ruby_system = ruby_system
+
+        if options.recycle_latency:
+            self.recycle_latency = options.recycle_latency
+
+
+class TCPCache(RubyCache):
+    # GPU L1 data cache for the Baseline protocol.  Class-level defaults
+    # are immediately overwritten in create().
+    size = "16kB"
+    assoc = 16
+    dataArrayBanks = 16
+    tagArrayBanks = 16
+    dataAccessLatency = 4
+    tagAccessLatency = 1
+    def create(self, options):
+        self.size = MemorySize(options.tcp_size)
+        self.dataArrayBanks = 16
+        self.tagArrayBanks = 16
+        self.dataAccessLatency = 4
+        self.tagAccessLatency = 1
+        # NOTE(review): a TCP (L1) cache reading the TCC resource-stall
+        # option — possibly a copy/paste slip for no_resource_stalls;
+        # confirm before changing.
+        self.resourceStalls = options.no_tcc_resource_stalls
+        self.replacement_policy = PseudoLRUReplacementPolicy()
+
+
+class TCPCntrl(TCP_Controller, CntrlBase):
+    # Baseline-protocol TCP controller: coalescer-driven GPU L1
+    # (use_seq_not_coal = False; the sequencer is unused here).
+
+    def create(self, options, ruby_system, system):
+        self.version = self.versionCount()
+        self.L1cache = TCPCache()
+        self.L1cache.create(options)
+        self.issue_latency = 1
+
+        # GPU requests are coalesced before entering Ruby.
+        self.coalescer = VIPERCoalescer()
+        self.coalescer.version = self.seqCount()
+        self.coalescer.icache = self.L1cache
+        self.coalescer.dcache = self.L1cache
+        self.coalescer.ruby_system = ruby_system
+        self.coalescer.support_inst_reqs = False
+        self.coalescer.is_cpu_sequencer = False
+
+        self.sequencer = RubySequencer()
+        self.sequencer.version = self.seqCount()
+        self.sequencer.icache = self.L1cache
+        self.sequencer.dcache = self.L1cache
+        self.sequencer.ruby_system = ruby_system
+        self.sequencer.is_cpu_sequencer = True
+
+        self.use_seq_not_coal = False
+
+        self.ruby_system = ruby_system
+        if options.recycle_latency:
+            self.recycle_latency = options.recycle_latency
+
+
+class SQCCache(RubyCache):
+    # GPU shared L1 instruction cache (Baseline protocol).
+    dataArrayBanks = 8
+    tagArrayBanks = 8
+    dataAccessLatency = 1
+    tagAccessLatency = 1
+
+    def create(self, options):
+        # Geometry from --sqc-size / --sqc-assoc.
+        self.size = MemorySize(options.sqc_size)
+        self.assoc = options.sqc_assoc
+        self.replacement_policy = PseudoLRUReplacementPolicy()
+
+
+class SQCCntrl(SQC_Controller, CntrlBase):
+    # Instruction-only SQC controller (data requests disabled).
+
+    def create(self, options, ruby_system, system):
+        self.version = self.versionCount()
+        self.L1cache = SQCCache()
+        self.L1cache.create(options)
+        self.L1cache.resourceStalls = False
+        self.sequencer = RubySequencer()
+        self.sequencer.version = self.seqCount()
+        self.sequencer.icache = self.L1cache
+        self.sequencer.dcache = self.L1cache
+        self.sequencer.ruby_system = ruby_system
+        self.sequencer.support_data_reqs = False
+        self.sequencer.is_cpu_sequencer = False
+        self.ruby_system = ruby_system
+        if options.recycle_latency:
+            self.recycle_latency = options.recycle_latency
+
+
+class TCC(RubyCache):
+    # GPU shared L2 (Baseline protocol), banked across num_tccs
+    # controllers; defaults below are overridden in create().
+    size = MemorySize("256kB")
+    assoc = 16
+    dataAccessLatency = 8
+    tagAccessLatency = 2
+    resourceStalls = True
+    def create(self, options):
+        self.assoc = options.tcc_assoc
+        if hasattr(options, 'bw_scalor') and options.bw_scalor > 0:
+            # Bandwidth-scaled configs size the TCC at 128kB per CU.
+            s = options.num_compute_units
+            tcc_size = s * 128
+            tcc_size = str(tcc_size)+'kB'
+            self.size = MemorySize(tcc_size)
+            self.dataArrayBanks = 64
+            self.tagArrayBanks = 64
+        else:
+            self.size = MemorySize(options.tcc_size)
+            self.dataArrayBanks = 256 / options.num_tccs #number of data banks
+            self.tagArrayBanks = 256 / options.num_tccs #number of tag banks
+        # Per-bank share of the aggregate size, floored at 128B per way.
+        self.size.value = self.size.value / options.num_tccs
+        if ((self.size.value / long(self.assoc)) < 128):
+            self.size.value = long(128 * self.assoc)
+        # Interleave above the cacheline and TCC-select bits.
+        self.start_index_bit = math.log(options.cacheline_size, 2) + \
+                               math.log(options.num_tccs, 2)
+        self.replacement_policy = PseudoLRUReplacementPolicy()
+
+
+class TCCCntrl(TCC_Controller, CntrlBase):
+    # One controller per TCC (GPU L2) bank.
+    def create(self, options, ruby_system, system):
+        self.version = self.versionCount()
+        self.L2cache = TCC()
+        self.L2cache.create(options)
+        self.ruby_system = ruby_system
+        self.L2cache.resourceStalls = options.no_tcc_resource_stalls
+
+        if options.recycle_latency:
+            self.recycle_latency = options.recycle_latency
+
+
+class L3Cache(RubyCache):
+    # Directory-side L3, one slice per directory controller.
+    dataArrayBanks = 16
+    tagArrayBanks = 16
+
+    def create(self, options, ruby_system, system):
+        self.size = MemorySize(options.l3_size)
+        self.size.value /= options.num_dirs
+        self.assoc = options.l3_assoc
+        # NOTE(review): bank counts divided by num_dirs twice (net
+        # /num_dirs^2) while size is divided once — looks like a copy/paste
+        # duplication; confirm against the intended banking.
+        self.dataArrayBanks /= options.num_dirs
+        self.tagArrayBanks /= options.num_dirs
+        self.dataArrayBanks /= options.num_dirs
+        self.tagArrayBanks /= options.num_dirs
+        self.dataAccessLatency = options.l3_data_latency
+        self.tagAccessLatency = options.l3_tag_latency
+        self.resourceStalls = False
+        self.replacement_policy = PseudoLRUReplacementPolicy()
+
+
+class ProbeFilter(RubyCache):
+    # Directory probe filter, modeled as a RubyCache tracking regions of
+    # options.blocks_per_region cachelines.
+    size = "4MB"
+    assoc = 16
+    dataArrayBanks = 256
+    tagArrayBanks = 256
+
+    def create(self, options, ruby_system, system):
+        # One filter "block" covers a whole region of 64B lines.
+        self.block_size = "%dB" % (64 * options.blocks_per_region)
+        # NOTE(review): size = entries * block_size * num_compute_units —
+        # relies on the param system treating block_size as a byte count;
+        # confirm the resulting capacity is as intended.
+        self.size = options.region_dir_entries * \
+            self.block_size * options.num_compute_units
+        self.assoc = 8
+        self.tagArrayBanks = 8
+        self.tagAccessLatency = options.dir_tag_latency
+        self.dataAccessLatency = 1
+        self.resourceStalls = options.no_resource_stalls
+        # Index above the region offset (6 line bits + region bits).
+        self.start_index_bit = 6 + int(math.log(options.blocks_per_region, 2))
+        self.replacement_policy = PseudoLRUReplacementPolicy()
+
+
+class L3Cntrl(L3Cache_Controller, CntrlBase):
+    # Standalone L3 slice controller; wire buffers connect it to its
+    # companion directory controller.
+    def create(self, options, ruby_system, system):
+        self.version = self.versionCount()
+        self.L3cache = L3Cache()
+        self.L3cache.create(options, ruby_system, system)
+        # Response latency bounded by the slower of the two arrays.
+        self.l3_response_latency = \
+            max(self.L3cache.dataAccessLatency, self.L3cache.tagAccessLatency)
+        self.ruby_system = ruby_system
+        if options.recycle_latency:
+            self.recycle_latency = options.recycle_latency
+
+    def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir,
+                           req_to_l3, probe_to_l3, resp_to_l3):
+        # Point-to-point wire buffers to/from the directory.
+        self.reqToDir = req_to_dir
+        self.respToDir = resp_to_dir
+        self.l3UnblockToDir = l3_unblock_to_dir
+        self.reqToL3 = req_to_l3
+        self.probeToL3 = probe_to_l3
+        self.respToL3 = resp_to_l3
+
+
+class DirMem(RubyDirectoryMemory, CntrlBase):
+    # Directory memory sized as an equal share of physical memory per
+    # directory controller.
+    def create(self, options, ruby_system, system):
+        self.version = self.versionCount()
+
+        phys_mem_size = AddrRange(options.mem_size).size()
+        mem_module_size = phys_mem_size / options.num_dirs
+        dir_size = MemorySize('0B')
+        dir_size.value = mem_module_size
+        self.size = dir_size
+
+
+class DirCntrl(Directory_Controller, CntrlBase):
+    # Directory controller owning an L3 slice and a probe filter
+    # (the Baseline protocol's addition over plain GPU_VIPER).
+    def create(self, options, ruby_system, system):
+        self.version = self.versionCount()
+        self.response_latency = 30
+        self.directory = DirMem()
+        self.directory.create(options, ruby_system, system)
+        self.L3CacheMemory = L3Cache()
+        self.L3CacheMemory.create(options, ruby_system, system)
+        self.ProbeFilterMemory = ProbeFilter()
+        self.ProbeFilterMemory.create(options, ruby_system, system)
+        # Hit latency bounded by the slower of the two L3 arrays.
+        self.l3_hit_latency = \
+            max(self.L3CacheMemory.dataAccessLatency,
+            self.L3CacheMemory.tagAccessLatency)
+
+        self.ruby_system = ruby_system
+        if options.recycle_latency:
+            self.recycle_latency = options.recycle_latency
+
+    def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir,
+                           req_to_l3, probe_to_l3, resp_to_l3):
+        # Point-to-point wire buffers to/from the L3 slice.
+        self.reqToDir = req_to_dir
+        self.respToDir = resp_to_dir
+        self.l3UnblockToDir = l3_unblock_to_dir
+        self.reqToL3 = req_to_l3
+        self.probeToL3 = probe_to_l3
+        self.respToL3 = resp_to_l3
+
+
+def define_options(parser):
+ parser.add_option("--num-subcaches", type = "int", default = 4)
+ parser.add_option("--l3-data-latency", type = "int", default = 20)
+ parser.add_option("--l3-tag-latency", type = "int", default = 15)
+ parser.add_option("--cpu-to-dir-latency", type = "int", default = 120)
+ parser.add_option("--gpu-to-dir-latency", type = "int", default = 120)
+ parser.add_option("--no-resource-stalls", action = "store_false",
+ default = True)
+ parser.add_option("--no-tcc-resource-stalls", action = "store_false",
+ default = True)
+ parser.add_option("--num-tbes", type = "int", default = 2560)
+ parser.add_option("--l2-latency", type = "int", default = 50) # load to use
+ parser.add_option("--num-tccs", type = "int", default = 1,
+ help = "number of TCC banks in the GPU")
+ parser.add_option("--sqc-size", type = 'string', default = '32kB',
+ help = "SQC cache size")
+ parser.add_option("--sqc-assoc", type = 'int', default = 8,
+ help = "SQC cache assoc")
+ parser.add_option("--region-dir-entries", type = "int", default = 8192)
+ parser.add_option("--dir-tag-latency", type = "int", default = 8)
+ parser.add_option("--dir-tag-banks", type = "int", default = 4)
+ parser.add_option("--blocks-per-region", type = "int", default = 1)
+ parser.add_option("--use-L3-on-WT", action = "store_true", default = False)
+ parser.add_option("--nonInclusiveDir", action = "store_true",
+ default = False)
+ parser.add_option("--WB_L1", action = "store_true",
+ default = False, help = "writeback L2")
+ parser.add_option("--WB_L2", action = "store_true",
+ default = False, help = "writeback L2")
+ parser.add_option("--TCP_latency", type = "int",
+ default = 4, help = "TCP latency")
+ parser.add_option("--TCC_latency", type = "int",
+ default = 16, help = "TCC latency")
+ parser.add_option("--tcc-size", type = 'string', default = '2MB',
+ help = "agregate tcc size")
+ parser.add_option("--tcc-assoc", type = 'int', default = 16,
+ help = "tcc assoc")
+ parser.add_option("--tcp-size", type = 'string', default = '16kB',
+ help = "tcp size")
+ parser.add_option("--sampler-sets", type = "int", default = 1024)
+ parser.add_option("--sampler-assoc", type = "int", default = 16)
+ parser.add_option("--sampler-counter", type = "int", default = 512)
+ parser.add_option("--noL1", action = "store_true", default = False,
+ help = "bypassL1")
+ parser.add_option("--noL2", action = "store_true", default = False,
+ help = "bypassL2")
+
def create_system(options, full_system, system, dma_devices, ruby_system):
    """Construct the GPU_VIPER_Baseline Ruby system.

    Creates the directory, CorePair (CPU), per-CU TCP/SQC and shared TCC
    controllers, wires each controller's message buffers to the Ruby
    network, and arranges everything into cpu/gpu sub-clusters hanging
    off one main crossbar cluster.

    Returns:
        (cpu_sequencers, dir_cntrl_nodes, mainCluster) -- the tuple
        Ruby.py expects from a protocol config's create_system().
    """
    if buildEnv['PROTOCOL'] != 'GPU_VIPER_Baseline':
        # The adjacent string literals are concatenated; the trailing
        # space is required so the message does not read
        # "...requires theGPU_VIPER_Baseline...".
        panic("This script requires the "
              "GPU_VIPER_Baseline protocol to be built.")

    cpu_sequencers = []

    # The ruby network creation expects the list of nodes in the system
    # to be consistent with the NetDest list. Therefore the l1 controller
    # nodes must be listed before the directory nodes and directory nodes
    # before dma nodes, etc.
    dir_cntrl_nodes = []

    # Must create the individual controllers before the network to ensure
    # the controller constructors are called before the network
    # constructor.

    # Number of address bits used to select a TCC bank.
    TCC_bits = int(math.log(options.num_tccs, 2))

    # This is the base crossbar that connects the L3s, Dirs, and cpu/gpu
    # Clusters
    crossbar_bw = 16 * options.num_compute_units # Assuming a 2GHz clock
    mainCluster = Cluster(intBW = crossbar_bw)
    for i in xrange(options.num_dirs):
        dir_cntrl = DirCntrl(noTCCdir = True, TCC_select_num_bits = TCC_bits)
        dir_cntrl.create(options, ruby_system, system)
        dir_cntrl.number_of_TBEs = options.num_tbes
        dir_cntrl.useL3OnWT = options.use_L3_on_WT
        dir_cntrl.inclusiveDir = not options.nonInclusiveDir

        # Connect the Directory controller to the ruby network
        dir_cntrl.requestFromCores = MessageBuffer(ordered = True)
        dir_cntrl.requestFromCores.slave = ruby_system.network.master

        dir_cntrl.responseFromCores = MessageBuffer()
        dir_cntrl.responseFromCores.slave = ruby_system.network.master

        dir_cntrl.unblockFromCores = MessageBuffer()
        dir_cntrl.unblockFromCores.slave = ruby_system.network.master

        dir_cntrl.probeToCore = MessageBuffer()
        dir_cntrl.probeToCore.master = ruby_system.network.slave

        dir_cntrl.responseToCore = MessageBuffer()
        dir_cntrl.responseToCore.master = ruby_system.network.slave

        dir_cntrl.triggerQueue = MessageBuffer(ordered = True)
        dir_cntrl.L3triggerQueue = MessageBuffer(ordered = True)
        dir_cntrl.responseFromMemory = MessageBuffer()

        # setattr is the safe equivalent of the exec-based child
        # assignment used by the older configs.
        setattr(system, "dir_cntrl%d" % i, dir_cntrl)
        dir_cntrl_nodes.append(dir_cntrl)
        mainCluster.add(dir_cntrl)

    cpuCluster = Cluster(extBW = crossbar_bw, intBW = crossbar_bw)
    # For an odd number of CPUs, still create the right number of
    # CorePair controllers (one per two cores).
    for i in xrange((options.num_cpus + 1) / 2):
        cp_cntrl = CPCntrl()
        cp_cntrl.create(options, ruby_system, system)

        setattr(system, "cp_cntrl%d" % i, cp_cntrl)

        # Add controllers and sequencers to the appropriate lists
        cpu_sequencers.extend([cp_cntrl.sequencer, cp_cntrl.sequencer1])

        # Connect the CP controllers and the network
        cp_cntrl.requestFromCore = MessageBuffer()
        cp_cntrl.requestFromCore.master = ruby_system.network.slave

        cp_cntrl.responseFromCore = MessageBuffer()
        cp_cntrl.responseFromCore.master = ruby_system.network.slave

        cp_cntrl.unblockFromCore = MessageBuffer()
        cp_cntrl.unblockFromCore.master = ruby_system.network.slave

        cp_cntrl.probeToCore = MessageBuffer()
        cp_cntrl.probeToCore.slave = ruby_system.network.master

        cp_cntrl.responseToCore = MessageBuffer()
        cp_cntrl.responseToCore.slave = ruby_system.network.master

        cp_cntrl.mandatoryQueue = MessageBuffer()
        cp_cntrl.triggerQueue = MessageBuffer(ordered = True)

        cpuCluster.add(cp_cntrl)

    gpuCluster = Cluster(extBW = crossbar_bw, intBW = crossbar_bw)
    for i in xrange(options.num_compute_units):
        # TBEs set to max outstanding requests
        tcp_cntrl = TCPCntrl(TCC_select_num_bits = TCC_bits,
                             issue_latency = 1,
                             number_of_TBEs = 2560)
        tcp_cntrl.create(options, ruby_system, system)
        tcp_cntrl.WB = options.WB_L1
        tcp_cntrl.disableL1 = options.noL1

        setattr(system, "tcp_cntrl%d" % i, tcp_cntrl)

        # Add controllers and sequencers to the appropriate lists
        cpu_sequencers.append(tcp_cntrl.coalescer)

        # Connect the TCP controllers to the ruby network
        tcp_cntrl.requestFromTCP = MessageBuffer(ordered = True)
        tcp_cntrl.requestFromTCP.master = ruby_system.network.slave

        tcp_cntrl.responseFromTCP = MessageBuffer(ordered = True)
        tcp_cntrl.responseFromTCP.master = ruby_system.network.slave

        tcp_cntrl.unblockFromCore = MessageBuffer()
        tcp_cntrl.unblockFromCore.master = ruby_system.network.slave

        tcp_cntrl.probeToTCP = MessageBuffer(ordered = True)
        tcp_cntrl.probeToTCP.slave = ruby_system.network.master

        tcp_cntrl.responseToTCP = MessageBuffer(ordered = True)
        tcp_cntrl.responseToTCP.slave = ruby_system.network.master

        tcp_cntrl.mandatoryQueue = MessageBuffer()

        gpuCluster.add(tcp_cntrl)

    for i in xrange(options.num_sqc):
        sqc_cntrl = SQCCntrl(TCC_select_num_bits = TCC_bits)
        sqc_cntrl.create(options, ruby_system, system)

        setattr(system, "sqc_cntrl%d" % i, sqc_cntrl)

        # Add controllers and sequencers to the appropriate lists
        cpu_sequencers.append(sqc_cntrl.sequencer)

        # Connect the SQC controller to the ruby network
        sqc_cntrl.requestFromSQC = MessageBuffer(ordered = True)
        sqc_cntrl.requestFromSQC.master = ruby_system.network.slave

        sqc_cntrl.probeToSQC = MessageBuffer(ordered = True)
        sqc_cntrl.probeToSQC.slave = ruby_system.network.master

        sqc_cntrl.responseToSQC = MessageBuffer(ordered = True)
        sqc_cntrl.responseToSQC.slave = ruby_system.network.master

        sqc_cntrl.mandatoryQueue = MessageBuffer()

        # SQC also in GPU cluster
        gpuCluster.add(sqc_cntrl)

    for i in xrange(options.num_tccs):
        tcc_cntrl = TCCCntrl()
        tcc_cntrl.create(options, ruby_system, system)
        tcc_cntrl.l2_request_latency = options.gpu_to_dir_latency
        tcc_cntrl.l2_response_latency = options.TCC_latency
        tcc_cntrl.WB = options.WB_L2
        tcc_cntrl.number_of_TBEs = 2560 * options.num_compute_units

        # Connect the TCC controllers to the ruby network
        tcc_cntrl.requestFromTCP = MessageBuffer(ordered = True)
        tcc_cntrl.requestFromTCP.slave = ruby_system.network.master

        tcc_cntrl.responseToCore = MessageBuffer(ordered = True)
        tcc_cntrl.responseToCore.master = ruby_system.network.slave

        tcc_cntrl.probeFromNB = MessageBuffer()
        tcc_cntrl.probeFromNB.slave = ruby_system.network.master

        tcc_cntrl.responseFromNB = MessageBuffer()
        tcc_cntrl.responseFromNB.slave = ruby_system.network.master

        tcc_cntrl.requestToNB = MessageBuffer(ordered = True)
        tcc_cntrl.requestToNB.master = ruby_system.network.slave

        tcc_cntrl.responseToNB = MessageBuffer()
        tcc_cntrl.responseToNB.master = ruby_system.network.slave

        tcc_cntrl.unblockToNB = MessageBuffer()
        tcc_cntrl.unblockToNB.master = ruby_system.network.slave

        tcc_cntrl.triggerQueue = MessageBuffer(ordered = True)

        setattr(system, "tcc_cntrl%d" % i, tcc_cntrl)
        # TCC cntrls added to the GPU cluster
        gpuCluster.add(tcc_cntrl)

    # Assuming no DMA devices
    assert(len(dma_devices) == 0)

    # Add cpu/gpu clusters to main cluster
    mainCluster.add(cpuCluster)
    mainCluster.add(gpuCluster)

    ruby_system.network.number_of_virtual_networks = 10

    return (cpu_sequencers, dir_cntrl_nodes, mainCluster)
diff --git a/configs/ruby/GPU_VIPER_Region.py b/configs/ruby/GPU_VIPER_Region.py
new file mode 100644
index 000000000..94cb9b70b
--- /dev/null
+++ b/configs/ruby/GPU_VIPER_Region.py
@@ -0,0 +1,758 @@
+#
+# Copyright (c) 2015 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Author: Sooraj Puthoor
+#
+
+import math
+import m5
+from m5.objects import *
+from m5.defines import buildEnv
+from Ruby import send_evicts
+
+from Cluster import Cluster
+
class CntrlBase:
    """Mixin handing out 0-based IDs to the controller classes below:
    globally unique sequencer and controller counts, plus a per-subclass
    version number."""

    # Sequencer ids are global -- always stored on CntrlBase itself,
    # never on a subclass.
    _seqs = 0

    @classmethod
    def seqCount(cls):
        # Read-then-bump so the first caller gets 0.
        nxt = CntrlBase._seqs
        CntrlBase._seqs = nxt + 1
        return nxt

    # Controller ids are likewise global across every subclass.
    _cntrls = 0

    @classmethod
    def cntrlCount(cls):
        nxt = CntrlBase._cntrls
        CntrlBase._cntrls = nxt + 1
        return nxt

    # Version numbers go through cls, so each concrete controller type
    # counts independently from zero.
    _version = 0

    @classmethod
    def versionCount(cls):
        nxt = cls._version
        cls._version = nxt + 1
        return nxt
+
#
# Note: the L1 Cache latency is only used by the sequencer on fast path hits
#
class L1Cache(RubyCache):
    # CPU-side L1: no resource-stall modelling, single-cycle arrays.
    resourceStalls = False
    dataArrayBanks = 2
    tagArrayBanks = 2
    dataAccessLatency = 1
    tagAccessLatency = 1

    def create(self, size, assoc, options):
        # Size and associativity come from the l1i/l1d command-line
        # options; the 'options' parameter itself is unused here.
        self.size = MemorySize(size)
        self.assoc = assoc
        self.replacement_policy = PseudoLRUReplacementPolicy()
+
class L2Cache(RubyCache):
    # Private per-core-pair L2; size/assoc are overwritten from the
    # command-line options in create().
    resourceStalls = False
    assoc = 16
    dataArrayBanks = 16
    tagArrayBanks = 16

    def create(self, size, assoc, options):
        self.size = MemorySize(size)
        self.assoc = assoc
        self.replacement_policy = PseudoLRUReplacementPolicy()
+
class CPCntrl(CorePair_Controller, CntrlBase):
    """CPU CorePair controller: serves two CPU cores, each with its own
    L1D and sequencer, sharing one L1I and one private L2."""

    def create(self, options, ruby_system, system):
        self.version = self.versionCount()

        self.L1Icache = L1Cache()
        self.L1Icache.create(options.l1i_size, options.l1i_assoc, options)
        self.L1D0cache = L1Cache()
        self.L1D0cache.create(options.l1d_size, options.l1d_assoc, options)
        self.L1D1cache = L1Cache()
        self.L1D1cache.create(options.l1d_size, options.l1d_assoc, options)
        self.L2cache = L2Cache()
        self.L2cache.create(options.l2_size, options.l2_assoc, options)

        # Core 0 sequencer: shares the I-cache, owns L1D0.
        self.sequencer = RubySequencer()
        self.sequencer.version = self.seqCount()
        self.sequencer.icache = self.L1Icache
        self.sequencer.dcache = self.L1D0cache
        self.sequencer.ruby_system = ruby_system
        self.sequencer.coreid = 0
        self.sequencer.is_cpu_sequencer = True

        # Core 1 sequencer: shares the I-cache, owns L1D1.
        self.sequencer1 = RubySequencer()
        self.sequencer1.version = self.seqCount()
        self.sequencer1.icache = self.L1Icache
        self.sequencer1.dcache = self.L1D1cache
        self.sequencer1.ruby_system = ruby_system
        self.sequencer1.coreid = 1
        self.sequencer1.is_cpu_sequencer = True

        self.issue_latency = 1
        self.send_evictions = send_evicts(options)

        self.ruby_system = ruby_system

        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency
+
class TCPCache(RubyCache):
    """GPU per-CU L1 data cache (TCP).

    The class attributes are overridable defaults: TCPCntrl constructs
    this cache with dataAccessLatency = options.TCP_latency.
    """
    size = "16kB"
    assoc = 16
    dataArrayBanks = 16
    tagArrayBanks = 16
    dataAccessLatency = 4
    tagAccessLatency = 1

    def create(self, options):
        self.size = MemorySize(options.tcp_size)
        # Do NOT re-assign the bank counts / latencies here: they already
        # default to the class values above, and the original
        # 'self.dataAccessLatency = 4' silently clobbered the
        # dataAccessLatency = options.TCP_latency value that TCPCntrl
        # passes to the constructor.
        self.resourceStalls = options.no_tcc_resource_stalls
        self.replacement_policy = PseudoLRUReplacementPolicy(assoc = self.assoc)
+
class TCPCntrl(TCP_Controller, CntrlBase):
    """GPU L1 (TCP) controller for a single compute unit."""

    def create(self, options, ruby_system, system):
        self.version = self.versionCount()
        # TCP hit latency comes from the command line (--TCP_latency).
        self.L1cache = TCPCache(dataAccessLatency = options.TCP_latency)
        self.L1cache.create(options)
        self.issue_latency = 1

        # The VIPER coalescer merges per-lane GPU accesses; it is the
        # port exposed to the shader (use_seq_not_coal = False below).
        self.coalescer = VIPERCoalescer()
        self.coalescer.version = self.seqCount()
        self.coalescer.icache = self.L1cache
        self.coalescer.dcache = self.L1cache
        self.coalescer.ruby_system = ruby_system
        self.coalescer.support_inst_reqs = False
        self.coalescer.is_cpu_sequencer = False

        # Plain sequencer kept alongside the coalescer; presumably only
        # used when use_seq_not_coal is True -- confirm in TCP_Controller.
        self.sequencer = RubySequencer()
        self.sequencer.version = self.seqCount()
        self.sequencer.icache = self.L1cache
        self.sequencer.dcache = self.L1cache
        self.sequencer.ruby_system = ruby_system
        self.sequencer.is_cpu_sequencer = True

        self.use_seq_not_coal = False

        self.ruby_system = ruby_system
        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency
+
class SQCCache(RubyCache):
    # GPU instruction (SQC) cache; single-cycle arrays.
    dataArrayBanks = 8
    tagArrayBanks = 8
    dataAccessLatency = 1
    tagAccessLatency = 1

    def create(self, options):
        # Sized from the --sqc-size / --sqc-assoc command-line options.
        self.size = MemorySize(options.sqc_size)
        self.assoc = options.sqc_assoc
        self.replacement_policy = PseudoLRUReplacementPolicy(assoc = self.assoc)
+
class SQCCntrl(SQC_Controller, CntrlBase):
    """GPU instruction-fetch (SQC) controller; its sequencer serves
    fetch only (support_data_reqs = False)."""

    def create(self, options, ruby_system, system):
        self.version = self.versionCount()
        self.L1cache = SQCCache()
        self.L1cache.create(options)
        self.L1cache.resourceStalls = False
        self.sequencer = RubySequencer()
        self.sequencer.version = self.seqCount()
        # Same cache wired as both icache and dcache; only fetch is used.
        self.sequencer.icache = self.L1cache
        self.sequencer.dcache = self.L1cache
        self.sequencer.ruby_system = ruby_system
        self.sequencer.support_data_reqs = False
        self.sequencer.is_cpu_sequencer = False
        self.ruby_system = ruby_system
        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency
+
class TCC(RubyCache):
    """Shared GPU L2 (TCC) bank; the configured capacity and bank counts
    are divided over the num_tccs banks."""
    size = MemorySize("256kB")
    assoc = 16
    dataAccessLatency = 8
    tagAccessLatency = 2
    resourceStalls = False

    def create(self, options):
        self.assoc = options.tcc_assoc
        if hasattr(options, 'bw_scalor') and options.bw_scalor > 0:
            # Bandwidth-scaled configs: 128kB per CU, heavy banking.
            s = options.num_compute_units
            tcc_size = s * 128
            tcc_size = str(tcc_size)+'kB'
            self.size = MemorySize(tcc_size)
            self.dataArrayBanks = 64
            self.tagArrayBanks = 64
        else:
            self.size = MemorySize(options.tcc_size)
            self.dataArrayBanks = 256 / options.num_tccs #number of data banks
            self.tagArrayBanks = 256 / options.num_tccs #number of tag banks
            self.size.value = self.size.value / options.num_tccs
        # Clamp so each way keeps at least 128 bytes of capacity.
        if ((self.size.value / long(self.assoc)) < 128):
            self.size.value = long(128 * self.assoc)
        # Index past the TCC-bank interleave bits above the line offset.
        self.start_index_bit = math.log(options.cacheline_size, 2) + \
            math.log(options.num_tccs, 2)
        self.replacement_policy = PseudoLRUReplacementPolicy(assoc = self.assoc)
+
class TCCCntrl(TCC_Controller, CntrlBase):
    """GPU shared L2 (TCC) bank controller."""
    def create(self, options, ruby_system, system):
        self.version = self.versionCount()
        self.L2cache = TCC()
        self.L2cache.create(options)
        self.ruby_system = ruby_system
        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency
+
class L3Cache(RubyCache):
    """Per-directory slice of the L3: capacity and banks are divided
    evenly over the num_dirs memory channels."""
    dataArrayBanks = 16
    tagArrayBanks = 16

    def create(self, options, ruby_system, system):
        self.size = MemorySize(options.l3_size)
        self.size.value /= options.num_dirs
        self.assoc = options.l3_assoc
        # Divide the bank counts across the directory slices exactly
        # once. (The original divided by num_dirs twice -- a copy-paste
        # duplication that would quarter the banks for num_dirs == 2;
        # harmless only because this protocol asserts num_dirs == 1.)
        self.dataArrayBanks /= options.num_dirs
        self.tagArrayBanks /= options.num_dirs
        self.dataAccessLatency = options.l3_data_latency
        self.tagAccessLatency = options.l3_tag_latency
        self.resourceStalls = False
        self.replacement_policy = PseudoLRUReplacementPolicy(assoc = self.assoc)
+
class L3Cntrl(L3Cache_Controller, CntrlBase):
    """L3 cache controller; talks to its directory over dedicated wire
    buffers (see connectWireBuffers)."""
    def create(self, options, ruby_system, system):
        self.version = self.versionCount()
        self.L3cache = L3Cache()
        self.L3cache.create(options, ruby_system, system)
        # Response latency tracks the slower of the two array accesses.
        self.l3_response_latency = \
            max(self.L3cache.dataAccessLatency, self.L3cache.tagAccessLatency)
        self.ruby_system = ruby_system
        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

    def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir,
                           req_to_l3, probe_to_l3, resp_to_l3):
        # Point-to-point wire buffers between this L3 and its directory.
        self.reqToDir = req_to_dir
        self.respToDir = resp_to_dir
        self.l3UnblockToDir = l3_unblock_to_dir
        self.reqToL3 = req_to_l3
        self.probeToL3 = probe_to_l3
        self.respToL3 = resp_to_l3
+
# Directory memory: Directory memory of infinite size which is
# used by directory controller to store the "states" of the
# state machine. The state machine is implemented per cache block
class DirMem(RubyDirectoryMemory, CntrlBase):
    def create(self, options, ruby_system, system):
        self.version = self.versionCount()
        # Split the total physical memory evenly across the directories.
        phys_mem_size = AddrRange(options.mem_size).size()
        mem_module_size = phys_mem_size / options.num_dirs
        dir_size = MemorySize('0B')
        dir_size.value = mem_module_size
        self.size = dir_size
+
# Directory controller: Contains directory memory, L3 cache and associated
# state machine which is used to accurately redirect a data request to L3
# cache or to memory. The permissions requests do not come to this directory
# for region based protocols as they are handled exclusively by the region
# directory. However, the region directory controller uses this directory
# controller for sending probe requests and receiving probe responses.
class DirCntrl(Directory_Controller, CntrlBase):
    def create(self, options, ruby_system, system):
        self.version = self.versionCount()
        # Fixed latencies: 25 cycles toward the cores, 1 toward the
        # (co-located) region directory.
        self.response_latency = 25
        self.response_latency_regionDir = 1
        self.directory = DirMem()
        self.directory.create(options, ruby_system, system)
        self.L3CacheMemory = L3Cache()
        self.L3CacheMemory.create(options, ruby_system, system)
        # L3 hit latency tracks the slower of the two array accesses.
        self.l3_hit_latency = \
            max(self.L3CacheMemory.dataAccessLatency,
                self.L3CacheMemory.tagAccessLatency)

        self.ruby_system = ruby_system
        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

    def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir,
                           req_to_l3, probe_to_l3, resp_to_l3):
        # Point-to-point wire buffers between this directory and its L3.
        self.reqToDir = req_to_dir
        self.respToDir = resp_to_dir
        self.l3UnblockToDir = l3_unblock_to_dir
        self.reqToL3 = req_to_l3
        self.probeToL3 = probe_to_l3
        self.respToL3 = resp_to_l3
+
# Region directory : Stores region permissions
class RegionDir(RubyCache):

    def create(self, options, ruby_system, system):
        # A "block" here is a whole region: blocks_per_region 64B lines.
        self.block_size = "%dB" % (64 * options.blocks_per_region)
        self.size = options.region_dir_entries * \
            self.block_size * options.num_compute_units
        self.assoc = 8
        self.tagArrayBanks = 8
        self.tagAccessLatency = options.dir_tag_latency
        self.dataAccessLatency = 1
        self.resourceStalls = options.no_resource_stalls
        # Index past the in-region offset bits (64B line => 6 bits).
        self.start_index_bit = 6 + int(math.log(options.blocks_per_region, 2))
        self.replacement_policy = PseudoLRUReplacementPolicy(assoc = self.assoc)
# Region directory controller : Contains region directory and associated
# state machine for dealing with region coherence requests.
class RegionCntrl(RegionDir_Controller, CntrlBase):
    def create(self, options, ruby_system, system):
        self.version = self.versionCount()
        self.cacheMemory = RegionDir()
        self.cacheMemory.create(options, ruby_system, system)
        self.blocksPerRegion = options.blocks_per_region
        # Latency toward the block directory: the slower array access.
        self.toDirLatency = \
            max(self.cacheMemory.dataAccessLatency,
                self.cacheMemory.tagAccessLatency)
        self.ruby_system = ruby_system
        self.always_migrate = options.always_migrate
        self.sym_migrate = options.symmetric_migrate
        self.asym_migrate = options.asymmetric_migrate
        # The three migration policies are mutually exclusive.
        if self.always_migrate:
            assert(not self.asym_migrate and not self.sym_migrate)
        if self.sym_migrate:
            assert(not self.always_migrate and not self.asym_migrate)
        if self.asym_migrate:
            assert(not self.always_migrate and not self.sym_migrate)
        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency
+
# Region Buffer: A region directory cache which avoids some potential
# long latency lookup of region directory for getting region permissions
class RegionBuffer(RubyCache):
    # Defaults only; RBCntrl.create() overrides banks, size, block_size
    # and start_index_bit from the command-line options.
    assoc = 4
    dataArrayBanks = 256
    tagArrayBanks = 256
    dataAccessLatency = 1
    tagAccessLatency = 1
    resourceStalls = True
+
class RBCntrl(RegionBuffer_Controller, CntrlBase):
    """Region-buffer controller; one instance is paired with every CPU
    core-pair controller and every GPU TCC bank (see create_system)."""
    def create(self, options, ruby_system, system):
        self.version = self.versionCount()
        self.cacheMemory = RegionBuffer()
        self.cacheMemory.resourceStalls = options.no_tcc_resource_stalls
        self.cacheMemory.dataArrayBanks = 64
        self.cacheMemory.tagArrayBanks = 64
        self.blocksPerRegion = options.blocks_per_region
        # A cache "block" is a whole region (blocks_per_region 64B lines);
        # index past the in-region offset bits.
        self.cacheMemory.block_size = "%dB" % (64 * self.blocksPerRegion)
        self.cacheMemory.start_index_bit = \
            6 + int(math.log(self.blocksPerRegion, 2))
        self.cacheMemory.size = options.region_buffer_entries * \
            self.cacheMemory.block_size * options.num_compute_units
        self.toDirLatency = options.gpu_to_dir_latency
        self.toRegionDirLatency = options.cpu_to_dir_latency
        self.noTCCdir = True
        TCC_bits = int(math.log(options.num_tccs, 2))
        self.TCC_select_num_bits = TCC_bits
        self.ruby_system = ruby_system

        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency
        self.cacheMemory.replacement_policy = \
            PseudoLRUReplacementPolicy(assoc = self.cacheMemory.assoc)
+
def define_options(parser):
    """Register the GPU_VIPER_Region-specific command-line options on
    the given optparse parser."""
    parser.add_option("--num-subcaches", type="int", default=4)
    parser.add_option("--l3-data-latency", type="int", default=20)
    parser.add_option("--l3-tag-latency", type="int", default=15)
    parser.add_option("--cpu-to-dir-latency", type="int", default=120)
    parser.add_option("--gpu-to-dir-latency", type="int", default=60)
    # store_false flags: passing these options *disables* the stalls.
    parser.add_option("--no-resource-stalls", action="store_false",
                      default=True)
    parser.add_option("--no-tcc-resource-stalls", action="store_false",
                      default=True)
    parser.add_option("--num-tbes", type="int", default=32)
    parser.add_option("--l2-latency", type="int", default=50) # load to use
    parser.add_option("--num-tccs", type="int", default=1,
                      help="number of TCC banks in the GPU")

    parser.add_option("--sqc-size", type='string', default='32kB',
                      help="SQC cache size")
    parser.add_option("--sqc-assoc", type='int', default=8,
                      help="SQC cache assoc")

    # Fixed help text: --WB_L1 configures the L1 (the original said L2).
    parser.add_option("--WB_L1", action="store_true",
                      default=False, help="L1 Writeback Cache")
    parser.add_option("--WB_L2", action="store_true",
                      default=False, help="L2 Writeback Cache")
    parser.add_option("--TCP_latency",
                      type="int", default=4, help="TCP latency")
    parser.add_option("--TCC_latency",
                      type="int", default=16, help="TCC latency")
    parser.add_option("--tcc-size", type='string', default='2MB',
                      help="aggregate tcc size")
    parser.add_option("--tcc-assoc", type='int', default=16,
                      help="tcc assoc")
    parser.add_option("--tcp-size", type='string', default='16kB',
                      help="tcp size")

    parser.add_option("--dir-tag-latency", type="int", default=4)
    parser.add_option("--dir-tag-banks", type="int", default=4)
    parser.add_option("--blocks-per-region", type="int", default=16)
    parser.add_option("--dir-entries", type="int", default=8192)

    # The region buffer is a cache of the region directory, so the
    # region directory is inclusive with respect to the region buffer.
    # However, the region directory is non-inclusive with respect to
    # the caches in the system.
    parser.add_option("--region-dir-entries", type="int", default=1024)
    parser.add_option("--region-buffer-entries", type="int", default=512)

    parser.add_option("--always-migrate",
                      action="store_true", default=False)
    parser.add_option("--symmetric-migrate",
                      action="store_true", default=False)
    parser.add_option("--asymmetric-migrate",
                      action="store_true", default=False)
    parser.add_option("--use-L3-on-WT", action="store_true", default=False)
+
def create_system(options, full_system, system, dma_devices, ruby_system):
    """Construct the GPU_VIPER_Region Ruby system.

    On top of the usual VIPER controllers (CorePair, TCP, SQC, TCC,
    directory), region coherence pairs a region-buffer (RB) controller
    with every CPU core pair and every TCC bank, and adds one region
    directory that hands out region-granularity permissions; the block
    directory then only moves data.

    Returns:
        (cpu_sequencers, dir_cntrl_nodes, mainCluster) -- the tuple
        Ruby.py expects from a protocol config's create_system().
    """
    if buildEnv['PROTOCOL'] != 'GPU_VIPER_Region':
        panic("This script requires the GPU_VIPER_Region protocol to be built.")

    cpu_sequencers = []

    # The ruby network creation expects the list of nodes in the system
    # to be consistent with the NetDest list. Therefore the l1 controller
    # nodes must be listed before the directory nodes and directory nodes
    # before dma nodes, etc.
    dir_cntrl_nodes = []

    # Number of address bits used to select a TCC bank.
    TCC_bits = int(math.log(options.num_tccs, 2))

    # Must create the individual controllers before the network to ensure
    # the controller constructors are called before the network
    # constructor.
    crossbar_bw = 16 * options.num_compute_units # Assuming a 2GHz clock
    cpuCluster = Cluster(extBW = crossbar_bw, intBW = crossbar_bw)
    # For an odd number of CPUs, still create the right number of
    # CorePair controllers (one per two cores).
    for i in xrange((options.num_cpus + 1) / 2):
        cp_cntrl = CPCntrl()
        cp_cntrl.create(options, ruby_system, system)

        # CPU-side region buffer paired with this core-pair controller.
        rb_cntrl = RBCntrl()
        rb_cntrl.create(options, ruby_system, system)
        rb_cntrl.number_of_TBEs = 256
        rb_cntrl.isOnCPU = True

        cp_cntrl.regionBufferNum = rb_cntrl.version

        setattr(system, "cp_cntrl%d" % i, cp_cntrl)
        setattr(system, "rb_cntrl%d" % i, rb_cntrl)

        # Add controllers and sequencers to the appropriate lists
        cpu_sequencers.extend([cp_cntrl.sequencer, cp_cntrl.sequencer1])

        # Connect the CP controllers and the network
        cp_cntrl.requestFromCore = MessageBuffer()
        cp_cntrl.requestFromCore.master = ruby_system.network.slave

        cp_cntrl.responseFromCore = MessageBuffer()
        cp_cntrl.responseFromCore.master = ruby_system.network.slave

        cp_cntrl.unblockFromCore = MessageBuffer()
        cp_cntrl.unblockFromCore.master = ruby_system.network.slave

        cp_cntrl.probeToCore = MessageBuffer()
        cp_cntrl.probeToCore.slave = ruby_system.network.master

        cp_cntrl.responseToCore = MessageBuffer()
        cp_cntrl.responseToCore.slave = ruby_system.network.master

        cp_cntrl.mandatoryQueue = MessageBuffer()
        cp_cntrl.triggerQueue = MessageBuffer(ordered = True)

        # Connect the RB controllers to the ruby network
        rb_cntrl.requestFromCore = MessageBuffer(ordered = True)
        rb_cntrl.requestFromCore.slave = ruby_system.network.master

        rb_cntrl.responseFromCore = MessageBuffer()
        rb_cntrl.responseFromCore.slave = ruby_system.network.master

        rb_cntrl.requestToNetwork = MessageBuffer()
        rb_cntrl.requestToNetwork.master = ruby_system.network.slave

        rb_cntrl.notifyFromRegionDir = MessageBuffer()
        rb_cntrl.notifyFromRegionDir.slave = ruby_system.network.master

        rb_cntrl.probeFromRegionDir = MessageBuffer()
        rb_cntrl.probeFromRegionDir.slave = ruby_system.network.master

        rb_cntrl.unblockFromDir = MessageBuffer()
        rb_cntrl.unblockFromDir.slave = ruby_system.network.master

        rb_cntrl.responseToRegDir = MessageBuffer()
        rb_cntrl.responseToRegDir.master = ruby_system.network.slave

        rb_cntrl.triggerQueue = MessageBuffer(ordered = True)

        cpuCluster.add(cp_cntrl)
        cpuCluster.add(rb_cntrl)

    gpuCluster = Cluster(extBW = crossbar_bw, intBW = crossbar_bw)
    for i in xrange(options.num_compute_units):
        # TBEs set to max outstanding requests
        tcp_cntrl = TCPCntrl(TCC_select_num_bits = TCC_bits,
                             issue_latency = 1,
                             number_of_TBEs = 2560)
        tcp_cntrl.create(options, ruby_system, system)
        tcp_cntrl.WB = options.WB_L1
        tcp_cntrl.disableL1 = False

        setattr(system, "tcp_cntrl%d" % i, tcp_cntrl)

        # Add controllers and sequencers to the appropriate lists
        cpu_sequencers.append(tcp_cntrl.coalescer)

        # Connect the TCP controllers to the ruby network
        tcp_cntrl.requestFromTCP = MessageBuffer(ordered = True)
        tcp_cntrl.requestFromTCP.master = ruby_system.network.slave

        tcp_cntrl.responseFromTCP = MessageBuffer(ordered = True)
        tcp_cntrl.responseFromTCP.master = ruby_system.network.slave

        tcp_cntrl.unblockFromCore = MessageBuffer()
        tcp_cntrl.unblockFromCore.master = ruby_system.network.slave

        tcp_cntrl.probeToTCP = MessageBuffer(ordered = True)
        tcp_cntrl.probeToTCP.slave = ruby_system.network.master

        tcp_cntrl.responseToTCP = MessageBuffer(ordered = True)
        tcp_cntrl.responseToTCP.slave = ruby_system.network.master

        tcp_cntrl.mandatoryQueue = MessageBuffer()

        gpuCluster.add(tcp_cntrl)

    for i in xrange(options.num_sqc):
        sqc_cntrl = SQCCntrl(TCC_select_num_bits = TCC_bits)
        sqc_cntrl.create(options, ruby_system, system)

        setattr(system, "sqc_cntrl%d" % i, sqc_cntrl)

        # Add controllers and sequencers to the appropriate lists
        cpu_sequencers.append(sqc_cntrl.sequencer)

        # Connect the SQC controller to the ruby network
        sqc_cntrl.requestFromSQC = MessageBuffer(ordered = True)
        sqc_cntrl.requestFromSQC.master = ruby_system.network.slave

        sqc_cntrl.probeToSQC = MessageBuffer(ordered = True)
        sqc_cntrl.probeToSQC.slave = ruby_system.network.master

        sqc_cntrl.responseToSQC = MessageBuffer(ordered = True)
        sqc_cntrl.responseToSQC.slave = ruby_system.network.master

        sqc_cntrl.mandatoryQueue = MessageBuffer()

        # SQC also in GPU cluster
        gpuCluster.add(sqc_cntrl)

    for i in xrange(options.num_tccs):
        tcc_cntrl = TCCCntrl()
        tcc_cntrl.create(options, ruby_system, system)
        tcc_cntrl.l2_request_latency = 1
        tcc_cntrl.l2_response_latency = options.TCC_latency
        tcc_cntrl.WB = options.WB_L2
        tcc_cntrl.number_of_TBEs = 2560 * options.num_compute_units

        # Connect the TCC controllers to the ruby network
        tcc_cntrl.requestFromTCP = MessageBuffer(ordered = True)
        tcc_cntrl.requestFromTCP.slave = ruby_system.network.master

        tcc_cntrl.responseToCore = MessageBuffer(ordered = True)
        tcc_cntrl.responseToCore.master = ruby_system.network.slave

        tcc_cntrl.probeFromNB = MessageBuffer()
        tcc_cntrl.probeFromNB.slave = ruby_system.network.master

        tcc_cntrl.responseFromNB = MessageBuffer()
        tcc_cntrl.responseFromNB.slave = ruby_system.network.master

        tcc_cntrl.requestToNB = MessageBuffer(ordered = True)
        tcc_cntrl.requestToNB.master = ruby_system.network.slave

        tcc_cntrl.responseToNB = MessageBuffer()
        tcc_cntrl.responseToNB.master = ruby_system.network.slave

        tcc_cntrl.unblockToNB = MessageBuffer()
        tcc_cntrl.unblockToNB.master = ruby_system.network.slave

        tcc_cntrl.triggerQueue = MessageBuffer(ordered = True)

        # GPU-side region buffer paired with this TCC bank.
        rb_cntrl = RBCntrl()
        rb_cntrl.create(options, ruby_system, system)
        rb_cntrl.number_of_TBEs = 2560 * options.num_compute_units
        rb_cntrl.isOnCPU = False

        # Connect the RB controllers to the ruby network
        rb_cntrl.requestFromCore = MessageBuffer(ordered = True)
        rb_cntrl.requestFromCore.slave = ruby_system.network.master

        rb_cntrl.responseFromCore = MessageBuffer()
        rb_cntrl.responseFromCore.slave = ruby_system.network.master

        rb_cntrl.requestToNetwork = MessageBuffer()
        rb_cntrl.requestToNetwork.master = ruby_system.network.slave

        rb_cntrl.notifyFromRegionDir = MessageBuffer()
        rb_cntrl.notifyFromRegionDir.slave = ruby_system.network.master

        rb_cntrl.probeFromRegionDir = MessageBuffer()
        rb_cntrl.probeFromRegionDir.slave = ruby_system.network.master

        rb_cntrl.unblockFromDir = MessageBuffer()
        rb_cntrl.unblockFromDir.slave = ruby_system.network.master

        rb_cntrl.responseToRegDir = MessageBuffer()
        rb_cntrl.responseToRegDir.master = ruby_system.network.slave

        rb_cntrl.triggerQueue = MessageBuffer(ordered = True)

        tcc_cntrl.regionBufferNum = rb_cntrl.version

        setattr(system, "tcc_cntrl%d" % i, tcc_cntrl)
        setattr(system, "tcc_rb_cntrl%d" % i, rb_cntrl)

        # TCC cntrls added to the GPU cluster
        gpuCluster.add(tcc_cntrl)
        gpuCluster.add(rb_cntrl)

    # Because of wire buffers, num_l3caches must equal num_dirs.
    # Region coherence only works with 1 dir.
    assert(options.num_l3caches == options.num_dirs == 1)

    # This is the base crossbar that connects the L3s, Dirs, and cpu/gpu
    # Clusters
    mainCluster = Cluster(intBW = crossbar_bw)

    dir_cntrl = DirCntrl()
    dir_cntrl.create(options, ruby_system, system)
    dir_cntrl.number_of_TBEs = 2560 * options.num_compute_units
    dir_cntrl.useL3OnWT = options.use_L3_on_WT

    # Connect the Directory controller to the ruby network
    dir_cntrl.requestFromCores = MessageBuffer()
    dir_cntrl.requestFromCores.slave = ruby_system.network.master

    dir_cntrl.responseFromCores = MessageBuffer()
    dir_cntrl.responseFromCores.slave = ruby_system.network.master

    dir_cntrl.unblockFromCores = MessageBuffer()
    dir_cntrl.unblockFromCores.slave = ruby_system.network.master

    dir_cntrl.probeToCore = MessageBuffer()
    dir_cntrl.probeToCore.master = ruby_system.network.slave

    dir_cntrl.responseToCore = MessageBuffer()
    dir_cntrl.responseToCore.master = ruby_system.network.slave

    dir_cntrl.reqFromRegBuf = MessageBuffer()
    dir_cntrl.reqFromRegBuf.slave = ruby_system.network.master

    dir_cntrl.reqToRegDir = MessageBuffer(ordered = True)
    dir_cntrl.reqToRegDir.master = ruby_system.network.slave

    dir_cntrl.reqFromRegDir = MessageBuffer(ordered = True)
    dir_cntrl.reqFromRegDir.slave = ruby_system.network.master

    dir_cntrl.unblockToRegDir = MessageBuffer()
    dir_cntrl.unblockToRegDir.master = ruby_system.network.slave

    dir_cntrl.triggerQueue = MessageBuffer(ordered = True)
    dir_cntrl.L3triggerQueue = MessageBuffer(ordered = True)
    dir_cntrl.responseFromMemory = MessageBuffer()

    # NOTE: 'i' here is left over from the TCC loop above, so the child
    # is named "dir_cntrl<num_tccs-1>". Kept as-is for compatibility
    # with existing scripts that rely on that name.
    setattr(system, "dir_cntrl%d" % i, dir_cntrl)
    dir_cntrl_nodes.append(dir_cntrl)

    mainCluster.add(dir_cntrl)

    reg_cntrl = RegionCntrl(noTCCdir = True, TCC_select_num_bits = TCC_bits)
    reg_cntrl.create(options, ruby_system, system)
    reg_cntrl.number_of_TBEs = options.num_tbes
    # The region directory needs the versions of one CPU-side and one
    # GPU-side region buffer (created above) to address them.
    reg_cntrl.cpuRegionBufferNum = system.rb_cntrl0.version
    reg_cntrl.gpuRegionBufferNum = system.tcc_rb_cntrl0.version

    # Connect the Region Dir controllers to the ruby network
    reg_cntrl.requestToDir = MessageBuffer(ordered = True)
    reg_cntrl.requestToDir.master = ruby_system.network.slave

    reg_cntrl.notifyToRBuffer = MessageBuffer()
    reg_cntrl.notifyToRBuffer.master = ruby_system.network.slave

    reg_cntrl.probeToRBuffer = MessageBuffer()
    reg_cntrl.probeToRBuffer.master = ruby_system.network.slave

    reg_cntrl.responseFromRBuffer = MessageBuffer()
    reg_cntrl.responseFromRBuffer.slave = ruby_system.network.master

    reg_cntrl.requestFromRegBuf = MessageBuffer()
    reg_cntrl.requestFromRegBuf.slave = ruby_system.network.master

    reg_cntrl.triggerQueue = MessageBuffer(ordered = True)

    # Same leftover-'i' naming convention as dir_cntrl above.
    setattr(system, "reg_cntrl%d" % i, reg_cntrl)

    mainCluster.add(reg_cntrl)

    # Assuming no DMA devices
    assert(len(dma_devices) == 0)

    # Add cpu/gpu clusters to main cluster
    mainCluster.add(cpuCluster)
    mainCluster.add(gpuCluster)

    ruby_system.network.number_of_virtual_networks = 10

    return (cpu_sequencers, dir_cntrl_nodes, mainCluster)
diff --git a/configs/ruby/MOESI_AMD_Base.py b/configs/ruby/MOESI_AMD_Base.py
new file mode 100644
index 000000000..4c8ad28b0
--- /dev/null
+++ b/configs/ruby/MOESI_AMD_Base.py
@@ -0,0 +1,326 @@
+#
+# Copyright (c) 2010-2015 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Author: Lisa Hsu
+#
+
+import math
+import m5
+from m5.objects import *
+from m5.defines import buildEnv
+from Ruby import create_topology
+from Ruby import send_evicts
+
+from Cluster import Cluster
+from Crossbar import Crossbar
+
+class CntrlBase:
+    """Mixin providing ID counters for Ruby controllers.
+
+    seqCount() and cntrlCount() are global across ALL subclasses (they
+    bump class attributes on CntrlBase itself), while versionCount()
+    is tracked per concrete subclass (it bumps `cls._version`, which
+    creates a shadowing attribute on the subclass on first use).
+    All three return the pre-increment value, i.e. IDs start at 0.
+    """
+    _seqs = 0
+    @classmethod
+    def seqCount(cls):
+        # Use SeqCount not class since we need global count
+        CntrlBase._seqs += 1
+        return CntrlBase._seqs - 1
+
+    _cntrls = 0
+    @classmethod
+    def cntrlCount(cls):
+        # Use CntlCount not class since we need global count
+        CntrlBase._cntrls += 1
+        return CntrlBase._cntrls - 1
+
+    _version = 0
+    @classmethod
+    def versionCount(cls):
+        cls._version += 1 # Use count for this particular type
+        return cls._version - 1
+
+class L1DCache(RubyCache):
+    # Private per-core L1 data cache; size/assoc come from the
+    # --l1d-size / --l1d-assoc command-line options.
+    resourceStalls = False
+    def create(self, options):
+        self.size = MemorySize(options.l1d_size)
+        self.assoc = options.l1d_assoc
+        self.replacement_policy = PseudoLRUReplacementPolicy()
+
+class L1ICache(RubyCache):
+    # L1 instruction cache (shared by both cores of a core pair in
+    # CPCntrl below); sized by --l1i-size / --l1i-assoc.
+    resourceStalls = False
+    def create(self, options):
+        self.size = MemorySize(options.l1i_size)
+        self.assoc = options.l1i_assoc
+        self.replacement_policy = PseudoLRUReplacementPolicy()
+
+class L2Cache(RubyCache):
+    # Unified L2 cache for a core pair; sized by --l2-size / --l2-assoc.
+    resourceStalls = False
+    def create(self, options):
+        self.size = MemorySize(options.l2_size)
+        self.assoc = options.l2_assoc
+        self.replacement_policy = PseudoLRUReplacementPolicy()
+
+class CPCntrl(CorePair_Controller, CntrlBase):
+    """Core-pair cache controller: two cores sharing one L1I and one L2,
+    each with a private L1D and its own RubySequencer (coreid 0 and 1).
+    """
+
+    def create(self, options, ruby_system, system):
+        self.version = self.versionCount()
+
+        # One shared L1I, two private L1Ds, one shared L2 per pair.
+        self.L1Icache = L1ICache()
+        self.L1Icache.create(options)
+        self.L1D0cache = L1DCache()
+        self.L1D0cache.create(options)
+        self.L1D1cache = L1DCache()
+        self.L1D1cache.create(options)
+        self.L2cache = L2Cache()
+        self.L2cache.create(options)
+
+        # Sequencer for core 0: shared L1I, private L1D0.
+        self.sequencer = RubySequencer()
+        self.sequencer.icache_hit_latency = 2
+        self.sequencer.dcache_hit_latency = 2
+        self.sequencer.version = self.seqCount()
+        self.sequencer.icache = self.L1Icache
+        self.sequencer.dcache = self.L1D0cache
+        self.sequencer.ruby_system = ruby_system
+        self.sequencer.coreid = 0
+        self.sequencer.is_cpu_sequencer = True
+
+        # Sequencer for core 1: same L1I, private L1D1.
+        self.sequencer1 = RubySequencer()
+        self.sequencer1.version = self.seqCount()
+        self.sequencer1.icache = self.L1Icache
+        self.sequencer1.dcache = self.L1D1cache
+        self.sequencer1.icache_hit_latency = 2
+        self.sequencer1.dcache_hit_latency = 2
+        self.sequencer1.ruby_system = ruby_system
+        self.sequencer1.coreid = 1
+        self.sequencer1.is_cpu_sequencer = True
+
+        # Latency from the core pair to the directory (option, cycles).
+        self.issue_latency = options.cpu_to_dir_latency
+        self.send_evictions = send_evicts(options)
+
+        self.ruby_system = ruby_system
+
+        if options.recycle_latency:
+            self.recycle_latency = options.recycle_latency
+
+class L3Cache(RubyCache):
+ assoc = 8
+ dataArrayBanks = 256
+ tagArrayBanks = 256
+
+ def create(self, options, ruby_system, system):
+ self.size = MemorySize(options.l3_size)
+ self.size.value /= options.num_dirs
+ self.dataArrayBanks /= options.num_dirs
+ self.tagArrayBanks /= options.num_dirs
+ self.dataArrayBanks /= options.num_dirs
+ self.tagArrayBanks /= options.num_dirs
+ self.dataAccessLatency = options.l3_data_latency
+ self.tagAccessLatency = options.l3_tag_latency
+ self.resourceStalls = options.no_resource_stalls
+ self.replacement_policy = PseudoLRUReplacementPolicy()
+
+class L3Cntrl(L3Cache_Controller, CntrlBase):
+    """Controller wrapping one L3Cache slice; wired to its directory
+    via the wire buffers passed to connectWireBuffers()."""
+    def create(self, options, ruby_system, system):
+        self.version = self.versionCount()
+        self.L3cache = L3Cache()
+        self.L3cache.create(options, ruby_system, system)
+
+        # Response latency is bounded by the slower of the two arrays.
+        self.l3_response_latency = max(self.L3cache.dataAccessLatency,
+                                       self.L3cache.tagAccessLatency)
+        self.ruby_system = ruby_system
+
+        if options.recycle_latency:
+            self.recycle_latency = options.recycle_latency
+
+    def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir,
+                           req_to_l3, probe_to_l3, resp_to_l3):
+        # Point-to-point wire buffers between this L3 and its directory;
+        # the same buffer objects are handed to the directory side.
+        self.reqToDir = req_to_dir
+        self.respToDir = resp_to_dir
+        self.l3UnblockToDir = l3_unblock_to_dir
+        self.reqToL3 = req_to_l3
+        self.probeToL3 = probe_to_l3
+        self.respToL3 = resp_to_l3
+
+class DirMem(RubyDirectoryMemory, CntrlBase):
+    """Directory memory covering an equal share (1/num_dirs) of the
+    physical address space."""
+    def create(self, options, ruby_system, system):
+        self.version = self.versionCount()
+
+        # Split total physical memory evenly across the directories.
+        # NOTE(review): integer division; assumes mem_size is evenly
+        # divisible by num_dirs -- confirm for odd configurations.
+        phys_mem_size = AddrRange(options.mem_size).size()
+        mem_module_size = phys_mem_size / options.num_dirs
+        dir_size = MemorySize('0B')
+        dir_size.value = mem_module_size
+        self.size = dir_size
+
+class DirCntrl(Directory_Controller, CntrlBase):
+    """Directory controller owning one DirMem slice and one L3 slice.
+    CPUonly=True marks this as the CPU-side directory (no GPU TCC
+    directory attached in this protocol)."""
+    def create(self, options, ruby_system, system):
+        self.version = self.versionCount()
+
+        # Fixed directory response latency (cycles).
+        self.response_latency = 30
+
+        self.directory = DirMem()
+        self.directory.create(options, ruby_system, system)
+
+        # The L3 data array lives with the directory in this protocol.
+        self.L3CacheMemory = L3Cache()
+        self.L3CacheMemory.create(options, ruby_system, system)
+
+        # Hit latency bounded by the slower of tag and data arrays.
+        self.l3_hit_latency = max(self.L3CacheMemory.dataAccessLatency,
+                                  self.L3CacheMemory.tagAccessLatency)
+
+        self.number_of_TBEs = options.num_tbes
+
+        self.ruby_system = ruby_system
+
+        if options.recycle_latency:
+            self.recycle_latency = options.recycle_latency
+
+        self.CPUonly = True
+
+    def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir,
+                           req_to_l3, probe_to_l3, resp_to_l3):
+        # Directory side of the dir<->L3 wire buffers; mirrors
+        # L3Cntrl.connectWireBuffers above.
+        self.reqToDir = req_to_dir
+        self.respToDir = resp_to_dir
+        self.l3UnblockToDir = l3_unblock_to_dir
+        self.reqToL3 = req_to_l3
+        self.probeToL3 = probe_to_l3
+        self.respToL3 = resp_to_l3
+
+def define_options(parser):
+    # Register MOESI_AMD_Base-specific optparse options.
+    parser.add_option("--num-subcaches", type="int", default=4)
+    parser.add_option("--l3-data-latency", type="int", default=20)
+    parser.add_option("--l3-tag-latency", type="int", default=15)
+    parser.add_option("--cpu-to-dir-latency", type="int", default=15)
+    # Despite the name, this is store_false with default True: passing
+    # --no-resource-stalls sets options.no_resource_stalls to False,
+    # which L3Cache.create assigns directly to resourceStalls.
+    parser.add_option("--no-resource-stalls", action="store_false",
+                      default=True)
+    parser.add_option("--num-tbes", type="int", default=256)
+    parser.add_option("--l2-latency", type="int", default=50) # load to use
+
+def create_system(options, full_system, system, dma_devices, ruby_system):
+    """Build the MOESI_AMD_Base CPU-only Ruby system.
+
+    Creates num_dirs directory controllers (each with an L3 slice) in a
+    main cluster and (num_cpus+1)/2 core-pair controllers in a CPU
+    sub-cluster, wires every controller's message buffers to the Ruby
+    network, and returns (cpu_sequencers, dir_cntrl_nodes, mainCluster).
+    DMA devices are not supported (asserted empty below).
+    """
+    if buildEnv['PROTOCOL'] != 'MOESI_AMD_Base':
+        panic("This script requires the MOESI_AMD_Base protocol.")
+
+    cpu_sequencers = []
+
+    #
+    # The ruby network creation expects the list of nodes in the system to
+    # be consistent with the NetDest list. Therefore the l1 controller
+    # nodes must be listed before the directory nodes and directory nodes
+    # before dma nodes, etc.
+    #
+    l1_cntrl_nodes = []
+    l3_cntrl_nodes = []
+    dir_cntrl_nodes = []
+
+    control_count = 0
+
+    #
+    # Must create the individual controllers before the network to ensure
+    # the controller constructors are called before the network constructor
+    #
+
+    # This is the base crossbar that connects the L3s, Dirs, and cpu
+    # Cluster
+    mainCluster = Cluster(extBW = 512, intBW = 512) # 1 TB/s
+    for i in xrange(options.num_dirs):
+
+        dir_cntrl = DirCntrl(TCC_select_num_bits = 0)
+        dir_cntrl.create(options, ruby_system, system)
+
+        # Connect the Directory controller to the ruby network.
+        # Buffers the directory *receives* attach their slave end to the
+        # network's master port; buffers it *sends* attach master->slave.
+        dir_cntrl.requestFromCores = MessageBuffer(ordered = True)
+        dir_cntrl.requestFromCores.slave = ruby_system.network.master
+
+        dir_cntrl.responseFromCores = MessageBuffer()
+        dir_cntrl.responseFromCores.slave = ruby_system.network.master
+
+        dir_cntrl.unblockFromCores = MessageBuffer()
+        dir_cntrl.unblockFromCores.slave = ruby_system.network.master
+
+        dir_cntrl.probeToCore = MessageBuffer()
+        dir_cntrl.probeToCore.master = ruby_system.network.slave
+
+        dir_cntrl.responseToCore = MessageBuffer()
+        dir_cntrl.responseToCore.master = ruby_system.network.slave
+
+        # Internal (non-network) queues.
+        dir_cntrl.triggerQueue = MessageBuffer(ordered = True)
+        dir_cntrl.L3triggerQueue = MessageBuffer(ordered = True)
+        dir_cntrl.responseFromMemory = MessageBuffer()
+
+        # Expose as system.dir_cntrl<i> so m5 picks up the SimObject.
+        exec("system.dir_cntrl%d = dir_cntrl" % i)
+        dir_cntrl_nodes.append(dir_cntrl)
+
+        mainCluster.add(dir_cntrl)
+
+    # Technically this config can support an odd number of cpus, but the top
+    # level config files, such as the ruby_random_tester, will get confused if
+    # the number of cpus does not equal the number of sequencers. Thus make
+    # sure that an even number of cpus is specified.
+    assert((options.num_cpus % 2) == 0)
+
+    # For an odd number of CPUs, still create the right number of controllers
+    # NOTE(review): the (num_cpus + 1) / 2 rounding is dead generality
+    # given the even-count assert above -- kept for consistency with the
+    # sibling GPU protocol configs.
+    cpuCluster = Cluster(extBW = 512, intBW = 512) # 1 TB/s
+    for i in xrange((options.num_cpus + 1) / 2):
+
+        cp_cntrl = CPCntrl()
+        cp_cntrl.create(options, ruby_system, system)
+
+        exec("system.cp_cntrl%d = cp_cntrl" % i)
+        #
+        # Add controllers and sequencers to the appropriate lists
+        #
+        cpu_sequencers.extend([cp_cntrl.sequencer, cp_cntrl.sequencer1])
+
+        # Connect the CP controllers and the network
+        cp_cntrl.requestFromCore = MessageBuffer()
+        cp_cntrl.requestFromCore.master = ruby_system.network.slave
+
+        cp_cntrl.responseFromCore = MessageBuffer()
+        cp_cntrl.responseFromCore.master = ruby_system.network.slave
+
+        cp_cntrl.unblockFromCore = MessageBuffer()
+        cp_cntrl.unblockFromCore.master = ruby_system.network.slave
+
+        cp_cntrl.probeToCore = MessageBuffer()
+        cp_cntrl.probeToCore.slave = ruby_system.network.master
+
+        cp_cntrl.responseToCore = MessageBuffer()
+        cp_cntrl.responseToCore.slave = ruby_system.network.master
+
+        cp_cntrl.mandatoryQueue = MessageBuffer()
+        cp_cntrl.triggerQueue = MessageBuffer(ordered = True)
+
+        cpuCluster.add(cp_cntrl)
+
+    # Assuming no DMA devices
+    assert(len(dma_devices) == 0)
+
+    # Add cpu/gpu clusters to main cluster
+    mainCluster.add(cpuCluster)
+
+    ruby_system.network.number_of_virtual_networks = 10
+
+    return (cpu_sequencers, dir_cntrl_nodes, mainCluster)
diff --git a/src/SConscript b/src/SConscript
index 322212cb7..2bac0bff3 100755
--- a/src/SConscript
+++ b/src/SConscript
@@ -78,7 +78,7 @@ class SourceMeta(type):
def __init__(cls, name, bases, dict):
super(SourceMeta, cls).__init__(name, bases, dict)
cls.all = []
-
+
def get(cls, **guards):
'''Find all files that match the specified guards. If a source
file does not specify a flag, the default is False'''
@@ -367,9 +367,9 @@ def makeTheISA(source, target, env):
target_isa = env['TARGET_ISA']
def define(isa):
return isa.upper() + '_ISA'
-
+
def namespace(isa):
- return isa[0].upper() + isa[1:].lower() + 'ISA'
+ return isa[0].upper() + isa[1:].lower() + 'ISA'
code = code_formatter()
@@ -407,6 +407,51 @@ def makeTheISA(source, target, env):
env.Command('config/the_isa.hh', map(Value, all_isa_list),
MakeAction(makeTheISA, Transform("CFG ISA", 0)))
+def makeTheGPUISA(source, target, env):
+    # Generate config/the_gpu_isa.hh: GPU analogue of makeTheISA above.
+    # Emits one numeric #define per enabled GPU ISA, a GPUArch enum
+    # reusing the namespace names, and THE_GPU_ISA / TheGpuISA /
+    # THE_GPU_ISA_STR macros selecting the build's TARGET_GPU_ISA.
+    isas = [ src.get_contents() for src in source ]
+    target_gpu_isa = env['TARGET_GPU_ISA']
+    def define(isa):
+        return isa.upper() + '_ISA'
+
+    def namespace(isa):
+        return isa[0].upper() + isa[1:].lower() + 'ISA'
+
+
+    code = code_formatter()
+    code('''\
+#ifndef __CONFIG_THE_GPU_ISA_HH__
+#define __CONFIG_THE_GPU_ISA_HH__
+
+''')
+
+    # create defines for the preprocessing and compile-time determination
+    for i,isa in enumerate(isas):
+        code('#define $0 $1', define(isa), i + 1)
+    code()
+
+    # create an enum for any run-time determination of the ISA, we
+    # reuse the same name as the namespaces
+    code('enum class GPUArch {')
+    for i,isa in enumerate(isas):
+        # Last enumerator gets no trailing comma.
+        if i + 1 == len(isas):
+            code('  $0 = $1', namespace(isa), define(isa))
+        else:
+            code('  $0 = $1,', namespace(isa), define(isa))
+    code('};')
+
+    code('''
+
+#define THE_GPU_ISA ${{define(target_gpu_isa)}}
+#define TheGpuISA ${{namespace(target_gpu_isa)}}
+#define THE_GPU_ISA_STR "${{target_gpu_isa}}"
+
+#endif // __CONFIG_THE_GPU_ISA_HH__''')
+
+    code.write(str(target[0]))
+
+env.Command('config/the_gpu_isa.hh', map(Value, all_gpu_isa_list),
+            MakeAction(makeTheGPUISA, Transform("CFG ISA", 0)))
+
########################################################################
#
# Prevent any SimObjects from being added after this point, they
@@ -784,7 +829,7 @@ extern "C" {
EmbeddedSwig embed_swig_${module}(init_${module});
''')
code.write(str(target[0]))
-
+
# Build all swig modules
for swig in SwigSource.all:
env.Command([swig.cc_source.tnode, swig.py_source.tnode], swig.tnode,
@@ -959,7 +1004,7 @@ const uint8_t data_${sym}[] = {
x = array.array('B', data[i:i+step])
code(''.join('%d,' % d for d in x))
code.dedent()
-
+
code('''};
EmbeddedPython embedded_${sym}(
diff --git a/src/arch/SConscript b/src/arch/SConscript
index e0d6845f5..b022cb01f 100644
--- a/src/arch/SConscript
+++ b/src/arch/SConscript
@@ -68,6 +68,14 @@ isa_switch_hdrs = Split('''
# Set up this directory to support switching headers
make_switching_dir('arch', isa_switch_hdrs, env)
+if env['BUILD_GPU']:
+    # GPU builds get switching headers for the GPU ISA, mirroring the
+    # CPU isa_switch_hdrs / make_switching_dir call above.
+    gpu_isa_switch_hdrs = Split('''
+        gpu_decoder.hh
+        gpu_types.hh
+        ''')
+
+    make_gpu_switching_dir('arch', gpu_isa_switch_hdrs, env)
+
#################################################################
#
# Include architecture-specific files.
diff --git a/src/arch/hsail/Brig.h b/src/arch/hsail/Brig.h
new file mode 100644
index 000000000..b260157ab
--- /dev/null
+++ b/src/arch/hsail/Brig.h
@@ -0,0 +1,67 @@
+// University of Illinois/NCSA
+// Open Source License
+//
+// Copyright (c) 2013, Advanced Micro Devices, Inc.
+// All rights reserved.
+//
+// Developed by:
+//
+// HSA Team
+//
+// Advanced Micro Devices, Inc
+//
+// www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of
+// this software and associated documentation files (the "Software"), to deal with
+// the Software without restriction, including without limitation the rights to
+// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+// of the Software, and to permit persons to whom the Software is furnished to do
+// so, subject to the following conditions:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimers.
+//
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimers in the
+// documentation and/or other materials provided with the distribution.
+//
+// * Neither the names of the LLVM Team, University of Illinois at
+// Urbana-Champaign, nor the names of its contributors may be used to
+// endorse or promote products derived from this Software without specific
+// prior written permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+// FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+// SOFTWARE.
+#ifndef INTERNAL_BRIG_H
+#define INTERNAL_BRIG_H
+
+#include <stdint.h>
+
+// Wrap the auto-annotated HSA BRIG definitions (Brig_new.hpp) inside
+// namespace Brig, and add a few compatibility typedefs and an
+// internal-only enum on top.
+namespace Brig {
+#include "Brig_new.hpp"
+
+// These typedefs provide some backward compatibility with earlier versions
+// of Brig.h, reducing the number of code changes. The distinct names also
+// increase legibility by showing the code's intent.
+typedef BrigBase BrigDirective;
+typedef BrigBase BrigOperand;
+
+enum BrigMemoryFenceSegments { // for internal use only
+    //.mnemo={ s/^BRIG_MEMORY_FENCE_SEGMENT_//;lc }
+    //.mnemo_token=_EMMemoryFenceSegments
+    //.mnemo_context=EInstModifierInstFenceContext
+    BRIG_MEMORY_FENCE_SEGMENT_GLOBAL = 0,
+    BRIG_MEMORY_FENCE_SEGMENT_GROUP = 1,
+    BRIG_MEMORY_FENCE_SEGMENT_IMAGE = 2,
+    BRIG_MEMORY_FENCE_SEGMENT_LAST = 3 //.skip
+};
+
+}
+
+#endif // defined(INTERNAL_BRIG_H)
diff --git a/src/arch/hsail/Brig_new.hpp b/src/arch/hsail/Brig_new.hpp
new file mode 100644
index 000000000..60e6f4dea
--- /dev/null
+++ b/src/arch/hsail/Brig_new.hpp
@@ -0,0 +1,1587 @@
+// University of Illinois/NCSA
+// Open Source License
+//
+// Copyright (c) 2013-2015, Advanced Micro Devices, Inc.
+// All rights reserved.
+//
+// Developed by:
+//
+// HSA Team
+//
+// Advanced Micro Devices, Inc
+//
+// www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of
+// this software and associated documentation files (the "Software"), to deal with
+// the Software without restriction, including without limitation the rights to
+// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+// of the Software, and to permit persons to whom the Software is furnished to do
+// so, subject to the following conditions:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimers.
+//
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimers in the
+// documentation and/or other materials provided with the distribution.
+//
+// * Neither the names of the LLVM Team, University of Illinois at
+// Urbana-Champaign, nor the names of its contributors may be used to
+// endorse or promote products derived from this Software without specific
+// prior written permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+// FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+// SOFTWARE.
+
+//.ignore{
+
+#ifndef INCLUDED_BRIG_H
+#define INCLUDED_BRIG_H
+
+#include <stdint.h>
+
+enum BrigAuxDefs {
+    MAX_OPERANDS_NUM = 6
+};
+
+//}
+
+typedef uint32_t BrigVersion32_t;
+
+// Version stamps for the HSAIL language and BRIG container format
+// (both 1.0 in this header).
+enum BrigVersion {
+
+    //.nowrap
+    //.nodump
+    //.nollvm
+
+    BRIG_VERSION_HSAIL_MAJOR = 1,
+    BRIG_VERSION_HSAIL_MINOR = 0,
+    BRIG_VERSION_BRIG_MAJOR = 1,
+    BRIG_VERSION_BRIG_MINOR = 0
+};
+
+// Fixed-width scalar typedefs for BRIG record fields. The trailing
+// "//." comments appear to be machine-read annotations for the BRIG
+// code generator (default values, wrapper types) -- do not edit them.
+typedef uint8_t BrigAlignment8_t; //.defValue=BRIG_ALIGNMENT_NONE
+
+typedef uint8_t BrigAllocation8_t; //.defValue=BRIG_ALLOCATION_NONE
+
+typedef uint8_t BrigAluModifier8_t;
+
+typedef uint8_t BrigAtomicOperation8_t;
+
+typedef uint32_t BrigCodeOffset32_t; //.defValue=0 //.wtype=ItemRef<Code>
+
+typedef uint8_t BrigCompareOperation8_t;
+
+typedef uint16_t BrigControlDirective16_t;
+
+typedef uint32_t BrigDataOffset32_t;
+
+typedef BrigDataOffset32_t BrigDataOffsetCodeList32_t; //.wtype=ListRef<Code> //.defValue=0
+
+typedef BrigDataOffset32_t BrigDataOffsetOperandList32_t; //.wtype=ListRef<Operand> //.defValue=0
+
+typedef BrigDataOffset32_t BrigDataOffsetString32_t; //.wtype=StrRef //.defValue=0
+
+typedef uint8_t BrigExecutableModifier8_t;
+
+typedef uint8_t BrigImageChannelOrder8_t; //.defValue=BRIG_CHANNEL_ORDER_UNKNOWN
+
+typedef uint8_t BrigImageChannelType8_t; //.defValue=BRIG_CHANNEL_TYPE_UNKNOWN
+
+typedef uint8_t BrigImageGeometry8_t; //.defValue=BRIG_GEOMETRY_UNKNOWN
+
+typedef uint8_t BrigImageQuery8_t;
+
+typedef uint16_t BrigKind16_t;
+
+typedef uint8_t BrigLinkage8_t; //.defValue=BRIG_LINKAGE_NONE
+
+typedef uint8_t BrigMachineModel8_t; //.defValue=BRIG_MACHINE_LARGE
+
+typedef uint8_t BrigMemoryModifier8_t;
+
+typedef uint8_t BrigMemoryOrder8_t; //.defValue=BRIG_MEMORY_ORDER_RELAXED
+
+typedef uint8_t BrigMemoryScope8_t; //.defValue=BRIG_MEMORY_SCOPE_SYSTEM
+
+typedef uint16_t BrigOpcode16_t;
+
+typedef uint32_t BrigOperandOffset32_t; //.defValue=0 //.wtype=ItemRef<Operand>
+
+typedef uint8_t BrigPack8_t; //.defValue=BRIG_PACK_NONE
+
+typedef uint8_t BrigProfile8_t; //.defValue=BRIG_PROFILE_FULL
+
+typedef uint16_t BrigRegisterKind16_t;
+
+typedef uint8_t BrigRound8_t; //.defValue=BRIG_ROUND_NONE
+
+typedef uint8_t BrigSamplerAddressing8_t; //.defValue=BRIG_ADDRESSING_CLAMP_TO_EDGE
+
+typedef uint8_t BrigSamplerCoordNormalization8_t;
+
+typedef uint8_t BrigSamplerFilter8_t;
+
+typedef uint8_t BrigSamplerQuery8_t;
+
+typedef uint32_t BrigSectionIndex32_t;
+
+typedef uint8_t BrigSegCvtModifier8_t;
+
+typedef uint8_t BrigSegment8_t; //.defValue=BRIG_SEGMENT_NONE
+
+typedef uint32_t BrigStringOffset32_t; //.defValue=0 //.wtype=StrRef
+
+typedef uint16_t BrigType16_t;
+
+typedef uint8_t BrigVariableModifier8_t;
+
+typedef uint8_t BrigWidth8_t;
+
+typedef uint32_t BrigExceptions32_t;
+
+// Record-kind tags for every BRIG entry. The high nibble partitions the
+// space: 0x1xxx directives, 0x2xxx instructions, 0x3xxx operands.
+enum BrigKind {
+
+    //.nollvm
+    //
+    //.wname={ s/^BRIG_KIND//; MACRO2Name($_) }
+    //.mnemo=$wname{ $wname }
+    //
+    //.sizeof=$wname{ "sizeof(".$structs->{"Brig".$wname}->{rawbrig}.")" }
+    //.sizeof_switch //.sizeof_proto="int size_of_brig_record(unsigned arg)" //.sizeof_default="return -1"
+    //
+    //.isBodyOnly={ "false" }
+    //.isBodyOnly_switch //.isBodyOnly_proto="bool isBodyOnly(Directive d)" //.isBodyOnly_arg="d.kind()"
+    //.isBodyOnly_default="assert(false); return false"
+    //
+    //.isToplevelOnly={ "false" }
+    //.isToplevelOnly_switch //.isToplevelOnly_proto="bool isToplevelOnly(Directive d)" //.isToplevelOnly_arg="d.kind()"
+    //.isToplevelOnly_default="assert(false); return false"
+
+    BRIG_KIND_NONE = 0x0000, //.skip
+
+    BRIG_KIND_DIRECTIVE_BEGIN = 0x1000, //.skip
+    BRIG_KIND_DIRECTIVE_ARG_BLOCK_END = 0x1000, //.isBodyOnly=true
+    BRIG_KIND_DIRECTIVE_ARG_BLOCK_START = 0x1001, //.isBodyOnly=true
+    BRIG_KIND_DIRECTIVE_COMMENT = 0x1002,
+    BRIG_KIND_DIRECTIVE_CONTROL = 0x1003, //.isBodyOnly=true
+    BRIG_KIND_DIRECTIVE_EXTENSION = 0x1004, //.isToplevelOnly=true
+    BRIG_KIND_DIRECTIVE_FBARRIER = 0x1005,
+    BRIG_KIND_DIRECTIVE_FUNCTION = 0x1006, //.isToplevelOnly=true
+    BRIG_KIND_DIRECTIVE_INDIRECT_FUNCTION = 0x1007, //.isToplevelOnly=true
+    BRIG_KIND_DIRECTIVE_KERNEL = 0x1008, //.isToplevelOnly=true
+    BRIG_KIND_DIRECTIVE_LABEL = 0x1009, //.isBodyOnly=true
+    BRIG_KIND_DIRECTIVE_LOC = 0x100a,
+    BRIG_KIND_DIRECTIVE_MODULE = 0x100b, //.isToplevelOnly=true
+    BRIG_KIND_DIRECTIVE_PRAGMA = 0x100c,
+    BRIG_KIND_DIRECTIVE_SIGNATURE = 0x100d, //.isToplevelOnly=true
+    BRIG_KIND_DIRECTIVE_VARIABLE = 0x100e,
+    BRIG_KIND_DIRECTIVE_END = 0x100f, //.skip
+
+    BRIG_KIND_INST_BEGIN = 0x2000, //.skip
+    BRIG_KIND_INST_ADDR = 0x2000,
+    BRIG_KIND_INST_ATOMIC = 0x2001,
+    BRIG_KIND_INST_BASIC = 0x2002,
+    BRIG_KIND_INST_BR = 0x2003,
+    BRIG_KIND_INST_CMP = 0x2004,
+    BRIG_KIND_INST_CVT = 0x2005,
+    BRIG_KIND_INST_IMAGE = 0x2006,
+    BRIG_KIND_INST_LANE = 0x2007,
+    BRIG_KIND_INST_MEM = 0x2008,
+    BRIG_KIND_INST_MEM_FENCE = 0x2009,
+    BRIG_KIND_INST_MOD = 0x200a,
+    BRIG_KIND_INST_QUERY_IMAGE = 0x200b,
+    BRIG_KIND_INST_QUERY_SAMPLER = 0x200c,
+    BRIG_KIND_INST_QUEUE = 0x200d,
+    BRIG_KIND_INST_SEG = 0x200e,
+    BRIG_KIND_INST_SEG_CVT = 0x200f,
+    BRIG_KIND_INST_SIGNAL = 0x2010,
+    BRIG_KIND_INST_SOURCE_TYPE = 0x2011,
+    BRIG_KIND_INST_END = 0x2012, //.skip
+
+    BRIG_KIND_OPERAND_BEGIN = 0x3000, //.skip
+    BRIG_KIND_OPERAND_ADDRESS = 0x3000,
+    BRIG_KIND_OPERAND_ALIGN = 0x3001,
+    BRIG_KIND_OPERAND_CODE_LIST = 0x3002,
+    BRIG_KIND_OPERAND_CODE_REF = 0x3003,
+    BRIG_KIND_OPERAND_CONSTANT_BYTES = 0x3004,
+    BRIG_KIND_OPERAND_RESERVED = 0x3005, //.skip
+    BRIG_KIND_OPERAND_CONSTANT_IMAGE = 0x3006,
+    BRIG_KIND_OPERAND_CONSTANT_OPERAND_LIST = 0x3007,
+    BRIG_KIND_OPERAND_CONSTANT_SAMPLER = 0x3008,
+    BRIG_KIND_OPERAND_OPERAND_LIST = 0x3009,
+    BRIG_KIND_OPERAND_REGISTER = 0x300a,
+    BRIG_KIND_OPERAND_STRING = 0x300b,
+    BRIG_KIND_OPERAND_WAVESIZE = 0x300c,
+    BRIG_KIND_OPERAND_END = 0x300d //.skip
+};
+
+// Alignment is encoded as log2(bytes) + 1 (e.g. BRIG_ALIGNMENT_4 == 3).
+enum BrigAlignment {
+
+    //.mnemo={ s/^BRIG_ALIGNMENT_//; lc }
+    //.mnemo_proto="const char* align2str(unsigned arg)"
+    //
+    //.bytes={ /(\d+)/ ? $1 : undef }
+    //.bytes_switch //.bytes_proto="unsigned align2num(unsigned arg)" //.bytes_default="assert(false); return -1"
+    //
+    //.rbytes=$bytes{ $bytes }
+    //.rbytes_switch //.rbytes_reverse //.rbytes_proto="BrigAlignment num2align(uint64_t arg)"
+    //.rbytes_default="return BRIG_ALIGNMENT_LAST"
+    //
+    //.print=$bytes{ $bytes>1 ? "_align($bytes)" : "" }
+
+    BRIG_ALIGNMENT_NONE = 0, //.no_mnemo
+    BRIG_ALIGNMENT_1 = 1, //.mnemo=""
+    BRIG_ALIGNMENT_2 = 2,
+    BRIG_ALIGNMENT_4 = 3,
+    BRIG_ALIGNMENT_8 = 4,
+    BRIG_ALIGNMENT_16 = 5,
+    BRIG_ALIGNMENT_32 = 6,
+    BRIG_ALIGNMENT_64 = 7,
+    BRIG_ALIGNMENT_128 = 8,
+    BRIG_ALIGNMENT_256 = 9,
+
+    BRIG_ALIGNMENT_LAST, //.skip
+    BRIG_ALIGNMENT_MAX = BRIG_ALIGNMENT_LAST - 1 //.skip
+};
+
+// Storage-allocation class for variables (none/program/agent/automatic).
+enum BrigAllocation {
+
+    //.mnemo={ s/^BRIG_ALLOCATION_//;lc }
+    //.mnemo_token=EAllocKind
+
+    BRIG_ALLOCATION_NONE = 0, //.mnemo=""
+    BRIG_ALLOCATION_PROGRAM = 1,
+    BRIG_ALLOCATION_AGENT = 2,
+    BRIG_ALLOCATION_AUTOMATIC = 3
+};
+
+// Bit mask for BrigAluModifier8_t (flush-to-zero).
+enum BrigAluModifierMask {
+    BRIG_ALU_FTZ = 1
+};
+
+// Atomic opcodes for atomic/signal instructions.
+enum BrigAtomicOperation {
+
+    //.tdcaption="Atomic Operations"
+    //
+    //.mnemo={ s/^BRIG_ATOMIC_//;lc }
+    //.mnemo_token=_EMAtomicOp
+    //.mnemo_context=EInstModifierInstAtomicContext
+    //
+    //.print=$mnemo{ "_$mnemo" }
+
+    BRIG_ATOMIC_ADD = 0,
+    BRIG_ATOMIC_AND = 1,
+    BRIG_ATOMIC_CAS = 2,
+    BRIG_ATOMIC_EXCH = 3,
+    BRIG_ATOMIC_LD = 4,
+    BRIG_ATOMIC_MAX = 5,
+    BRIG_ATOMIC_MIN = 6,
+    BRIG_ATOMIC_OR = 7,
+    BRIG_ATOMIC_ST = 8,
+    BRIG_ATOMIC_SUB = 9,
+    BRIG_ATOMIC_WRAPDEC = 10,
+    BRIG_ATOMIC_WRAPINC = 11,
+    BRIG_ATOMIC_XOR = 12,
+    BRIG_ATOMIC_WAIT_EQ = 13,
+    BRIG_ATOMIC_WAIT_NE = 14,
+    BRIG_ATOMIC_WAIT_LT = 15,
+    BRIG_ATOMIC_WAIT_GTE = 16,
+    BRIG_ATOMIC_WAITTIMEOUT_EQ = 17,
+    BRIG_ATOMIC_WAITTIMEOUT_NE = 18,
+    BRIG_ATOMIC_WAITTIMEOUT_LT = 19,
+    BRIG_ATOMIC_WAITTIMEOUT_GTE = 20
+};
+
+// Comparison operators for cmp instructions; the S-prefixed variants
+// are the signalling forms, the U-suffixed forms are unordered.
+enum BrigCompareOperation {
+
+    //.tdcaption="Comparison Operators"
+    //
+    //.mnemo={ s/^BRIG_COMPARE_//;lc }
+    //.mnemo_token=_EMCompare
+    //
+    //.print=$mnemo{ "_$mnemo" }
+
+    BRIG_COMPARE_EQ = 0,
+    BRIG_COMPARE_NE = 1,
+    BRIG_COMPARE_LT = 2,
+    BRIG_COMPARE_LE = 3,
+    BRIG_COMPARE_GT = 4,
+    BRIG_COMPARE_GE = 5,
+    BRIG_COMPARE_EQU = 6,
+    BRIG_COMPARE_NEU = 7,
+    BRIG_COMPARE_LTU = 8,
+    BRIG_COMPARE_LEU = 9,
+    BRIG_COMPARE_GTU = 10,
+    BRIG_COMPARE_GEU = 11,
+    BRIG_COMPARE_NUM = 12,
+    BRIG_COMPARE_NAN = 13,
+    BRIG_COMPARE_SEQ = 14,
+    BRIG_COMPARE_SNE = 15,
+    BRIG_COMPARE_SLT = 16,
+    BRIG_COMPARE_SLE = 17,
+    BRIG_COMPARE_SGT = 18,
+    BRIG_COMPARE_SGE = 19,
+    BRIG_COMPARE_SGEU = 20,
+    BRIG_COMPARE_SEQU = 21,
+    BRIG_COMPARE_SNEU = 22,
+    BRIG_COMPARE_SLTU = 23,
+    BRIG_COMPARE_SLEU = 24,
+    BRIG_COMPARE_SNUM = 25,
+    BRIG_COMPARE_SNAN = 26,
+    BRIG_COMPARE_SGTU = 27
+};
+
+// Kinds of HSAIL control directives.
+enum BrigControlDirective {
+
+    //.mnemo={ s/^BRIG_CONTROL_//;lc }
+    //.mnemo_token=EControl
+    //
+    //.print=$mnemo{ $mnemo }
+
+    BRIG_CONTROL_NONE = 0, //.skip
+    BRIG_CONTROL_ENABLEBREAKEXCEPTIONS = 1,
+    BRIG_CONTROL_ENABLEDETECTEXCEPTIONS = 2,
+    BRIG_CONTROL_MAXDYNAMICGROUPSIZE = 3,
+    BRIG_CONTROL_MAXFLATGRIDSIZE = 4,
+    BRIG_CONTROL_MAXFLATWORKGROUPSIZE = 5,
+    BRIG_CONTROL_REQUIREDDIM = 6,
+    BRIG_CONTROL_REQUIREDGRIDSIZE = 7,
+    BRIG_CONTROL_REQUIREDWORKGROUPSIZE = 8,
+    BRIG_CONTROL_REQUIRENOPARTIALWORKGROUPS = 9
+};
+
+// Bit mask for BrigExecutableModifier8_t.
+enum BrigExecutableModifierMask {
+    //.nodump
+    BRIG_EXECUTABLE_DEFINITION = 1
+};
+
+// Image channel orderings; values >= 128 are reserved for users.
+enum BrigImageChannelOrder {
+
+    //.mnemo={ s/^BRIG_CHANNEL_ORDER_?//;lc }
+    //.mnemo_token=EImageOrder
+    //.mnemo_context=EImageOrderContext
+    //
+    //.print=$mnemo{ $mnemo }
+
+    BRIG_CHANNEL_ORDER_A = 0,
+    BRIG_CHANNEL_ORDER_R = 1,
+    BRIG_CHANNEL_ORDER_RX = 2,
+    BRIG_CHANNEL_ORDER_RG = 3,
+    BRIG_CHANNEL_ORDER_RGX = 4,
+    BRIG_CHANNEL_ORDER_RA = 5,
+    BRIG_CHANNEL_ORDER_RGB = 6,
+    BRIG_CHANNEL_ORDER_RGBX = 7,
+    BRIG_CHANNEL_ORDER_RGBA = 8,
+    BRIG_CHANNEL_ORDER_BGRA = 9,
+    BRIG_CHANNEL_ORDER_ARGB = 10,
+    BRIG_CHANNEL_ORDER_ABGR = 11,
+    BRIG_CHANNEL_ORDER_SRGB = 12,
+    BRIG_CHANNEL_ORDER_SRGBX = 13,
+    BRIG_CHANNEL_ORDER_SRGBA = 14,
+    BRIG_CHANNEL_ORDER_SBGRA = 15,
+    BRIG_CHANNEL_ORDER_INTENSITY = 16,
+    BRIG_CHANNEL_ORDER_LUMINANCE = 17,
+    BRIG_CHANNEL_ORDER_DEPTH = 18,
+    BRIG_CHANNEL_ORDER_DEPTH_STENCIL = 19,
+
+    // used internally
+    BRIG_CHANNEL_ORDER_UNKNOWN, //.mnemo="" // used when no order is specified
+
+    BRIG_CHANNEL_ORDER_FIRST_USER_DEFINED = 128 //.skip
+
+};
+
+// Per-channel storage formats for images; values >= 128 user-defined.
+enum BrigImageChannelType {
+
+    //.mnemo={ s/^BRIG_CHANNEL_TYPE_//;lc }
+    //.mnemo_token=EImageFormat
+    //
+    //.print=$mnemo{ $mnemo }
+
+    BRIG_CHANNEL_TYPE_SNORM_INT8 = 0,
+    BRIG_CHANNEL_TYPE_SNORM_INT16 = 1,
+    BRIG_CHANNEL_TYPE_UNORM_INT8 = 2,
+    BRIG_CHANNEL_TYPE_UNORM_INT16 = 3,
+    BRIG_CHANNEL_TYPE_UNORM_INT24 = 4,
+    BRIG_CHANNEL_TYPE_UNORM_SHORT_555 = 5,
+    BRIG_CHANNEL_TYPE_UNORM_SHORT_565 = 6,
+    BRIG_CHANNEL_TYPE_UNORM_INT_101010 = 7,
+    BRIG_CHANNEL_TYPE_SIGNED_INT8 = 8,
+    BRIG_CHANNEL_TYPE_SIGNED_INT16 = 9,
+    BRIG_CHANNEL_TYPE_SIGNED_INT32 = 10,
+    BRIG_CHANNEL_TYPE_UNSIGNED_INT8 = 11,
+    BRIG_CHANNEL_TYPE_UNSIGNED_INT16 = 12,
+    BRIG_CHANNEL_TYPE_UNSIGNED_INT32 = 13,
+    BRIG_CHANNEL_TYPE_HALF_FLOAT = 14,
+    BRIG_CHANNEL_TYPE_FLOAT = 15,
+
+    // used internally
+    BRIG_CHANNEL_TYPE_UNKNOWN, //.mnemo=""
+
+    BRIG_CHANNEL_TYPE_FIRST_USER_DEFINED = 128 //.skip
+};
+
+// Image dimensionality (A = array, B = buffer, DEPTH = depth image).
+enum BrigImageGeometry {
+
+    //.tdcaption="Geometry"
+    //
+    //.mnemo={ s/^BRIG_GEOMETRY_//;lc }
+    //.mnemo_token=EImageGeometry
+    //
+    //.dim={/_([0-9]+D)(A)?/ ? $1+(defined $2?1:0) : undef}
+    //.dim_switch //.dim_proto="unsigned getBrigGeometryDim(unsigned geo)" //.dim_arg="geo"
+    //.dim_default="assert(0); return 0"
+    //
+    //.depth={/DEPTH$/?"true":"false"}
+    //.depth_switch //.depth_proto="bool isBrigGeometryDepth(unsigned geo)" //.depth_arg="geo"
+    //.depth_default="return false"
+
+    BRIG_GEOMETRY_1D = 0,
+    BRIG_GEOMETRY_2D = 1,
+    BRIG_GEOMETRY_3D = 2,
+    BRIG_GEOMETRY_1DA = 3,
+    BRIG_GEOMETRY_2DA = 4,
+    BRIG_GEOMETRY_1DB = 5,
+    BRIG_GEOMETRY_2DDEPTH = 6,
+    BRIG_GEOMETRY_2DADEPTH = 7,
+
+    // used internally
+    BRIG_GEOMETRY_UNKNOWN, //.mnemo=""
+
+    BRIG_GEOMETRY_FIRST_USER_DEFINED = 128 //.skip
+};
+
+// Properties queryable by queryimage instructions.
+enum BrigImageQuery {
+
+    //.mnemo={ s/^BRIG_IMAGE_QUERY_//;lc }
+    //
+    //.print=$mnemo{ $mnemo }
+
+    BRIG_IMAGE_QUERY_WIDTH = 0,
+    BRIG_IMAGE_QUERY_HEIGHT = 1,
+    BRIG_IMAGE_QUERY_DEPTH = 2,
+    BRIG_IMAGE_QUERY_ARRAY = 3,
+    BRIG_IMAGE_QUERY_CHANNELORDER = 4,
+    BRIG_IMAGE_QUERY_CHANNELTYPE = 5,
+    BRIG_IMAGE_QUERY_NUMMIPLEVELS = 6
+};
+
+enum BrigLinkage {
+
+ //.mnemo={ s/^BRIG_LINKAGE_//;s/NONE//;lc }
+
+ BRIG_LINKAGE_NONE = 0,
+ BRIG_LINKAGE_PROGRAM = 1,
+ BRIG_LINKAGE_MODULE = 2,
+ BRIG_LINKAGE_FUNCTION = 3,
+ BRIG_LINKAGE_ARG = 4
+};
+
+enum BrigMachineModel {
+
+ //.mnemo={ s/^BRIG_MACHINE_//; '$'.lc }
+ //.mnemo_token=ETargetMachine
+ //
+ //.print=$mnemo{ $mnemo }
+
+ BRIG_MACHINE_SMALL = 0,
+ BRIG_MACHINE_LARGE = 1,
+
+ BRIG_MACHINE_UNDEF = 2 //.skip
+};
+
+enum BrigMemoryModifierMask { //.tddef=0
+ BRIG_MEMORY_CONST = 1
+};
+
+// BrigMemoryOrder: memory-ordering modifier for memory/atomic/signal
+// instructions (relaxed, seq-cst acquire/release/acq-rel). Values are
+// fixed by the BRIG binary format; "//." lines are generator directives.
+enum BrigMemoryOrder {
+
+    //.mnemo={ s/^BRIG_MEMORY_ORDER_//; lc }
+    //.mnemo_token=_EMMemoryOrder
+    //
+    //.print=$mnemo{ "_$mnemo" }
+
+    BRIG_MEMORY_ORDER_NONE = 0, //.mnemo=""
+    BRIG_MEMORY_ORDER_RELAXED = 1, //.mnemo=rlx
+    BRIG_MEMORY_ORDER_SC_ACQUIRE = 2, //.mnemo=scacq
+    BRIG_MEMORY_ORDER_SC_RELEASE = 3, //.mnemo=screl
+    BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE = 4, //.mnemo=scar
+
+    BRIG_MEMORY_ORDER_LAST = 5 //.skip
+};
+
+// BrigMemoryScope: visibility scope for memory operations, from a single
+// work-item up to the whole system. Values are fixed by the BRIG binary
+// format; "//." lines are generator directives.
+enum BrigMemoryScope {
+
+    //.mnemo={ s/^BRIG_MEMORY_SCOPE_//; lc }
+    //.mnemo_token=_EMMemoryScope
+    //
+    //.print=$mnemo{ $mnemo }
+
+    BRIG_MEMORY_SCOPE_NONE = 0, //.mnemo=""
+    BRIG_MEMORY_SCOPE_WORKITEM = 1, //.mnemo=""
+    BRIG_MEMORY_SCOPE_WAVEFRONT = 2, //.mnemo=wave
+    BRIG_MEMORY_SCOPE_WORKGROUP = 3, //.mnemo=wg
+    BRIG_MEMORY_SCOPE_AGENT = 4, //.mnemo=agent
+    BRIG_MEMORY_SCOPE_SYSTEM = 5, //.mnemo=system
+
+    BRIG_MEMORY_SCOPE_LAST = 6 //.skip
+};
+
+// BrigOpcode: all HSAIL instruction opcodes. Standard opcodes occupy
+// 0..135; vendor extensions (GCN/AMD) start at bit 15 (1u << 15, i.e.
+// BRIG_OPCODE_FIRST_USER_DEFINED = 32768). Values are fixed by the BRIG
+// binary format — never renumber. The "//." comment lines (".k=",
+// ".mnemo=", ".numdst=", ".vecOpndIndex=", etc.) are machine-readable
+// attributes consumed by the HSAIL code generator to emit parsers,
+// visitors and predicates; keep their exact format.
+enum BrigOpcode {
+
+    //.tdcaption="Instruction Opcodes"
+    //
+    //.k={ "BASIC" }
+    //.pscode=$k{ MACRO2Name("_".$k) }
+    //.opcodeparser=$pscode{ return $pscode && "parseMnemo$pscode" }
+    //.opcodeparser_incfile=ParserUtilities
+    //.opcodeparser_switch //.opcodeparser_proto="OpcodeParser getOpcodeParser(BrigOpcode16_t arg)" //.opcodeparser_default="return parseMnemoBasic"
+    //
+    //.psopnd={undef}
+    //.opndparser=$psopnd{ return $psopnd && "&Parser::parse$psopnd" }
+    //.opndparser_incfile=ParserUtilities
+    //.opndparser_switch //.opndparser_proto="Parser::OperandParser Parser::getOperandParser(BrigOpcode16_t arg)" //.opndparser_default="return &Parser::parseOperands"
+    //
+    //.mnemo={ s/^BRIG_OPCODE_//; s/GCN([^_])/GCN_$1/; lc }
+    //.mnemo_scanner=Instructions //.mnemo_token=EInstruction
+    //.mnemo_context=EDefaultContext
+    //
+    //.has_memory_order={undef}
+    //.semsupport=$has_memory_order{ return $has_memory_order && "true" }
+    //
+    //.hasType=$k{ return ($k and $k eq "BASIC_NO_TYPE") ? "false" : undef; }
+    //.hasType_switch //.hasType_proto="bool instHasType(BrigOpcode16_t arg)" //.hasType_default="return true"
+    //
+    //.opcodevis=$pscode{ s/^BRIG_OPCODE_//; sprintf("%-47s(","vis.visitOpcode_".$_) . ($pscode =~m/^(BasicOrMod|Nop)$/? "inst" : "HSAIL_ASM::Inst". ($pscode=~m/BasicNoType/? "Basic":$pscode) ."(inst)").")" }
+    //.opcodevis_switch //.opcodevis_proto="template <typename RetType, typename Visitor> RetType visitOpcode_gen(HSAIL_ASM::Inst inst, Visitor& vis)"
+    //.opcodevis_arg="inst.opcode()" //.opcodevis_default="return RetType()"
+    //.opcodevis_incfile=ItemUtils
+    //
+    //.ftz=$k{ return ($k eq "BASIC_OR_MOD" or $k eq "CMP" or $k eq "CVT") ? "true" : undef }
+    //.ftz_incfile=ItemUtils //.ftz_switch //.ftz_proto="inline bool instSupportsFtz(BrigOpcode16_t arg)" //.ftz_default="return false"
+    //
+    //.vecOpndIndex={undef}
+    //.vecOpndIndex_switch //.vecOpndIndex_proto="int vecOpndIndex(BrigOpcode16_t arg)" //.vecOpndIndex_default="return -1"
+    //.vecOpndIndex_incfile=ParserUtilities
+    //
+    //.numdst={undef}
+    //.numdst_switch //.numdst_proto="int instNumDstOperands(BrigOpcode16_t arg)" //.numdst_default="return 1"
+    //
+    //.print=$mnemo{ $mnemo }
+
+    BRIG_OPCODE_NOP = 0, //.k=NOP //.hasType=false
+    BRIG_OPCODE_ABS = 1, //.k=BASIC_OR_MOD
+    BRIG_OPCODE_ADD = 2, //.k=BASIC_OR_MOD
+    BRIG_OPCODE_BORROW = 3,
+    BRIG_OPCODE_CARRY = 4,
+    BRIG_OPCODE_CEIL = 5, //.k=BASIC_OR_MOD
+    BRIG_OPCODE_COPYSIGN = 6, //.k=BASIC_OR_MOD
+    BRIG_OPCODE_DIV = 7, //.k=BASIC_OR_MOD
+    BRIG_OPCODE_FLOOR = 8, //.k=BASIC_OR_MOD
+    BRIG_OPCODE_FMA = 9, //.k=BASIC_OR_MOD
+    BRIG_OPCODE_FRACT = 10, //.k=BASIC_OR_MOD
+    BRIG_OPCODE_MAD = 11, //.k=BASIC_OR_MOD
+    BRIG_OPCODE_MAX = 12, //.k=BASIC_OR_MOD
+    BRIG_OPCODE_MIN = 13, //.k=BASIC_OR_MOD
+    BRIG_OPCODE_MUL = 14, //.k=BASIC_OR_MOD
+    BRIG_OPCODE_MULHI = 15, //.k=BASIC_OR_MOD
+    BRIG_OPCODE_NEG = 16, //.k=BASIC_OR_MOD
+    BRIG_OPCODE_REM = 17,
+    BRIG_OPCODE_RINT = 18, //.k=BASIC_OR_MOD
+    BRIG_OPCODE_SQRT = 19, //.k=BASIC_OR_MOD
+    BRIG_OPCODE_SUB = 20, //.k=BASIC_OR_MOD
+    BRIG_OPCODE_TRUNC = 21, //.k=BASIC_OR_MOD
+    BRIG_OPCODE_MAD24 = 22,
+    BRIG_OPCODE_MAD24HI = 23,
+    BRIG_OPCODE_MUL24 = 24,
+    BRIG_OPCODE_MUL24HI = 25,
+    BRIG_OPCODE_SHL = 26,
+    BRIG_OPCODE_SHR = 27,
+    BRIG_OPCODE_AND = 28,
+    BRIG_OPCODE_NOT = 29,
+    BRIG_OPCODE_OR = 30,
+    BRIG_OPCODE_POPCOUNT = 31, //.k=SOURCE_TYPE
+    BRIG_OPCODE_XOR = 32,
+    BRIG_OPCODE_BITEXTRACT = 33,
+    BRIG_OPCODE_BITINSERT = 34,
+    BRIG_OPCODE_BITMASK = 35,
+    BRIG_OPCODE_BITREV = 36,
+    BRIG_OPCODE_BITSELECT = 37,
+    BRIG_OPCODE_FIRSTBIT = 38, //.k=SOURCE_TYPE
+    BRIG_OPCODE_LASTBIT = 39, //.k=SOURCE_TYPE
+    BRIG_OPCODE_COMBINE = 40, //.k=SOURCE_TYPE //.vecOpndIndex=1
+    BRIG_OPCODE_EXPAND = 41, //.k=SOURCE_TYPE //.vecOpndIndex=0
+    BRIG_OPCODE_LDA = 42, //.k=ADDR
+    BRIG_OPCODE_MOV = 43,
+    BRIG_OPCODE_SHUFFLE = 44,
+    BRIG_OPCODE_UNPACKHI = 45,
+    BRIG_OPCODE_UNPACKLO = 46,
+    BRIG_OPCODE_PACK = 47, //.k=SOURCE_TYPE
+    BRIG_OPCODE_UNPACK = 48, //.k=SOURCE_TYPE
+    BRIG_OPCODE_CMOV = 49,
+    BRIG_OPCODE_CLASS = 50, //.k=SOURCE_TYPE
+    BRIG_OPCODE_NCOS = 51,
+    BRIG_OPCODE_NEXP2 = 52,
+    BRIG_OPCODE_NFMA = 53,
+    BRIG_OPCODE_NLOG2 = 54,
+    BRIG_OPCODE_NRCP = 55,
+    BRIG_OPCODE_NRSQRT = 56,
+    BRIG_OPCODE_NSIN = 57,
+    BRIG_OPCODE_NSQRT = 58,
+    BRIG_OPCODE_BITALIGN = 59,
+    BRIG_OPCODE_BYTEALIGN = 60,
+    BRIG_OPCODE_PACKCVT = 61, //.k=SOURCE_TYPE
+    BRIG_OPCODE_UNPACKCVT = 62, //.k=SOURCE_TYPE
+    BRIG_OPCODE_LERP = 63,
+    BRIG_OPCODE_SAD = 64, //.k=SOURCE_TYPE
+    BRIG_OPCODE_SADHI = 65, //.k=SOURCE_TYPE
+    BRIG_OPCODE_SEGMENTP = 66, //.k=SEG_CVT
+    BRIG_OPCODE_FTOS = 67, //.k=SEG_CVT
+    BRIG_OPCODE_STOF = 68, //.k=SEG_CVT
+    BRIG_OPCODE_CMP = 69, //.k=CMP
+    BRIG_OPCODE_CVT = 70, //.k=CVT
+    BRIG_OPCODE_LD = 71, //.k=MEM //.has_memory_order //.vecOpndIndex=0
+    BRIG_OPCODE_ST = 72, //.k=MEM //.has_memory_order //.vecOpndIndex=0 //.numdst=0
+    BRIG_OPCODE_ATOMIC = 73, //.k=ATOMIC
+    BRIG_OPCODE_ATOMICNORET = 74, //.k=ATOMIC //.numdst=0
+    BRIG_OPCODE_SIGNAL = 75, //.k=SIGNAL
+    BRIG_OPCODE_SIGNALNORET = 76, //.k=SIGNAL //.numdst=0
+    BRIG_OPCODE_MEMFENCE = 77, //.k=MEM_FENCE //.numdst=0
+    BRIG_OPCODE_RDIMAGE = 78, //.k=IMAGE //.vecOpndIndex=0
+    BRIG_OPCODE_LDIMAGE = 79, //.k=IMAGE //.vecOpndIndex=0
+    BRIG_OPCODE_STIMAGE = 80, //.k=IMAGE //.vecOpndIndex=0 //.numdst=0
+    BRIG_OPCODE_IMAGEFENCE = 81, //.k=BASIC_NO_TYPE
+    BRIG_OPCODE_QUERYIMAGE = 82, //.k=QUERY_IMAGE
+    BRIG_OPCODE_QUERYSAMPLER = 83, //.k=QUERY_SAMPLER
+    BRIG_OPCODE_CBR = 84, //.k=BR //.numdst=0
+    BRIG_OPCODE_BR = 85, //.k=BR //.numdst=0 //.hasType=false
+    BRIG_OPCODE_SBR = 86, //.k=BR //.numdst=0 //.psopnd=SbrOperands
+    BRIG_OPCODE_BARRIER = 87, //.k=BR //.numdst=0 //.hasType=false
+    BRIG_OPCODE_WAVEBARRIER = 88, //.k=BR //.numdst=0 //.hasType=false
+    BRIG_OPCODE_ARRIVEFBAR = 89, //.k=BR //.numdst=0 //.hasType=false
+    BRIG_OPCODE_INITFBAR = 90, //.k=BASIC_NO_TYPE //.numdst=0 //.hasType=false
+    BRIG_OPCODE_JOINFBAR = 91, //.k=BR //.numdst=0 //.hasType=false
+    BRIG_OPCODE_LEAVEFBAR = 92, //.k=BR //.numdst=0 //.hasType=false
+    BRIG_OPCODE_RELEASEFBAR = 93, //.k=BASIC_NO_TYPE //.numdst=0
+    BRIG_OPCODE_WAITFBAR = 94, //.k=BR //.numdst=0 //.hasType=false
+    BRIG_OPCODE_LDF = 95,
+    BRIG_OPCODE_ACTIVELANECOUNT = 96, //.k=LANE
+    BRIG_OPCODE_ACTIVELANEID = 97, //.k=LANE
+    BRIG_OPCODE_ACTIVELANEMASK = 98, //.k=LANE //.vecOpndIndex=0
+    BRIG_OPCODE_ACTIVELANEPERMUTE = 99, //.k=LANE
+    BRIG_OPCODE_CALL = 100, //.k=BR //.psopnd=CallOperands //.numdst=0 //.hasType=false
+    BRIG_OPCODE_SCALL = 101, //.k=BR //.psopnd=CallOperands //.numdst=0
+    BRIG_OPCODE_ICALL = 102, //.k=BR //.psopnd=CallOperands //.numdst=0
+    BRIG_OPCODE_RET = 103, //.k=BASIC_NO_TYPE
+    BRIG_OPCODE_ALLOCA = 104, //.k=MEM
+    BRIG_OPCODE_CURRENTWORKGROUPSIZE = 105,
+    BRIG_OPCODE_CURRENTWORKITEMFLATID = 106,
+    BRIG_OPCODE_DIM = 107,
+    BRIG_OPCODE_GRIDGROUPS = 108,
+    BRIG_OPCODE_GRIDSIZE = 109,
+    BRIG_OPCODE_PACKETCOMPLETIONSIG = 110,
+    BRIG_OPCODE_PACKETID = 111,
+    BRIG_OPCODE_WORKGROUPID = 112,
+    BRIG_OPCODE_WORKGROUPSIZE = 113,
+    BRIG_OPCODE_WORKITEMABSID = 114,
+    BRIG_OPCODE_WORKITEMFLATABSID = 115,
+    BRIG_OPCODE_WORKITEMFLATID = 116,
+    BRIG_OPCODE_WORKITEMID = 117,
+    BRIG_OPCODE_CLEARDETECTEXCEPT = 118, //.numdst=0
+    BRIG_OPCODE_GETDETECTEXCEPT = 119,
+    BRIG_OPCODE_SETDETECTEXCEPT = 120, //.numdst=0
+    BRIG_OPCODE_ADDQUEUEWRITEINDEX = 121, //.k=QUEUE
+    BRIG_OPCODE_CASQUEUEWRITEINDEX = 122, //.k=QUEUE
+    BRIG_OPCODE_LDQUEUEREADINDEX = 123, //.k=QUEUE
+    BRIG_OPCODE_LDQUEUEWRITEINDEX = 124, //.k=QUEUE
+    BRIG_OPCODE_STQUEUEREADINDEX = 125, //.k=QUEUE //.numdst=0
+    BRIG_OPCODE_STQUEUEWRITEINDEX = 126, //.k=QUEUE //.numdst=0
+    BRIG_OPCODE_CLOCK = 127,
+    BRIG_OPCODE_CUID = 128,
+    BRIG_OPCODE_DEBUGTRAP = 129, //.numdst=0
+    BRIG_OPCODE_GROUPBASEPTR = 130,
+    BRIG_OPCODE_KERNARGBASEPTR = 131,
+    BRIG_OPCODE_LANEID = 132,
+    BRIG_OPCODE_MAXCUID = 133,
+    BRIG_OPCODE_MAXWAVEID = 134,
+    BRIG_OPCODE_NULLPTR = 135, //.k=SEG
+    BRIG_OPCODE_WAVEID = 136,
+    BRIG_OPCODE_FIRST_USER_DEFINED = 32768, //.skip
+
+    // Vendor extension range: AMD GCN opcodes, encoded as (1u << 15) | n.
+    BRIG_OPCODE_GCNMADU = (1u << 15) | 0, //.k=BASIC_NO_TYPE
+    BRIG_OPCODE_GCNMADS = (1u << 15) | 1, //.k=BASIC_NO_TYPE
+    BRIG_OPCODE_GCNMAX3 = (1u << 15) | 2,
+    BRIG_OPCODE_GCNMIN3 = (1u << 15) | 3,
+    BRIG_OPCODE_GCNMED3 = (1u << 15) | 4,
+    BRIG_OPCODE_GCNFLDEXP = (1u << 15) | 5, //.k=BASIC_OR_MOD
+    BRIG_OPCODE_GCNFREXP_EXP = (1u << 15) | 6, //.k=BASIC_OR_MOD
+    BRIG_OPCODE_GCNFREXP_MANT = (1u << 15) | 7, //.k=BASIC_OR_MOD
+    BRIG_OPCODE_GCNTRIG_PREOP = (1u << 15) | 8, //.k=BASIC_OR_MOD
+    BRIG_OPCODE_GCNBFM = (1u << 15) | 9,
+    BRIG_OPCODE_GCNLD = (1u << 15) | 10, //.k=MEM //.has_memory_order //.vecOpndIndex=0
+    BRIG_OPCODE_GCNST = (1u << 15) | 11, //.k=MEM //.has_memory_order //.vecOpndIndex=0
+    BRIG_OPCODE_GCNATOMIC = (1u << 15) | 12, //.k=ATOMIC
+    BRIG_OPCODE_GCNATOMICNORET = (1u << 15) | 13, //.k=ATOMIC //.mnemo=gcn_atomicNoRet
+    BRIG_OPCODE_GCNSLEEP = (1u << 15) | 14,
+    BRIG_OPCODE_GCNPRIORITY = (1u << 15) | 15,
+    BRIG_OPCODE_GCNREGIONALLOC = (1u << 15) | 16, //.k=BASIC_NO_TYPE //.mnemo=gcn_region_alloc
+    BRIG_OPCODE_GCNMSAD = (1u << 15) | 17,
+    BRIG_OPCODE_GCNQSAD = (1u << 15) | 18,
+    BRIG_OPCODE_GCNMQSAD = (1u << 15) | 19,
+    BRIG_OPCODE_GCNMQSAD4 = (1u << 15) | 20, //.k=BASIC_NO_TYPE
+    BRIG_OPCODE_GCNSADW = (1u << 15) | 21,
+    BRIG_OPCODE_GCNSADD = (1u << 15) | 22,
+    BRIG_OPCODE_GCNCONSUME = (1u << 15) | 23, //.k=ADDR //.mnemo=gcn_atomic_consume
+    BRIG_OPCODE_GCNAPPEND = (1u << 15) | 24, //.k=ADDR //.mnemo=gcn_atomic_append
+    BRIG_OPCODE_GCNB4XCHG = (1u << 15) | 25, //.mnemo=gcn_b4xchg
+    BRIG_OPCODE_GCNB32XCHG = (1u << 15) | 26, //.mnemo=gcn_b32xchg
+    BRIG_OPCODE_GCNMAX = (1u << 15) | 27,
+    BRIG_OPCODE_GCNMIN = (1u << 15) | 28,
+    BRIG_OPCODE_GCNDIVRELAXED = (1u << 15) | 29, //.k=BASIC_OR_MOD
+    BRIG_OPCODE_GCNDIVRELAXEDNARROW = (1u << 15) | 30,
+
+    BRIG_OPCODE_AMDRDIMAGELOD = (1u << 15) | 31, //.k=IMAGE //.mnemo=amd_rdimagelod //.vecOpndIndex=0
+    BRIG_OPCODE_AMDRDIMAGEGRAD = (1u << 15) | 32, //.k=IMAGE //.mnemo=amd_rdimagegrad //.vecOpndIndex=0
+    BRIG_OPCODE_AMDLDIMAGEMIP = (1u << 15) | 33, //.k=IMAGE //.mnemo=amd_ldimagemip //.vecOpndIndex=0
+    BRIG_OPCODE_AMDSTIMAGEMIP = (1u << 15) | 34, //.k=IMAGE //.mnemo=amd_stimagemip //.vecOpndIndex=0 //.numdst=0
+    BRIG_OPCODE_AMDQUERYIMAGE = (1u << 15) | 35 //.k=QUERY_IMAGE //.mnemo=amd_queryimage
+};
+
+// BrigPack: packed-operand control (P = per-element "packed", S = scalar
+// lowest element; optional saturation). Values are fixed by the BRIG
+// binary format; "//." lines are generator directives.
+enum BrigPack {
+
+    //.tdcaption="Packing"
+    //
+    //.mnemo={ s/^BRIG_PACK_//;s/SAT$/_sat/;lc }
+    //.mnemo_token=_EMPacking
+    //
+    //.print=$mnemo{ "_$mnemo" }
+
+    BRIG_PACK_NONE = 0, //.mnemo=""
+    BRIG_PACK_PP = 1,
+    BRIG_PACK_PS = 2,
+    BRIG_PACK_SP = 3,
+    BRIG_PACK_SS = 4,
+    BRIG_PACK_S = 5,
+    BRIG_PACK_P = 6,
+    BRIG_PACK_PPSAT = 7,
+    BRIG_PACK_PSSAT = 8,
+    BRIG_PACK_SPSAT = 9,
+    BRIG_PACK_SSSAT = 10,
+    BRIG_PACK_SSAT = 11,
+    BRIG_PACK_PSAT = 12
+};
+
+// BrigProfile: HSAIL profile of the module ($base / $full). Values are
+// fixed by the BRIG binary format; "//." lines are generator directives.
+enum BrigProfile {
+
+    //.mnemo={ s/^BRIG_PROFILE_//;'$'.lc }
+    //.mnemo_token=ETargetProfile
+    //
+    //.print=$mnemo{ $mnemo }
+
+    BRIG_PROFILE_BASE = 0,
+    BRIG_PROFILE_FULL = 1,
+
+    BRIG_PROFILE_UNDEF = 2 //.skip
+};
+
+// BrigRegisterKind: register classes ($c control = 1 bit, $s = 32,
+// $d = 64, $q = 128 — widths given by the ".bits" directives). Values
+// are fixed by the BRIG binary format.
+enum BrigRegisterKind {
+
+    //.mnemo={ s/^BRIG_REGISTER_KIND_//;'$'.lc(substr($_,0,1)) }
+    //
+    //.bits={ }
+    //.bits_switch //.bits_proto="unsigned getRegBits(BrigRegisterKind16_t arg)" //.bits_default="return (unsigned)-1"
+    //
+    //.nollvm
+
+    BRIG_REGISTER_KIND_CONTROL = 0, //.bits=1
+    BRIG_REGISTER_KIND_SINGLE = 1, //.bits=32
+    BRIG_REGISTER_KIND_DOUBLE = 2, //.bits=64
+    BRIG_REGISTER_KIND_QUAD = 3 //.bits=128
+};
+
+// BrigRound: rounding modes for floating-point and float-to-integer
+// conversions; integer variants come in plain, saturating (_SAT) and
+// signaling (_SIGNALING_) flavors. Values are fixed by the BRIG binary
+// format; "//." lines are generator directives.
+enum BrigRound {
+
+    //.mnemo={}
+    //.mnemo_fn=round2str //.mnemo_token=_EMRound
+    //
+    //.sat={/_SAT$/? "true" : "false"}
+    //.sat_switch //.sat_proto="bool isSatRounding(unsigned rounding)" //.sat_arg="rounding"
+    //.sat_default="return false"
+    //
+    //.sig={/_SIGNALING_/? "true" : "false"}
+    //.sig_switch //.sig_proto="bool isSignalingRounding(unsigned rounding)" //.sig_arg="rounding"
+    //.sig_default="return false"
+    //
+    //.int={/_INTEGER_/? "true" : "false"}
+    //.int_switch //.int_proto="bool isIntRounding(unsigned rounding)" //.int_arg="rounding"
+    //.int_default="return false"
+    //
+    //.flt={/_FLOAT_/? "true" : "false"}
+    //.flt_switch //.flt_proto="bool isFloatRounding(unsigned rounding)" //.flt_arg="rounding"
+    //.flt_default="return false"
+    //
+    //.print=$mnemo{ "_$mnemo" }
+
+    BRIG_ROUND_NONE = 0, //.no_mnemo
+    BRIG_ROUND_FLOAT_DEFAULT = 1, //.no_mnemo
+    BRIG_ROUND_FLOAT_NEAR_EVEN = 2, //.mnemo=near
+    BRIG_ROUND_FLOAT_ZERO = 3, //.mnemo=zero
+    BRIG_ROUND_FLOAT_PLUS_INFINITY = 4, //.mnemo=up
+    BRIG_ROUND_FLOAT_MINUS_INFINITY = 5, //.mnemo=down
+    BRIG_ROUND_INTEGER_NEAR_EVEN = 6, //.mnemo=neari
+    BRIG_ROUND_INTEGER_ZERO = 7, //.mnemo=zeroi
+    BRIG_ROUND_INTEGER_PLUS_INFINITY = 8, //.mnemo=upi
+    BRIG_ROUND_INTEGER_MINUS_INFINITY = 9, //.mnemo=downi
+    BRIG_ROUND_INTEGER_NEAR_EVEN_SAT = 10, //.mnemo=neari_sat
+    BRIG_ROUND_INTEGER_ZERO_SAT = 11, //.mnemo=zeroi_sat
+    BRIG_ROUND_INTEGER_PLUS_INFINITY_SAT = 12, //.mnemo=upi_sat
+    BRIG_ROUND_INTEGER_MINUS_INFINITY_SAT = 13, //.mnemo=downi_sat
+    BRIG_ROUND_INTEGER_SIGNALING_NEAR_EVEN = 14, //.mnemo=sneari
+    BRIG_ROUND_INTEGER_SIGNALING_ZERO = 15, //.mnemo=szeroi
+    BRIG_ROUND_INTEGER_SIGNALING_PLUS_INFINITY = 16, //.mnemo=supi
+    BRIG_ROUND_INTEGER_SIGNALING_MINUS_INFINITY = 17, //.mnemo=sdowni
+    BRIG_ROUND_INTEGER_SIGNALING_NEAR_EVEN_SAT = 18, //.mnemo=sneari_sat
+    BRIG_ROUND_INTEGER_SIGNALING_ZERO_SAT = 19, //.mnemo=szeroi_sat
+    BRIG_ROUND_INTEGER_SIGNALING_PLUS_INFINITY_SAT = 20, //.mnemo=supi_sat
+    BRIG_ROUND_INTEGER_SIGNALING_MINUS_INFINITY_SAT = 21 //.mnemo=sdowni_sat
+};
+
+// BrigSamplerAddressing: out-of-range coordinate handling for samplers.
+// Values are fixed by the BRIG binary format; "//." lines are generator
+// directives.
+enum BrigSamplerAddressing {
+
+    //.mnemo={ s/^BRIG_ADDRESSING_//;lc }
+    //.mnemo_token=ESamplerAddressingMode
+
+    BRIG_ADDRESSING_UNDEFINED = 0,
+    BRIG_ADDRESSING_CLAMP_TO_EDGE = 1,
+    BRIG_ADDRESSING_CLAMP_TO_BORDER = 2,
+    BRIG_ADDRESSING_REPEAT = 3,
+    BRIG_ADDRESSING_MIRRORED_REPEAT = 4,
+
+    BRIG_ADDRESSING_FIRST_USER_DEFINED = 128 //.skip
+};
+
+// BrigSamplerCoordNormalization: whether sampler coordinates are
+// normalized to [0,1) or given in texels. Values are fixed by the BRIG
+// binary format.
+enum BrigSamplerCoordNormalization {
+
+    //.mnemo={ s/^BRIG_COORD_//;lc }
+    //.mnemo_token=ESamplerCoord
+    //
+    //.print=$mnemo{ $mnemo }
+
+    BRIG_COORD_UNNORMALIZED = 0,
+    BRIG_COORD_NORMALIZED = 1
+};
+
+// BrigSamplerFilter: sampler filter mode (nearest / linear). Values are
+// fixed by the BRIG binary format.
+enum BrigSamplerFilter {
+
+    //.mnemo={ s/^BRIG_FILTER_//;lc }
+    //
+    //.print=$mnemo{ $mnemo }
+
+    BRIG_FILTER_NEAREST = 0,
+    BRIG_FILTER_LINEAR = 1,
+
+    BRIG_FILTER_FIRST_USER_DEFINED = 128 //.skip
+};
+
+// BrigSamplerQuery: property selector for the querysampler instruction.
+// Values are fixed by the BRIG binary format.
+enum BrigSamplerQuery {
+
+    //.mnemo={ s/^BRIG_SAMPLER_QUERY_//;lc }
+    //.mnemo_token=_EMSamplerQuery
+    //
+    //.print=$mnemo{ $mnemo }
+
+    BRIG_SAMPLER_QUERY_ADDRESSING = 0,
+    BRIG_SAMPLER_QUERY_COORD = 1,
+    BRIG_SAMPLER_QUERY_FILTER = 2
+};
+
+// BrigSectionIndex: indices of the standard sections in a BRIG module
+// (data, code, operand); 3+ is reserved for implementation-defined
+// sections. Values are fixed by the BRIG binary format.
+enum BrigSectionIndex {
+
+    //.nollvm
+    //
+    //.mnemo={ s/^BRIG_SECTION_INDEX_/HSA_/;lc }
+
+    BRIG_SECTION_INDEX_DATA = 0,
+    BRIG_SECTION_INDEX_CODE = 1,
+    BRIG_SECTION_INDEX_OPERAND = 2,
+    BRIG_SECTION_INDEX_BEGIN_IMPLEMENTATION_DEFINED = 3,
+
+    // used internally
+    BRIG_SECTION_INDEX_IMPLEMENTATION_DEFINED = BRIG_SECTION_INDEX_BEGIN_IMPLEMENTATION_DEFINED //.skip
+};
+
+// BrigSegCvtModifierMask: bit masks for BrigSegCvtModifier::allBits.
+enum BrigSegCvtModifierMask {
+    BRIG_SEG_CVT_NONULL = 1 //.mnemo="nonull" //.print="_nonull"
+};
+
+// BrigSegment: HSAIL memory segments (flat, global, readonly, kernarg,
+// group, private, spill, arg). 9 is the AMD GCN "region" vendor
+// extension, declared after the FIRST_USER_DEFINED marker. Values are
+// fixed by the BRIG binary format; "//." lines are generator directives.
+enum BrigSegment {
+
+    //.mnemo={ s/^BRIG_SEGMENT_//;lc}
+    //.mnemo_token=_EMSegment
+    //.mnemo_context=EInstModifierContext
+    //
+    //.print=$mnemo{ $mnemo ? "_$mnemo" : "" }
+
+    BRIG_SEGMENT_NONE = 0, //.mnemo=""
+    BRIG_SEGMENT_FLAT = 1, //.mnemo=""
+    BRIG_SEGMENT_GLOBAL = 2,
+    BRIG_SEGMENT_READONLY = 3,
+    BRIG_SEGMENT_KERNARG = 4,
+    BRIG_SEGMENT_GROUP = 5,
+    BRIG_SEGMENT_PRIVATE = 6,
+    BRIG_SEGMENT_SPILL = 7,
+    BRIG_SEGMENT_ARG = 8,
+
+    BRIG_SEGMENT_FIRST_USER_DEFINED = 128, //.skip
+
+    BRIG_SEGMENT_AMD_GCN = 9, //.mnemo="region"
+};
+
+// BrigPackedTypeBits: bit-field layout of BrigType values — a 5-bit base
+// type, a 2-bit pack width (none/32/64/128) and a 1-bit array flag.
+// BrigType enumerators below are composed from these masks/shifts.
+enum BrigPackedTypeBits {
+
+    //.nodump
+    //
+    //.nollvm
+
+    BRIG_TYPE_BASE_SIZE = 5,
+    BRIG_TYPE_PACK_SIZE = 2,
+    BRIG_TYPE_ARRAY_SIZE = 1,
+
+    BRIG_TYPE_BASE_SHIFT = 0,
+    BRIG_TYPE_PACK_SHIFT = BRIG_TYPE_BASE_SHIFT + BRIG_TYPE_BASE_SIZE,
+    BRIG_TYPE_ARRAY_SHIFT = BRIG_TYPE_PACK_SHIFT + BRIG_TYPE_PACK_SIZE,
+
+    BRIG_TYPE_BASE_MASK = ((1 << BRIG_TYPE_BASE_SIZE) - 1) << BRIG_TYPE_BASE_SHIFT,
+    BRIG_TYPE_PACK_MASK = ((1 << BRIG_TYPE_PACK_SIZE) - 1) << BRIG_TYPE_PACK_SHIFT,
+    BRIG_TYPE_ARRAY_MASK = ((1 << BRIG_TYPE_ARRAY_SIZE) - 1) << BRIG_TYPE_ARRAY_SHIFT,
+
+    BRIG_TYPE_PACK_NONE = 0 << BRIG_TYPE_PACK_SHIFT,
+    BRIG_TYPE_PACK_32 = 1 << BRIG_TYPE_PACK_SHIFT,
+    BRIG_TYPE_PACK_64 = 2 << BRIG_TYPE_PACK_SHIFT,
+    BRIG_TYPE_PACK_128 = 3 << BRIG_TYPE_PACK_SHIFT,
+
+    BRIG_TYPE_ARRAY = 1 << BRIG_TYPE_ARRAY_SHIFT
+};
+
+// BrigType: HSAIL data types. Scalar base types occupy the low 5 bits;
+// packed-vector variants OR in a BRIG_TYPE_PACK_* width and array
+// variants OR in BRIG_TYPE_ARRAY (see BrigPackedTypeBits). Values are
+// fixed by the BRIG binary format — never renumber. "//." lines are
+// directives for the HSAIL code generator; keep their exact format.
+enum BrigType {
+
+    //.numBits={ /ARRAY$/ ? undef : /([0-9]+)X([0-9]+)/ ? $1*$2 : /([0-9]+)/ ? $1 : undef }
+    //.numBits_switch //.numBits_proto="unsigned getBrigTypeNumBits(unsigned arg)" //.numBits_default="assert(0); return 0"
+    //.numBytes=$numBits{ $numBits > 1 ? $numBits/8 : undef }
+    //.numBytes_switch //.numBytes_proto="unsigned getBrigTypeNumBytes(unsigned arg)" //.numBytes_default="assert(0); return 0"
+    //
+    //.mnemo={ s/^BRIG_TYPE_//;lc }
+    //.mnemo_token=_EMType
+    //
+    //.array={/ARRAY$/?"true":"false"}
+    //.array_switch //.array_proto="bool isArrayType(unsigned type)" //.array_arg="type"
+    //.array_default="return false"
+    //
+    //.a2e={/(.*)_ARRAY$/? $1 : "BRIG_TYPE_NONE"}
+    //.a2e_switch //.a2e_proto="unsigned arrayType2elementType(unsigned type)" //.a2e_arg="type"
+    //.a2e_default="return BRIG_TYPE_NONE"
+    //
+    //.e2a={/_ARRAY$/? "BRIG_TYPE_NONE" : /_NONE$/ ? "BRIG_TYPE_NONE" : /_B1$/ ? "BRIG_TYPE_NONE" : $_ . "_ARRAY"}
+    //.e2a_switch //.e2a_proto="unsigned elementType2arrayType(unsigned type)" //.e2a_arg="type"
+    //.e2a_default="return BRIG_TYPE_NONE"
+    //
+    //.t2s={s/^BRIG_TYPE_//;lc s/_ARRAY$/[]/;lc}
+    //.t2s_switch //.t2s_proto="const char* type2name(unsigned type)" //.t2s_arg="type"
+    //.t2s_default="return NULL"
+    //
+    //.dispatch_switch //.dispatch_incfile=TemplateUtilities
+    //.dispatch_proto="template<typename RetType, typename Visitor>\nRetType dispatchByType_gen(unsigned type, Visitor& v)"
+    //.dispatch={ /ARRAY$/ ? "v.visitNone(type)" : /^BRIG_TYPE_([BUSF]|SIG)[0-9]+/ ? "v.template visit< BrigTypeTraits<$_> >()" : "v.visitNone(type)" }
+    //.dispatch_arg="type" //.dispatch_default="return v.visitNone(type)"
+    //
+    //- .tdname=BrigType
+    //
+    //.print=$mnemo{ "_$mnemo" }
+
+    BRIG_TYPE_NONE = 0, //.mnemo="" //.print=""
+    BRIG_TYPE_U8 = 1, //.ctype=uint8_t
+    BRIG_TYPE_U16 = 2, //.ctype=uint16_t
+    BRIG_TYPE_U32 = 3, //.ctype=uint32_t
+    BRIG_TYPE_U64 = 4, //.ctype=uint64_t
+    BRIG_TYPE_S8 = 5, //.ctype=int8_t
+    BRIG_TYPE_S16 = 6, //.ctype=int16_t
+    BRIG_TYPE_S32 = 7, //.ctype=int32_t
+    BRIG_TYPE_S64 = 8, //.ctype=int64_t
+    BRIG_TYPE_F16 = 9, //.ctype=f16_t
+    BRIG_TYPE_F32 = 10, //.ctype=float
+    BRIG_TYPE_F64 = 11, //.ctype=double
+    BRIG_TYPE_B1 = 12, //.ctype=bool //.numBytes=1
+    BRIG_TYPE_B8 = 13, //.ctype=uint8_t
+    BRIG_TYPE_B16 = 14, //.ctype=uint16_t
+    BRIG_TYPE_B32 = 15, //.ctype=uint32_t
+    BRIG_TYPE_B64 = 16, //.ctype=uint64_t
+    BRIG_TYPE_B128 = 17, //.ctype=b128_t
+    BRIG_TYPE_SAMP = 18, //.mnemo=samp //.numBits=64
+    BRIG_TYPE_ROIMG = 19, //.mnemo=roimg //.numBits=64
+    BRIG_TYPE_WOIMG = 20, //.mnemo=woimg //.numBits=64
+    BRIG_TYPE_RWIMG = 21, //.mnemo=rwimg //.numBits=64
+    BRIG_TYPE_SIG32 = 22, //.mnemo=sig32 //.numBits=64
+    BRIG_TYPE_SIG64 = 23, //.mnemo=sig64 //.numBits=64
+
+    // Packed-vector types: base type | pack width.
+    BRIG_TYPE_U8X4 = BRIG_TYPE_U8 | BRIG_TYPE_PACK_32, //.ctype=uint8_t
+    BRIG_TYPE_U8X8 = BRIG_TYPE_U8 | BRIG_TYPE_PACK_64, //.ctype=uint8_t
+    BRIG_TYPE_U8X16 = BRIG_TYPE_U8 | BRIG_TYPE_PACK_128, //.ctype=uint8_t
+    BRIG_TYPE_U16X2 = BRIG_TYPE_U16 | BRIG_TYPE_PACK_32, //.ctype=uint16_t
+    BRIG_TYPE_U16X4 = BRIG_TYPE_U16 | BRIG_TYPE_PACK_64, //.ctype=uint16_t
+    BRIG_TYPE_U16X8 = BRIG_TYPE_U16 | BRIG_TYPE_PACK_128, //.ctype=uint16_t
+    BRIG_TYPE_U32X2 = BRIG_TYPE_U32 | BRIG_TYPE_PACK_64, //.ctype=uint32_t
+    BRIG_TYPE_U32X4 = BRIG_TYPE_U32 | BRIG_TYPE_PACK_128, //.ctype=uint32_t
+    BRIG_TYPE_U64X2 = BRIG_TYPE_U64 | BRIG_TYPE_PACK_128, //.ctype=uint64_t
+    BRIG_TYPE_S8X4 = BRIG_TYPE_S8 | BRIG_TYPE_PACK_32, //.ctype=int8_t
+    BRIG_TYPE_S8X8 = BRIG_TYPE_S8 | BRIG_TYPE_PACK_64, //.ctype=int8_t
+    BRIG_TYPE_S8X16 = BRIG_TYPE_S8 | BRIG_TYPE_PACK_128, //.ctype=int8_t
+    BRIG_TYPE_S16X2 = BRIG_TYPE_S16 | BRIG_TYPE_PACK_32, //.ctype=int16_t
+    BRIG_TYPE_S16X4 = BRIG_TYPE_S16 | BRIG_TYPE_PACK_64, //.ctype=int16_t
+    BRIG_TYPE_S16X8 = BRIG_TYPE_S16 | BRIG_TYPE_PACK_128, //.ctype=int16_t
+    BRIG_TYPE_S32X2 = BRIG_TYPE_S32 | BRIG_TYPE_PACK_64, //.ctype=int32_t
+    BRIG_TYPE_S32X4 = BRIG_TYPE_S32 | BRIG_TYPE_PACK_128, //.ctype=int32_t
+    BRIG_TYPE_S64X2 = BRIG_TYPE_S64 | BRIG_TYPE_PACK_128, //.ctype=int64_t
+    BRIG_TYPE_F16X2 = BRIG_TYPE_F16 | BRIG_TYPE_PACK_32, //.ctype=f16_t
+    BRIG_TYPE_F16X4 = BRIG_TYPE_F16 | BRIG_TYPE_PACK_64, //.ctype=f16_t
+    BRIG_TYPE_F16X8 = BRIG_TYPE_F16 | BRIG_TYPE_PACK_128, //.ctype=f16_t
+    BRIG_TYPE_F32X2 = BRIG_TYPE_F32 | BRIG_TYPE_PACK_64, //.ctype=float
+    BRIG_TYPE_F32X4 = BRIG_TYPE_F32 | BRIG_TYPE_PACK_128, //.ctype=float
+    BRIG_TYPE_F64X2 = BRIG_TYPE_F64 | BRIG_TYPE_PACK_128, //.ctype=double
+
+    // Array types: element type | array flag.
+    BRIG_TYPE_U8_ARRAY = BRIG_TYPE_U8 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+    BRIG_TYPE_U16_ARRAY = BRIG_TYPE_U16 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+    BRIG_TYPE_U32_ARRAY = BRIG_TYPE_U32 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+    BRIG_TYPE_U64_ARRAY = BRIG_TYPE_U64 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+    BRIG_TYPE_S8_ARRAY = BRIG_TYPE_S8 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+    BRIG_TYPE_S16_ARRAY = BRIG_TYPE_S16 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+    BRIG_TYPE_S32_ARRAY = BRIG_TYPE_S32 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+    BRIG_TYPE_S64_ARRAY = BRIG_TYPE_S64 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+    BRIG_TYPE_F16_ARRAY = BRIG_TYPE_F16 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+    BRIG_TYPE_F32_ARRAY = BRIG_TYPE_F32 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+    BRIG_TYPE_F64_ARRAY = BRIG_TYPE_F64 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+    BRIG_TYPE_B8_ARRAY = BRIG_TYPE_B8 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+    BRIG_TYPE_B16_ARRAY = BRIG_TYPE_B16 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+    BRIG_TYPE_B32_ARRAY = BRIG_TYPE_B32 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+    BRIG_TYPE_B64_ARRAY = BRIG_TYPE_B64 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+    BRIG_TYPE_B128_ARRAY = BRIG_TYPE_B128 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+    BRIG_TYPE_SAMP_ARRAY = BRIG_TYPE_SAMP | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+    BRIG_TYPE_ROIMG_ARRAY = BRIG_TYPE_ROIMG | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+    BRIG_TYPE_WOIMG_ARRAY = BRIG_TYPE_WOIMG | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+    BRIG_TYPE_RWIMG_ARRAY = BRIG_TYPE_RWIMG | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+    BRIG_TYPE_SIG32_ARRAY = BRIG_TYPE_SIG32 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+    BRIG_TYPE_SIG64_ARRAY = BRIG_TYPE_SIG64 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+    BRIG_TYPE_U8X4_ARRAY = BRIG_TYPE_U8X4 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+    BRIG_TYPE_U8X8_ARRAY = BRIG_TYPE_U8X8 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+    BRIG_TYPE_U8X16_ARRAY = BRIG_TYPE_U8X16 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+    BRIG_TYPE_U16X2_ARRAY = BRIG_TYPE_U16X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+    BRIG_TYPE_U16X4_ARRAY = BRIG_TYPE_U16X4 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+    BRIG_TYPE_U16X8_ARRAY = BRIG_TYPE_U16X8 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+    BRIG_TYPE_U32X2_ARRAY = BRIG_TYPE_U32X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+    BRIG_TYPE_U32X4_ARRAY = BRIG_TYPE_U32X4 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+    BRIG_TYPE_U64X2_ARRAY = BRIG_TYPE_U64X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+    BRIG_TYPE_S8X4_ARRAY = BRIG_TYPE_S8X4 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+    BRIG_TYPE_S8X8_ARRAY = BRIG_TYPE_S8X8 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+    BRIG_TYPE_S8X16_ARRAY = BRIG_TYPE_S8X16 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+    BRIG_TYPE_S16X2_ARRAY = BRIG_TYPE_S16X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+    BRIG_TYPE_S16X4_ARRAY = BRIG_TYPE_S16X4 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+    BRIG_TYPE_S16X8_ARRAY = BRIG_TYPE_S16X8 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+    BRIG_TYPE_S32X2_ARRAY = BRIG_TYPE_S32X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+    BRIG_TYPE_S32X4_ARRAY = BRIG_TYPE_S32X4 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+    BRIG_TYPE_S64X2_ARRAY = BRIG_TYPE_S64X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+    BRIG_TYPE_F16X2_ARRAY = BRIG_TYPE_F16X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+    BRIG_TYPE_F16X4_ARRAY = BRIG_TYPE_F16X4 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+    BRIG_TYPE_F16X8_ARRAY = BRIG_TYPE_F16X8 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+    BRIG_TYPE_F32X2_ARRAY = BRIG_TYPE_F32X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+    BRIG_TYPE_F32X4_ARRAY = BRIG_TYPE_F32X4 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+    BRIG_TYPE_F64X2_ARRAY = BRIG_TYPE_F64X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+
+    // Used internally
+    BRIG_TYPE_INVALID = (unsigned) -1 //.skip
+};
+
+// BrigVariableModifierMask: bit masks for BrigVariableModifier::allBits.
+enum BrigVariableModifierMask {
+
+    //.nodump
+
+    BRIG_VARIABLE_DEFINITION = 1,
+    BRIG_VARIABLE_CONST = 2
+};
+
+// BrigWidth: the "width" instruction modifier. Enumerator n encodes
+// width 2^(n-1) (1, 2, 4, ... 2147483648), plus WAVESIZE and ALL.
+// Values are fixed by the BRIG binary format.
+enum BrigWidth {
+
+    //.tddef=1
+    //
+    //.print={ s/^BRIG_WIDTH_//; "_width($_)" }
+
+    BRIG_WIDTH_NONE = 0,
+    BRIG_WIDTH_1 = 1,
+    BRIG_WIDTH_2 = 2,
+    BRIG_WIDTH_4 = 3,
+    BRIG_WIDTH_8 = 4,
+    BRIG_WIDTH_16 = 5,
+    BRIG_WIDTH_32 = 6,
+    BRIG_WIDTH_64 = 7,
+    BRIG_WIDTH_128 = 8,
+    BRIG_WIDTH_256 = 9,
+    BRIG_WIDTH_512 = 10,
+    BRIG_WIDTH_1024 = 11,
+    BRIG_WIDTH_2048 = 12,
+    BRIG_WIDTH_4096 = 13,
+    BRIG_WIDTH_8192 = 14,
+    BRIG_WIDTH_16384 = 15,
+    BRIG_WIDTH_32768 = 16,
+    BRIG_WIDTH_65536 = 17,
+    BRIG_WIDTH_131072 = 18,
+    BRIG_WIDTH_262144 = 19,
+    BRIG_WIDTH_524288 = 20,
+    BRIG_WIDTH_1048576 = 21,
+    BRIG_WIDTH_2097152 = 22,
+    BRIG_WIDTH_4194304 = 23,
+    BRIG_WIDTH_8388608 = 24,
+    BRIG_WIDTH_16777216 = 25,
+    BRIG_WIDTH_33554432 = 26,
+    BRIG_WIDTH_67108864 = 27,
+    BRIG_WIDTH_134217728 = 28,
+    BRIG_WIDTH_268435456 = 29,
+    BRIG_WIDTH_536870912 = 30,
+    BRIG_WIDTH_1073741824 = 31,
+    BRIG_WIDTH_2147483648 = 32,
+    BRIG_WIDTH_WAVESIZE = 33,
+    BRIG_WIDTH_ALL = 34,
+
+    BRIG_WIDTH_LAST //.skip
+};
+
+// BrigUInt64: a 64-bit value stored as two 32-bit halves (lo, hi) so the
+// struct has 4-byte alignment in the BRIG file. The "//+hcode/+implcode"
+// directives inject uint64_t conversion operators into the generated
+// wrapper class.
+struct BrigUInt64 { //.isroot //.standalone
+    uint32_t lo; //.defValue=0
+    uint32_t hi; //.defValue=0
+
+    //+hcode KLASS& operator=(uint64_t rhs);
+    //+hcode operator uint64_t();
+    //+implcode inline KLASS& KLASS::operator=(uint64_t rhs) { lo() = (uint32_t)rhs; hi() = (uint32_t)(rhs >> 32); return *this; }
+    //+implcode inline KLASS::operator uint64_t() { return ((uint64_t)hi()) << 32 | lo(); }
+};
+
+// BrigAluModifier: ALU instruction modifier byte; bit 0 is the ftz
+// (flush-to-zero) flag per the "//^^" wrapper directive.
+struct BrigAluModifier { //.isroot //.standalone
+    BrigAluModifier8_t allBits; //.defValue=0
+    //^^ bool ftz; //.wtype=BitValRef<0>
+};
+
+// BrigBase: common header of every BRIG item — total byte size of the
+// item followed by its kind tag.
+struct BrigBase { //.nowrap
+    uint16_t byteCount;
+    BrigKind16_t kind;
+};
+
+//.alias Code:Base { //.generic //.isroot //.section=BRIG_SECTION_INDEX_CODE };
+//.alias Directive:Code { //.generic };
+//.alias Operand:Base { //.generic //.isroot //.section=BRIG_SECTION_INDEX_OPERAND };
+
+// BrigData: variable-length blob in the data section; byteCount bytes
+// follow (bytes[1] is the pre-C99 trailing-array idiom for the
+// variable-length payload).
+struct BrigData {
+    //.nowrap
+    uint32_t byteCount;
+    uint8_t bytes[1];
+};
+
+// BrigExecutableModifier: modifier byte for executables; bit 0 is the
+// isDefinition flag per the "//^^" wrapper directive.
+struct BrigExecutableModifier { //.isroot //.standalone
+    BrigExecutableModifier8_t allBits; //.defValue=0
+    //^^ bool isDefinition; //.wtype=BitValRef<0>
+};
+
+// BrigMemoryModifier: modifier byte for memory instructions; bit 0 is
+// the isConst flag (see BrigMemoryModifierMask).
+struct BrigMemoryModifier { //.isroot //.standalone
+    BrigMemoryModifier8_t allBits; //.defValue=0
+    //^^ bool isConst; //.wtype=BitValRef<0>
+};
+
+// BrigSegCvtModifier: modifier byte for segment-conversion instructions;
+// bit 0 is the isNoNull flag (see BrigSegCvtModifierMask).
+struct BrigSegCvtModifier { //.isroot //.standalone
+    BrigSegCvtModifier8_t allBits; //.defValue=0
+    //^^ bool isNoNull; //.wtype=BitValRef<0>
+};
+
+// BrigVariableModifier: modifier byte for variables; bit 0 isDefinition,
+// bit 1 isConst (see BrigVariableModifierMask).
+struct BrigVariableModifier { //.isroot //.standalone
+    BrigVariableModifier8_t allBits; //.defValue=0
+
+    //^^ bool isDefinition; //.wtype=BitValRef<0>
+    //^^ bool isConst; //.wtype=BitValRef<1>
+};
+
+// Directive closing an argument block; no payload beyond the header.
+struct BrigDirectiveArgBlockEnd {
+    BrigBase base;
+};
+
+// Directive opening an argument block; no payload beyond the header.
+struct BrigDirectiveArgBlockStart {
+    BrigBase base;
+};
+
+// Comment directive: carries an offset into the data section holding the
+// comment text.
+struct BrigDirectiveComment {
+    BrigBase base;
+    BrigDataOffsetString32_t name;
+};
+
+// Control directive: a control kind plus a list of operand offsets.
+struct BrigDirectiveControl {
+    BrigBase base;
+    BrigControlDirective16_t control;
+    uint16_t reserved; //.defValue=0
+    BrigDataOffsetOperandList32_t operands;
+};
+
+// Executable directive: shared layout for kernels, functions, signatures
+// and indirect functions (see the DirectiveKernel/Function/... aliases).
+// Holds the name, argument counts, code-section offsets delimiting the
+// body, and linkage/definition modifiers.
+struct BrigDirectiveExecutable { //.generic
+    BrigBase base;
+    BrigDataOffsetString32_t name;
+    uint16_t outArgCount; //.defValue=0
+    uint16_t inArgCount; //.defValue=0
+    BrigCodeOffset32_t firstInArg;
+    BrigCodeOffset32_t firstCodeBlockEntry;
+    BrigCodeOffset32_t nextModuleEntry;
+    BrigExecutableModifier modifier; //.acc=subItem<ExecutableModifier> //.wtype=ExecutableModifier
+    BrigLinkage8_t linkage;
+    uint16_t reserved; //.defValue=0
+};
+
+//.alias DirectiveKernel:DirectiveExecutable { };
+//.alias DirectiveFunction:DirectiveExecutable { };
+//.alias DirectiveSignature:DirectiveExecutable { };
+//.alias DirectiveIndirectFunction:DirectiveExecutable { };
+
+// Extension directive: names an extension enabled by the module.
+struct BrigDirectiveExtension {
+    BrigBase base;
+    BrigDataOffsetString32_t name;
+};
+
+// Fbarrier directive: declares a fine-grain barrier with name, variable
+// modifiers and linkage.
+struct BrigDirectiveFbarrier {
+    BrigBase base;
+    BrigDataOffsetString32_t name;
+    BrigVariableModifier modifier; //.acc=subItem<VariableModifier> //.wtype=VariableModifier
+    BrigLinkage8_t linkage;
+    uint16_t reserved; //.defValue=0
+};
+
+// Label directive: a named position in the code section.
+struct BrigDirectiveLabel {
+    BrigBase base;
+    BrigDataOffsetString32_t name;
+};
+
+// Loc directive: source-location debug info (file name, line, column).
+struct BrigDirectiveLoc {
+    BrigBase base;
+    BrigDataOffsetString32_t filename;
+    uint32_t line;
+    uint32_t column; //.defValue=1
+};
+
+// Placeholder directive for kind BRIG_KIND_NONE; header only.
+struct BrigDirectiveNone { //.enum=BRIG_KIND_NONE
+    BrigBase base;
+};
+
+// Pragma directive: carries a list of operand offsets.
+struct BrigDirectivePragma {
+    BrigBase base;
+    BrigDataOffsetOperandList32_t operands;
+};
+
+// Variable directive: declares a variable with name, optional
+// initializer operand, type, segment, alignment, dimension (for arrays),
+// modifiers, linkage and allocation kind. The "//+hcode/+implcode"
+// directives add isArray()/elementType() helpers to the generated
+// wrapper, built on isArrayType()/arrayType2elementType() from BrigType.
+struct BrigDirectiveVariable {
+    BrigBase base;
+    BrigDataOffsetString32_t name;
+    BrigOperandOffset32_t init;
+    BrigType16_t type;
+
+    //+hcode bool isArray();
+    //+implcode inline bool KLASS::isArray() { return isArrayType(type()); }
+
+    //+hcode unsigned elementType();
+    //+implcode inline unsigned KLASS::elementType() { return isArray()? arrayType2elementType(type()) : type(); }
+
+    BrigSegment8_t segment;
+    BrigAlignment8_t align;
+    BrigUInt64 dim; //.acc=subItem<UInt64> //.wtype=UInt64
+    BrigVariableModifier modifier; //.acc=subItem<VariableModifier> //.wtype=VariableModifier
+    BrigLinkage8_t linkage;
+    BrigAllocation8_t allocation;
+    uint8_t reserved; //.defValue=0
+};
+
+// Module directive: module name, HSAIL version, profile, machine model
+// and default float rounding mode.
+struct BrigDirectiveModule {
+    BrigBase base;
+    BrigDataOffsetString32_t name;
+    BrigVersion32_t hsailMajor; //.wtype=ValRef<uint32_t>
+    BrigVersion32_t hsailMinor; //.wtype=ValRef<uint32_t>
+    BrigProfile8_t profile;
+    BrigMachineModel8_t machineModel;
+    BrigRound8_t defaultFloatRound;
+    uint8_t reserved; //.defValue=0
+};
+
+// BrigInstBase: common layout of every instruction — opcode, result/
+// operation type, and the operand list. All BrigInst* formats below
+// embed this as their first member. The "//+hcode/+implcode" directives
+// add an operand(index) accessor to the generated wrapper.
+struct BrigInstBase { //.wname=Inst //.generic //.parent=BrigCode
+    BrigBase base;
+    BrigOpcode16_t opcode;
+    BrigType16_t type;
+    BrigDataOffsetOperandList32_t operands;
+
+    //+hcode Operand operand(int index);
+    //+implcode inline Operand KLASS::operand(int index) { return operands()[index]; }
+};
+
+// Instruction format adding a segment (opcodes tagged //.k=ADDR, e.g. lda).
+struct BrigInstAddr {
+    BrigInstBase base;
+    BrigSegment8_t segment;
+    uint8_t reserved[3]; //.defValue=0
+};
+
+// Instruction format for atomic/atomicnoret: segment, memory order and
+// scope, the atomic operation, and an equivalence class.
+struct BrigInstAtomic {
+    BrigInstBase base;
+    BrigSegment8_t segment;
+    BrigMemoryOrder8_t memoryOrder;
+    BrigMemoryScope8_t memoryScope;
+    BrigAtomicOperation8_t atomicOperation;
+    uint8_t equivClass;
+    uint8_t reserved[3]; //.defValue=0
+};
+
+// Instruction format with no extra fields (opcodes tagged //.k=BASIC).
+struct BrigInstBasic {
+    BrigInstBase base;
+};
+
+// Instruction format for branches/barriers/calls (//.k=BR): adds a
+// width modifier.
+struct BrigInstBr {
+    BrigInstBase base;
+    BrigWidth8_t width;
+    uint8_t reserved[3]; //.defValue=0
+};
+
+// Instruction format for cmp: source type, ALU modifier, compare
+// operation and packing control.
+struct BrigInstCmp {
+    BrigInstBase base;
+    BrigType16_t sourceType;
+    BrigAluModifier modifier; //.acc=subItem<AluModifier> //.wtype=AluModifier
+    BrigCompareOperation8_t compare;
+    BrigPack8_t pack;
+    uint8_t reserved[3]; //.defValue=0
+};
+
+// Instruction format for cvt: source type, ALU modifier and rounding.
+struct BrigInstCvt {
+    BrigInstBase base;
+    BrigType16_t sourceType;
+    BrigAluModifier modifier; //.acc=subItem<AluModifier> //.wtype=AluModifier
+    BrigRound8_t round;
+};
+
+// Instruction format for image loads/stores/reads (//.k=IMAGE): image
+// and coordinate types, geometry, and an equivalence class.
+struct BrigInstImage {
+    BrigInstBase base;
+    BrigType16_t imageType;
+    BrigType16_t coordType;
+    BrigImageGeometry8_t geometry;
+    uint8_t equivClass;
+    uint16_t reserved; //.defValue=0
+};
+
+struct BrigInstLane {
+ BrigInstBase base;
+ BrigType16_t sourceType;
+ BrigWidth8_t width;
+ uint8_t reserved; //.defValue=0
+};
+
+struct BrigInstMem {
+ BrigInstBase base;
+ BrigSegment8_t segment;
+ BrigAlignment8_t align;
+ uint8_t equivClass;
+ BrigWidth8_t width;
+ BrigMemoryModifier modifier; //.acc=subItem<MemoryModifier> //.wtype=MemoryModifier
+ uint8_t reserved[3]; //.defValue=0
+};
+
+struct BrigInstMemFence {
+ BrigInstBase base;
+ BrigMemoryOrder8_t memoryOrder;
+ BrigMemoryScope8_t globalSegmentMemoryScope;
+ BrigMemoryScope8_t groupSegmentMemoryScope;
+ BrigMemoryScope8_t imageSegmentMemoryScope;
+};
+
+struct BrigInstMod {
+ BrigInstBase base;
+ BrigAluModifier modifier; //.acc=subItem<AluModifier> //.wtype=AluModifier
+ BrigRound8_t round;
+ BrigPack8_t pack;
+ uint8_t reserved; //.defValue=0
+};
+
+struct BrigInstQueryImage {
+ BrigInstBase base;
+ BrigType16_t imageType;
+ BrigImageGeometry8_t geometry;
+ BrigImageQuery8_t imageQuery;
+};
+
+struct BrigInstQuerySampler {
+ BrigInstBase base;
+ BrigSamplerQuery8_t samplerQuery;
+ uint8_t reserved[3]; //.defValue=0
+};
+
+struct BrigInstQueue {
+ BrigInstBase base;
+ BrigSegment8_t segment;
+ BrigMemoryOrder8_t memoryOrder;
+ uint16_t reserved; //.defValue=0
+};
+
+struct BrigInstSeg {
+ BrigInstBase base;
+ BrigSegment8_t segment;
+ uint8_t reserved[3]; //.defValue=0
+};
+
+struct BrigInstSegCvt {
+ BrigInstBase base;
+ BrigType16_t sourceType;
+ BrigSegment8_t segment;
+ BrigSegCvtModifier modifier; //.acc=subItem<SegCvtModifier> //.wtype=SegCvtModifier
+};
+
+struct BrigInstSignal {
+ BrigInstBase base;
+ BrigType16_t signalType;
+ BrigMemoryOrder8_t memoryOrder;
+ BrigAtomicOperation8_t signalOperation;
+};
+
+struct BrigInstSourceType {
+ BrigInstBase base;
+ BrigType16_t sourceType;
+ uint16_t reserved; //.defValue=0
+};
+
+struct BrigOperandAddress {
+ BrigBase base;
+ BrigCodeOffset32_t symbol; //.wtype=ItemRef<DirectiveVariable>
+ BrigOperandOffset32_t reg; //.wtype=ItemRef<OperandRegister>
+ BrigUInt64 offset; //.acc=subItem<UInt64> //.wtype=UInt64
+};
+
+struct BrigOperandAlign {
+ BrigBase base;
+ BrigAlignment8_t align;
+ uint8_t reserved[3]; //.defValue=0
+};
+
+struct BrigOperandCodeList {
+ BrigBase base;
+ BrigDataOffsetCodeList32_t elements;
+
+ //+hcode unsigned elementCount();
+ //+implcode inline unsigned KLASS::elementCount() { return elements().size(); }
+ //+hcode Code elements(int index);
+ //+implcode inline Code KLASS::elements(int index) { return elements()[index]; }
+};
+
+struct BrigOperandCodeRef {
+ BrigBase base;
+ BrigCodeOffset32_t ref;
+};
+
+struct BrigOperandConstantBytes {
+ BrigBase base;
+ BrigType16_t type; //.defValue=0
+ uint16_t reserved; //.defValue=0
+ BrigDataOffsetString32_t bytes;
+};
+
+struct BrigOperandConstantOperandList {
+ BrigBase base;
+ BrigType16_t type;
+ uint16_t reserved; //.defValue=0
+ BrigDataOffsetOperandList32_t elements;
+
+ //+hcode unsigned elementCount();
+ //+implcode inline unsigned KLASS::elementCount() { return elements().size(); }
+ //+hcode Operand elements(int index);
+ //+implcode inline Operand KLASS::elements(int index) { return elements()[index]; }
+};
+
+struct BrigOperandConstantImage {
+ BrigBase base;
+ BrigType16_t type;
+ BrigImageGeometry8_t geometry;
+ BrigImageChannelOrder8_t channelOrder;
+ BrigImageChannelType8_t channelType;
+ uint8_t reserved[3]; //.defValue=0
+ BrigUInt64 width; //.acc=subItem<UInt64> //.wtype=UInt64
+ BrigUInt64 height; //.acc=subItem<UInt64> //.wtype=UInt64
+ BrigUInt64 depth; //.acc=subItem<UInt64> //.wtype=UInt64
+ BrigUInt64 array; //.acc=subItem<UInt64> //.wtype=UInt64
+};
+
+struct BrigOperandOperandList {
+ BrigBase base;
+ BrigDataOffsetOperandList32_t elements;
+
+ //+hcode unsigned elementCount();
+ //+implcode inline unsigned KLASS::elementCount() { return elements().size(); }
+ //+hcode Operand elements(int index);
+ //+implcode inline Operand KLASS::elements(int index) { return elements()[index]; }
+};
+
+struct BrigOperandRegister {
+ BrigBase base;
+ BrigRegisterKind16_t regKind;
+ uint16_t regNum;
+};
+
+struct BrigOperandConstantSampler {
+ BrigBase base;
+ BrigType16_t type;
+ BrigSamplerCoordNormalization8_t coord;
+ BrigSamplerFilter8_t filter;
+ BrigSamplerAddressing8_t addressing;
+ uint8_t reserved[3]; //.defValue=0
+};
+
+struct BrigOperandString {
+ BrigBase base;
+ BrigDataOffsetString32_t string;
+};
+
+struct BrigOperandWavesize {
+ BrigBase base;
+};
+
+//.ignore{
+
+enum BrigExceptionsMask {
+ BRIG_EXCEPTIONS_INVALID_OPERATION = 1 << 0,
+ BRIG_EXCEPTIONS_DIVIDE_BY_ZERO = 1 << 1,
+ BRIG_EXCEPTIONS_OVERFLOW = 1 << 2,
+ BRIG_EXCEPTIONS_UNDERFLOW = 1 << 3,
+ BRIG_EXCEPTIONS_INEXACT = 1 << 4,
+
+ BRIG_EXCEPTIONS_FIRST_USER_DEFINED = 1 << 16
+};
+
+struct BrigSectionHeader {
+ uint64_t byteCount;
+ uint32_t headerByteCount;
+ uint32_t nameLength;
+ uint8_t name[1];
+};
+
+#define MODULE_IDENTIFICATION_LENGTH (8)
+
+struct BrigModuleHeader {
+ char identification[MODULE_IDENTIFICATION_LENGTH];
+ BrigVersion32_t brigMajor;
+ BrigVersion32_t brigMinor;
+ uint64_t byteCount;
+ uint8_t hash[64];
+ uint32_t reserved;
+ uint32_t sectionCount;
+ uint64_t sectionIndex;
+};
+
+typedef BrigModuleHeader* BrigModule_t;
+
+#endif // defined(INCLUDED_BRIG_H)
+//}
diff --git a/src/arch/hsail/SConscript b/src/arch/hsail/SConscript
new file mode 100644
index 000000000..3455823a6
--- /dev/null
+++ b/src/arch/hsail/SConscript
@@ -0,0 +1,54 @@
+# -*- mode:python -*-
+
+# Copyright (c) 2015 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Author: Anthony Gutierrez
+#
+
+Import('*')
+
+if not env['BUILD_GPU']:
+ Return()
+
+if env['TARGET_GPU_ISA'] == 'hsail':
+ env.Command(['insts/gen_decl.hh', 'gpu_decoder.cc', 'insts/gen_exec.cc'],
+ 'gen.py', '$SOURCE $TARGETS')
+
+ Source('generic_types.cc')
+ Source('gpu_decoder.cc')
+ Source('insts/branch.cc')
+ Source('insts/gen_exec.cc')
+ Source('insts/gpu_static_inst.cc')
+ Source('insts/main.cc')
+ Source('insts/pseudo_inst.cc')
+ Source('insts/mem.cc')
+ Source('operand.cc')
diff --git a/src/arch/hsail/SConsopts b/src/arch/hsail/SConsopts
new file mode 100644
index 000000000..641963c82
--- /dev/null
+++ b/src/arch/hsail/SConsopts
@@ -0,0 +1,40 @@
+# -*- mode:python -*-
+
+#
+# Copyright (c) 2015 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Author: Anthony Gutierrez
+#
+
+Import('*')
+
+all_gpu_isa_list.append('hsail')
diff --git a/src/arch/hsail/gen.py b/src/arch/hsail/gen.py
new file mode 100755
index 000000000..f2996019b
--- /dev/null
+++ b/src/arch/hsail/gen.py
@@ -0,0 +1,806 @@
+#! /usr/bin/python
+
+#
+# Copyright (c) 2015 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Author: Steve Reinhardt
+#
+
+import sys, re
+
+from m5.util import code_formatter
+
+if len(sys.argv) != 4:
+    print >> sys.stderr, "Error: need 3 args (file names)"
+    sys.exit(1)
+
+header_code = code_formatter()
+decoder_code = code_formatter()
+exec_code = code_formatter()
+
+###############
+#
+# Generate file prologs (includes etc.)
+#
+###############
+
+header_code('''
+#include "arch/hsail/insts/decl.hh"
+#include "base/bitfield.hh"
+#include "gpu-compute/hsail_code.hh"
+#include "gpu-compute/wavefront.hh"
+
+namespace HsailISA
+{
+''')
+header_code.indent()
+
+decoder_code('''
+#include "arch/hsail/gpu_decoder.hh"
+#include "arch/hsail/insts/branch.hh"
+#include "arch/hsail/insts/decl.hh"
+#include "arch/hsail/insts/gen_decl.hh"
+#include "arch/hsail/insts/mem.hh"
+#include "arch/hsail/insts/mem_impl.hh"
+#include "gpu-compute/brig_object.hh"
+
+namespace HsailISA
+{
+ std::vector<GPUStaticInst*> Decoder::decodedInsts;
+
+ GPUStaticInst*
+ Decoder::decode(MachInst machInst)
+ {
+ using namespace Brig;
+
+ const BrigInstBase *ib = machInst.brigInstBase;
+ const BrigObject *obj = machInst.brigObj;
+
+ switch(ib->opcode) {
+''')
+decoder_code.indent()
+decoder_code.indent()
+
+exec_code('''
+#include "arch/hsail/insts/gen_decl.hh"
+#include "base/intmath.hh"
+
+namespace HsailISA
+{
+''')
+exec_code.indent()
+
+###############
+#
+# Define code templates for class declarations (for header file)
+#
+###############
+
+# Basic header template for an instruction with no template parameters.
+header_template_nodt = '''
+class $class_name : public $base_class
+{
+ public:
+ typedef $base_class Base;
+
+ $class_name(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ : Base(ib, obj, "$opcode")
+ {
+ }
+
+ void execute(GPUDynInstPtr gpuDynInst);
+};
+
+'''
+
+# Basic header template for an instruction with a single DataType
+# template parameter.
+header_template_1dt = '''
+template<typename DataType>
+class $class_name : public $base_class<DataType>
+{
+ public:
+ typedef $base_class<DataType> Base;
+ typedef typename DataType::CType CType;
+
+ $class_name(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ : Base(ib, obj, "$opcode")
+ {
+ }
+
+ void execute(GPUDynInstPtr gpuDynInst);
+};
+
+'''
+
+header_template_1dt_noexec = '''
+template<typename DataType>
+class $class_name : public $base_class<DataType>
+{
+ public:
+ typedef $base_class<DataType> Base;
+ typedef typename DataType::CType CType;
+
+ $class_name(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ : Base(ib, obj, "$opcode")
+ {
+ }
+};
+
+'''
+
+# Same as header_template_1dt, except the base class has a second
+# template parameter NumSrcOperands to allow a variable number of
+# source operands. Note that since this is implemented with an array,
+# it only works for instructions where all sources are of the same
+# type (like most arithmetics).
+header_template_1dt_varsrcs = '''
+template<typename DataType>
+class $class_name : public $base_class<DataType, $num_srcs>
+{
+ public:
+ typedef $base_class<DataType, $num_srcs> Base;
+ typedef typename DataType::CType CType;
+
+ $class_name(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ : Base(ib, obj, "$opcode")
+ {
+ }
+
+ void execute(GPUDynInstPtr gpuDynInst);
+};
+
+'''
+
+# Header template for instruction with two DataType template
+# parameters, one for the dest and one for the source. This is used
+# by compare and convert.
+header_template_2dt = '''
+template<typename DestDataType, class SrcDataType>
+class $class_name : public $base_class<DestDataType, SrcDataType>
+{
+ public:
+ typedef $base_class<DestDataType, SrcDataType> Base;
+ typedef typename DestDataType::CType DestCType;
+ typedef typename SrcDataType::CType SrcCType;
+
+ $class_name(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ : Base(ib, obj, "$opcode")
+ {
+ }
+
+ void execute(GPUDynInstPtr gpuDynInst);
+};
+
+'''
+
+header_templates = {
+ 'ArithInst': header_template_1dt_varsrcs,
+ 'CmovInst': header_template_1dt,
+ 'ClassInst': header_template_1dt,
+ 'ShiftInst': header_template_1dt,
+ 'ExtractInsertInst': header_template_1dt,
+ 'CmpInst': header_template_2dt,
+ 'CvtInst': header_template_2dt,
+ 'LdInst': '',
+ 'StInst': '',
+ 'SpecialInstNoSrc': header_template_nodt,
+ 'SpecialInst1Src': header_template_nodt,
+ 'SpecialInstNoSrcNoDest': '',
+}
+
+###############
+#
+# Define code templates for exec functions
+#
+###############
+
+# exec function body
+exec_template_nodt_nosrc = '''
+void
+$class_name::execute(GPUDynInstPtr gpuDynInst)
+{
+ Wavefront *w = gpuDynInst->wavefront();
+
+ typedef Base::DestCType DestCType;
+
+ const VectorMask &mask = w->get_pred();
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ DestCType dest_val = $expr;
+ this->dest.set(w, lane, dest_val);
+ }
+ }
+}
+
+'''
+
+exec_template_nodt_1src = '''
+void
+$class_name::execute(GPUDynInstPtr gpuDynInst)
+{
+ Wavefront *w = gpuDynInst->wavefront();
+
+ typedef Base::DestCType DestCType;
+ typedef Base::SrcCType SrcCType;
+
+ const VectorMask &mask = w->get_pred();
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ SrcCType src_val0 = this->src0.get<SrcCType>(w, lane);
+ DestCType dest_val = $expr;
+
+ this->dest.set(w, lane, dest_val);
+ }
+ }
+}
+
+'''
+
+exec_template_1dt_varsrcs = '''
+template<typename DataType>
+void
+$class_name<DataType>::execute(GPUDynInstPtr gpuDynInst)
+{
+ Wavefront *w = gpuDynInst->wavefront();
+
+ const VectorMask &mask = w->get_pred();
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ CType dest_val;
+ if ($dest_is_src_flag) {
+ dest_val = this->dest.template get<CType>(w, lane);
+ }
+
+ CType src_val[$num_srcs];
+
+ for (int i = 0; i < $num_srcs; ++i) {
+ src_val[i] = this->src[i].template get<CType>(w, lane);
+ }
+
+ dest_val = (CType)($expr);
+
+ this->dest.set(w, lane, dest_val);
+ }
+ }
+}
+
+'''
+
+exec_template_1dt_3srcs = '''
+template<typename DataType>
+void
+$class_name<DataType>::execute(GPUDynInstPtr gpuDynInst)
+{
+ Wavefront *w = gpuDynInst->wavefront();
+
+ typedef typename Base::Src0CType Src0T;
+ typedef typename Base::Src1CType Src1T;
+ typedef typename Base::Src2CType Src2T;
+
+ const VectorMask &mask = w->get_pred();
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ CType dest_val;
+
+ if ($dest_is_src_flag) {
+ dest_val = this->dest.template get<CType>(w, lane);
+ }
+
+ Src0T src_val0 = this->src0.template get<Src0T>(w, lane);
+ Src1T src_val1 = this->src1.template get<Src1T>(w, lane);
+ Src2T src_val2 = this->src2.template get<Src2T>(w, lane);
+
+ dest_val = $expr;
+
+ this->dest.set(w, lane, dest_val);
+ }
+ }
+}
+
+'''
+
+exec_template_1dt_2src_1dest = '''
+template<typename DataType>
+void
+$class_name<DataType>::execute(GPUDynInstPtr gpuDynInst)
+{
+ Wavefront *w = gpuDynInst->wavefront();
+
+ typedef typename Base::DestCType DestT;
+ typedef CType Src0T;
+ typedef typename Base::Src1CType Src1T;
+
+ const VectorMask &mask = w->get_pred();
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ DestT dest_val;
+ if ($dest_is_src_flag) {
+ dest_val = this->dest.template get<DestT>(w, lane);
+ }
+ Src0T src_val0 = this->src0.template get<Src0T>(w, lane);
+ Src1T src_val1 = this->src1.template get<Src1T>(w, lane);
+
+ dest_val = $expr;
+
+ this->dest.set(w, lane, dest_val);
+ }
+ }
+}
+
+'''
+
+exec_template_shift = '''
+template<typename DataType>
+void
+$class_name<DataType>::execute(GPUDynInstPtr gpuDynInst)
+{
+ Wavefront *w = gpuDynInst->wavefront();
+
+ const VectorMask &mask = w->get_pred();
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ CType dest_val;
+
+ if ($dest_is_src_flag) {
+ dest_val = this->dest.template get<CType>(w, lane);
+ }
+
+ CType src_val0 = this->src0.template get<CType>(w, lane);
+ uint32_t src_val1 = this->src1.template get<uint32_t>(w, lane);
+
+ dest_val = $expr;
+
+ this->dest.set(w, lane, dest_val);
+ }
+ }
+}
+
+'''
+
+exec_template_2dt = '''
+template<typename DestDataType, class SrcDataType>
+void
+$class_name<DestDataType, SrcDataType>::execute(GPUDynInstPtr gpuDynInst)
+{
+ Wavefront *w = gpuDynInst->wavefront();
+
+ const VectorMask &mask = w->get_pred();
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ DestCType dest_val;
+ SrcCType src_val[$num_srcs];
+
+ for (int i = 0; i < $num_srcs; ++i) {
+ src_val[i] = this->src[i].template get<SrcCType>(w, lane);
+ }
+
+ dest_val = $expr;
+
+ this->dest.set(w, lane, dest_val);
+ }
+ }
+}
+
+'''
+
+exec_templates = {
+ 'ArithInst': exec_template_1dt_varsrcs,
+ 'CmovInst': exec_template_1dt_3srcs,
+ 'ExtractInsertInst': exec_template_1dt_3srcs,
+ 'ClassInst': exec_template_1dt_2src_1dest,
+ 'CmpInst': exec_template_2dt,
+ 'CvtInst': exec_template_2dt,
+ 'LdInst': '',
+ 'StInst': '',
+ 'SpecialInstNoSrc': exec_template_nodt_nosrc,
+ 'SpecialInst1Src': exec_template_nodt_1src,
+ 'SpecialInstNoSrcNoDest': '',
+}
+
+###############
+#
+# Define code templates for the decoder cases
+#
+###############
+
+# decode template for nodt-opcode case
+decode_nodt_template = '''
+ case BRIG_OPCODE_$brig_opcode_upper: return $constructor(ib, obj);'''
+
+decode_case_prolog_class_inst = '''
+ case BRIG_OPCODE_$brig_opcode_upper:
+ {
+ //const BrigOperandBase *baseOp = obj->getOperand(ib->operands[1]);
+ BrigType16_t type = ((BrigInstSourceType*)ib)->sourceType;
+ //switch (baseOp->kind) {
+ // case BRIG_OPERAND_REG:
+ // type = ((const BrigOperandReg*)baseOp)->type;
+ // break;
+ // case BRIG_OPERAND_IMMED:
+ // type = ((const BrigOperandImmed*)baseOp)->type;
+ // break;
+ // default:
+ // fatal("CLASS unrecognized kind of operand %d\\n",
+ // baseOp->kind);
+ //}
+ switch (type) {'''
+
+# common prolog for 1dt- or 2dt-opcode case: switch on data type
+decode_case_prolog = '''
+ case BRIG_OPCODE_$brig_opcode_upper:
+ {
+ switch (ib->type) {'''
+
+# single-level decode case entry (for 1dt opcodes)
+decode_case_entry = \
+' case BRIG_TYPE_$type_name: return $constructor(ib, obj);'
+
+decode_store_prolog = \
+' case BRIG_TYPE_$type_name: {'
+
+decode_store_case_epilog = '''
+ }'''
+
+decode_store_case_entry = \
+' return $constructor(ib, obj);'
+
+# common epilog for type switch
+decode_case_epilog = '''
+ default: fatal("$brig_opcode_upper: unrecognized type %d\\n",
+ ib->type);
+ }
+ }
+ break;'''
+
+# Additional templates for nested decode on a second type field (for
+# compare and convert). These are used in place of the
+# decode_case_entry template to create a second-level switch on on the
+# second type field inside each case of the first-level type switch.
+# Because the name and location of the second type can vary, the Brig
+# instruction type must be provided in $brig_type, and the name of the
+# second type field must be provided in $type_field.
+decode_case2_prolog = '''
+ case BRIG_TYPE_$type_name:
+ switch (((Brig$brig_type*)ib)->$type2_field) {'''
+
+decode_case2_entry = \
+' case BRIG_TYPE_$type2_name: return $constructor(ib, obj);'
+
+decode_case2_epilog = '''
+ default: fatal("$brig_opcode_upper: unrecognized $type2_field %d\\n",
+ ((Brig$brig_type*)ib)->$type2_field);
+ }
+ break;'''
+
+# Figure out how many source operands an expr needs by looking for the
+# highest-numbered srcN value referenced. Since sources are numbered
+# starting at 0, the return value is N+1.
+def num_src_operands(expr):
+ if expr.find('src2') != -1:
+ return 3
+ elif expr.find('src1') != -1:
+ return 2
+ elif expr.find('src0') != -1:
+ return 1
+ else:
+ return 0
+
+###############
+#
+# Define final code generation methods
+#
+# The gen_nodt, and gen_1dt, and gen_2dt methods are the interface for
+# generating actual instructions.
+#
+###############
+
+# Generate class declaration, exec function, and decode switch case
+# for an brig_opcode with a single-level type switch. The 'types'
+# parameter is a list or tuple of types for which the instruction
+# should be instantiated.
+def gen(brig_opcode, types=None, expr=None, base_class='ArithInst',
+ type2_info=None, constructor_prefix='new ', is_store=False):
+ brig_opcode_upper = brig_opcode.upper()
+ class_name = brig_opcode
+ opcode = class_name.lower()
+
+ if base_class == 'ArithInst':
+ # note that expr must be provided with ArithInst so we can
+ # derive num_srcs for the template
+ assert expr
+
+ if expr:
+ # Derive several bits of info from expr. If expr is not used,
+ # this info will be irrelevant.
+ num_srcs = num_src_operands(expr)
+ # if the RHS expression includes 'dest', then we're doing an RMW
+ # on the reg and we need to treat it like a source
+ dest_is_src = expr.find('dest') != -1
+ dest_is_src_flag = str(dest_is_src).lower() # for C++
+ if base_class in ['ShiftInst']:
+ expr = re.sub(r'\bsrc(\d)\b', r'src_val\1', expr)
+ elif base_class in ['ArithInst', 'CmpInst', 'CvtInst']:
+ expr = re.sub(r'\bsrc(\d)\b', r'src_val[\1]', expr)
+ else:
+ expr = re.sub(r'\bsrc(\d)\b', r'src_val\1', expr)
+ expr = re.sub(r'\bdest\b', r'dest_val', expr)
+
+ # Strip template arguments off of base class before looking up
+ # appropriate templates
+ base_class_base = re.sub(r'<.*>$', '', base_class)
+ header_code(header_templates[base_class_base])
+
+ if base_class.startswith('SpecialInst'):
+ exec_code(exec_templates[base_class_base])
+ elif base_class.startswith('ShiftInst'):
+ header_code(exec_template_shift)
+ else:
+ header_code(exec_templates[base_class_base])
+
+ if not types or isinstance(types, str):
+ # Just a single type
+ constructor = constructor_prefix + class_name
+ decoder_code(decode_nodt_template)
+ else:
+ # multiple types, need at least one level of decode
+ if brig_opcode == 'Class':
+ decoder_code(decode_case_prolog_class_inst)
+ else:
+ decoder_code(decode_case_prolog)
+ if not type2_info:
+ if is_store == False:
+ # single list of types, to basic one-level decode
+ for type_name in types:
+ full_class_name = '%s<%s>' % (class_name, type_name.upper())
+ constructor = constructor_prefix + full_class_name
+ decoder_code(decode_case_entry)
+ else:
+ # single list of types, to basic one-level decode
+ for type_name in types:
+ decoder_code(decode_store_prolog)
+ type_size = int(re.findall(r'[0-9]+', type_name)[0])
+ src_size = 32
+ type_type = type_name[0]
+ full_class_name = '%s<%s,%s>' % (class_name, \
+ type_name.upper(), \
+ '%s%d' % \
+ (type_type.upper(), \
+ type_size))
+ constructor = constructor_prefix + full_class_name
+ decoder_code(decode_store_case_entry)
+ decoder_code(decode_store_case_epilog)
+ else:
+ # need secondary type switch (convert, compare)
+ # unpack extra info on second switch
+ (type2_field, types2) = type2_info
+ brig_type = 'Inst%s' % brig_opcode
+ for type_name in types:
+ decoder_code(decode_case2_prolog)
+ fmt = '%s<%s,%%s>' % (class_name, type_name.upper())
+ for type2_name in types2:
+ full_class_name = fmt % type2_name.upper()
+ constructor = constructor_prefix + full_class_name
+ decoder_code(decode_case2_entry)
+
+ decoder_code(decode_case2_epilog)
+
+ decoder_code(decode_case_epilog)
+
+###############
+#
+# Generate instructions
+#
+###############
+
+# handy abbreviations for common sets of types
+
+# arithmetic ops are typically defined only on 32- and 64-bit sizes
+arith_int_types = ('S32', 'U32', 'S64', 'U64')
+arith_float_types = ('F32', 'F64')
+arith_types = arith_int_types + arith_float_types
+
+bit_types = ('B1', 'B32', 'B64')
+
+all_int_types = ('S8', 'U8', 'S16', 'U16') + arith_int_types
+
+# I think you might be able to do 'f16' memory ops too, but we'll
+# ignore them for now.
+mem_types = all_int_types + arith_float_types
+mem_atom_types = all_int_types + ('B32', 'B64')
+
+##### Arithmetic & logical operations
+gen('Add', arith_types, 'src0 + src1')
+gen('Sub', arith_types, 'src0 - src1')
+gen('Mul', arith_types, 'src0 * src1')
+gen('Div', arith_types, 'src0 / src1')
+gen('Min', arith_types, 'std::min(src0, src1)')
+gen('Max', arith_types, 'std::max(src0, src1)')
+gen('Gcnmin', arith_types, 'std::min(src0, src1)')
+
+gen('CopySign', arith_float_types,
+ 'src1 < 0 ? -std::abs(src0) : std::abs(src0)')
+gen('Sqrt', arith_float_types, 'sqrt(src0)')
+gen('Floor', arith_float_types, 'floor(src0)')
+
+# "fast" sqrt... same as slow for us
+gen('Nsqrt', arith_float_types, 'sqrt(src0)')
+gen('Nrsqrt', arith_float_types, '1.0/sqrt(src0)')
+gen('Nrcp', arith_float_types, '1.0/src0')
+gen('Fract', arith_float_types,
+ '(src0 >= 0.0)?(src0-floor(src0)):(floor(src0)-src0)')
+
+gen('Ncos', arith_float_types, 'cos(src0)');
+gen('Nsin', arith_float_types, 'sin(src0)');
+
+gen('And', bit_types, 'src0 & src1')
+gen('Or', bit_types, 'src0 | src1')
+gen('Xor', bit_types, 'src0 ^ src1')
+
+gen('Bitselect', bit_types, '(src1 & src0) | (src2 & ~src0)')
+gen('Firstbit',bit_types, 'firstbit(src0)')
+gen('Popcount', ('B32', 'B64'), '__builtin_popcount(src0)')
+
+gen('Shl', arith_int_types, 'src0 << (unsigned)src1', 'ShiftInst')
+gen('Shr', arith_int_types, 'src0 >> (unsigned)src1', 'ShiftInst')
+
+# gen('Mul_hi', types=('s32','u32', '??'))
+# gen('Mul24', types=('s32','u32', '??'))
+gen('Rem', arith_int_types, 'src0 - ((src0 / src1) * src1)')
+
+gen('Abs', arith_types, 'std::abs(src0)')
+gen('Neg', arith_types, '-src0')
+
+gen('Mov', bit_types, 'src0')
+gen('Not', bit_types, 'heynot(src0)')
+
+# mad and fma differ only in rounding behavior, which we don't emulate
+# also there's an integer form of mad, but not of fma
+gen('Mad', arith_types, 'src0 * src1 + src2')
+gen('Fma', arith_float_types, 'src0 * src1 + src2')
+
+#native floating point operations
+gen('Nfma', arith_float_types, 'src0 * src1 + src2')
+
+gen('Cmov', bit_types, 'src0 ? src1 : src2', 'CmovInst')
+gen('BitAlign', bit_types, '(src0 << src2)|(src1 >> (32 - src2))')
+gen('ByteAlign', bit_types, '(src0 << 8 * src2)|(src1 >> (32 - 8 * src2))')
+
+# see base/bitfield.hh
+gen('BitExtract', arith_int_types, 'bits(src0, src1, src1 + src2 - 1)',
+ 'ExtractInsertInst')
+
+gen('BitInsert', arith_int_types, 'insertBits(dest, src1, src2, src0)',
+ 'ExtractInsertInst')
+
+##### Compare
+gen('Cmp', ('B1', 'S32', 'U32', 'F32'), 'compare(src0, src1, this->cmpOp)',
+ 'CmpInst', ('sourceType', arith_types + bit_types))
+gen('Class', arith_float_types, 'fpclassify(src0,src1)','ClassInst')
+
+##### Conversion
+
+# Conversion operations are only defined on B1, not B32 or B64
+cvt_types = ('B1',) + mem_types
+
+gen('Cvt', cvt_types, 'src0', 'CvtInst', ('sourceType', cvt_types))
+
+
+##### Load & Store
+gen('Lda', mem_types, base_class = 'LdInst', constructor_prefix='decode')
+gen('Ld', mem_types, base_class = 'LdInst', constructor_prefix='decode')
+gen('St', mem_types, base_class = 'StInst', constructor_prefix='decode',
+ is_store=True)
+gen('Atomic', mem_atom_types, base_class='StInst', constructor_prefix='decode')
+gen('AtomicNoRet', mem_atom_types, base_class='StInst',
+ constructor_prefix='decode')
+
+gen('Cbr', base_class = 'LdInst', constructor_prefix='decode')
+gen('Br', base_class = 'LdInst', constructor_prefix='decode')
+
+##### Special operations
+def gen_special(brig_opcode, expr, dest_type='U32'):
+ num_srcs = num_src_operands(expr)
+ if num_srcs == 0:
+ base_class = 'SpecialInstNoSrc<%s>' % dest_type
+ elif num_srcs == 1:
+ base_class = 'SpecialInst1Src<%s>' % dest_type
+ else:
+        assert False
+
+ gen(brig_opcode, None, expr, base_class)
+
+gen_special('WorkItemId', 'w->workitemid[src0][lane]')
+gen_special('WorkItemAbsId',
+ 'w->workitemid[src0][lane] + (w->workgroupid[src0] * w->workgroupsz[src0])')
+gen_special('WorkGroupId', 'w->workgroupid[src0]')
+gen_special('WorkGroupSize', 'w->workgroupsz[src0]')
+gen_special('CurrentWorkGroupSize', 'w->workgroupsz[src0]')
+gen_special('GridSize', 'w->gridsz[src0]')
+gen_special('GridGroups',
+ 'divCeil(w->gridsz[src0],w->workgroupsz[src0])')
+gen_special('LaneId', 'lane')
+gen_special('WaveId', 'w->dynwaveid')
+gen_special('Clock', 'w->computeUnit->shader->tick_cnt', 'U64')
+
+# gen_special('CU'', ')
+
+gen('Ret', base_class='SpecialInstNoSrcNoDest')
+gen('Barrier', base_class='SpecialInstNoSrcNoDest')
+gen('MemFence', base_class='SpecialInstNoSrcNoDest')
+
+# Map magic instructions to the BrigSyscall opcode
+# Magic instructions are defined in magic.hh
+#
+# In the future, real HSA kernel system calls can be implemented and coexist
+# with magic instructions.
+gen('Call', base_class='SpecialInstNoSrcNoDest')
+
+###############
+#
+# Generate file epilogs
+#
+###############
+header_code.dedent()
+header_code('''
+} // namespace HsailISA
+''')
+
+# close off main decode switch
+decoder_code.dedent()
+decoder_code.dedent()
+decoder_code('''
+ default: fatal("unrecognized Brig opcode %d\\n", ib->opcode);
+ } // end switch(ib->opcode)
+ } // end decode()
+} // namespace HsailISA
+''')
+
+exec_code.dedent()
+exec_code('''
+} // namespace HsailISA
+''')
+
+###############
+#
+# Output accumulated code to files
+#
+###############
+header_code.write(sys.argv[1])
+decoder_code.write(sys.argv[2])
+exec_code.write(sys.argv[3])
diff --git a/src/arch/hsail/generic_types.cc b/src/arch/hsail/generic_types.cc
new file mode 100644
index 000000000..0cd55d1d5
--- /dev/null
+++ b/src/arch/hsail/generic_types.cc
@@ -0,0 +1,47 @@
+#include "arch/hsail/generic_types.hh"
+#include "base/misc.hh"
+
+using namespace Brig;
+
+namespace HsailISA
+{
+    // Translate a BRIG memory-order value into the ISA-independent
+    // Enums::GenericMemoryOrder used by the rest of the GPU model.
+    // Unknown values terminate the simulation via fatal().
+    Enums::GenericMemoryOrder
+    getGenericMemoryOrder(BrigMemoryOrder brig_memory_order)
+    {
+        switch(brig_memory_order) {
+          case BRIG_MEMORY_ORDER_NONE:
+            return Enums::MEMORY_ORDER_NONE;
+          case BRIG_MEMORY_ORDER_RELAXED:
+            return Enums::MEMORY_ORDER_RELAXED;
+          case BRIG_MEMORY_ORDER_SC_ACQUIRE:
+            return Enums::MEMORY_ORDER_SC_ACQUIRE;
+          case BRIG_MEMORY_ORDER_SC_RELEASE:
+            return Enums::MEMORY_ORDER_SC_RELEASE;
+          case BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE:
+            return Enums::MEMORY_ORDER_SC_ACQUIRE_RELEASE;
+          default:
+            // Fix: the two strings were previously separate arguments to
+            // fatal(); with no format specifier in the first string, the
+            // second ("bad BrigMemoryOrder") was silently dropped from the
+            // output. Adjacent literals concatenate into one message.
+            fatal("HsailISA::getGenericMemoryOrder -> "
+                  "bad BrigMemoryOrder\n");
+        }
+    }
+
+    // Translate a BRIG memory-scope value into the ISA-independent
+    // Enums::GenericMemoryScope. Note BRIG's AGENT scope maps to the
+    // model's DEVICE scope.
+    Enums::GenericMemoryScope
+    getGenericMemoryScope(BrigMemoryScope brig_memory_scope)
+    {
+        switch(brig_memory_scope) {
+          case BRIG_MEMORY_SCOPE_NONE:
+            return Enums::MEMORY_SCOPE_NONE;
+          case BRIG_MEMORY_SCOPE_WORKITEM:
+            return Enums::MEMORY_SCOPE_WORKITEM;
+          case BRIG_MEMORY_SCOPE_WORKGROUP:
+            return Enums::MEMORY_SCOPE_WORKGROUP;
+          case BRIG_MEMORY_SCOPE_AGENT:
+            return Enums::MEMORY_SCOPE_DEVICE;
+          case BRIG_MEMORY_SCOPE_SYSTEM:
+            return Enums::MEMORY_SCOPE_SYSTEM;
+          default:
+            // Same single-message fix as above.
+            fatal("HsailISA::getGenericMemoryScope -> "
+                  "bad BrigMemoryScope\n");
+        }
+    }
+} // namespace HsailISA
diff --git a/src/arch/hsail/generic_types.hh b/src/arch/hsail/generic_types.hh
new file mode 100644
index 000000000..50e430bef
--- /dev/null
+++ b/src/arch/hsail/generic_types.hh
@@ -0,0 +1,16 @@
+#ifndef __ARCH_HSAIL_GENERIC_TYPES_HH__
+#define __ARCH_HSAIL_GENERIC_TYPES_HH__
+
+#include "arch/hsail/Brig.h"
+#include "enums/GenericMemoryOrder.hh"
+#include "enums/GenericMemoryScope.hh"
+
+namespace HsailISA
+{
+    // Translate a BRIG memory-order enum into the ISA-independent
+    // Enums::GenericMemoryOrder; fatal() on unrecognized values.
+    Enums::GenericMemoryOrder
+    getGenericMemoryOrder(Brig::BrigMemoryOrder brig_memory_order);
+    // Translate a BRIG memory-scope enum into the ISA-independent
+    // Enums::GenericMemoryScope (BRIG's AGENT maps to DEVICE).
+    Enums::GenericMemoryScope
+    getGenericMemoryScope(Brig::BrigMemoryScope brig_memory_scope);
+} // namespace HsailISA
+
+#endif // __ARCH_HSAIL_GENERIC_TYPES_HH__
diff --git a/src/arch/hsail/gpu_decoder.hh b/src/arch/hsail/gpu_decoder.hh
new file mode 100644
index 000000000..98a689664
--- /dev/null
+++ b/src/arch/hsail/gpu_decoder.hh
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#ifndef __ARCH_HSAIL_GPU_DECODER_HH__
+#define __ARCH_HSAIL_GPU_DECODER_HH__
+
+#include <vector>
+
+#include "arch/hsail/gpu_types.hh"
+
+class BrigObject;
+class GPUStaticInst;
+
+namespace Brig
+{
+ class BrigInstBase;
+}
+
+namespace HsailISA
+{
+    // Decodes BRIG instructions into GPUStaticInst objects and caches
+    // the results so each instruction is decoded only once.
+    class Decoder
+    {
+      public:
+        // Full decode from a MachInst (BrigInstBase + BrigObject);
+        // defined in the generated decoder .cc file.
+        GPUStaticInst* decode(MachInst machInst);
+
+        // Fast path: a RawMachInst is an index into the cache of
+        // previously decoded instructions. Out-of-range indices yield
+        // nullptr rather than faulting.
+        GPUStaticInst*
+        decode(RawMachInst inst)
+        {
+            return inst < decodedInsts.size() ? decodedInsts.at(inst) : nullptr;
+        }
+
+        // Append a decoded instruction to the cache and return its
+        // index, which serves as the RawMachInst from then on.
+        RawMachInst
+        saveInst(GPUStaticInst *decodedInst)
+        {
+            decodedInsts.push_back(decodedInst);
+
+            return decodedInsts.size() - 1;
+        }
+
+      private:
+        // Static: the decoded-instruction cache is shared by all
+        // Decoder instances in the simulation.
+        static std::vector<GPUStaticInst*> decodedInsts;
+    };
+} // namespace HsailISA
+
+#endif // __ARCH_HSAIL_GPU_DECODER_HH__
diff --git a/src/arch/hsail/gpu_types.hh b/src/arch/hsail/gpu_types.hh
new file mode 100644
index 000000000..4b3a66a9a
--- /dev/null
+++ b/src/arch/hsail/gpu_types.hh
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#ifndef __ARCH_HSAIL_GPU_TYPES_HH__
+#define __ARCH_HSAIL_GPU_TYPES_HH__
+
+#include <cstdint>
+
+namespace Brig
+{
+ class BrigInstBase;
+}
+
+class BrigObject;
+
+namespace HsailISA
+{
+    // A raw machine instruction represents the raw bits that
+    // our model uses to represent an actual instruction. In
+    // the case of HSAIL this is just an index into a list of
+    // instruction objects.
+    typedef uint64_t RawMachInst;
+
+    // The MachInst is a representation of an instruction
+    // that has more information than just the machine code.
+    // For HSAIL the actual machine code is a BrigInstBase
+    // and the BrigObject contains more pertinent
+    // information related to operands, etc.
+
+    struct MachInst
+    {
+        const Brig::BrigInstBase *brigInstBase;
+        const BrigObject *brigObj;
+    };
+} // namespace HsailISA
+
+#endif // __ARCH_HSAIL_GPU_TYPES_HH__
diff --git a/src/arch/hsail/insts/branch.cc b/src/arch/hsail/insts/branch.cc
new file mode 100644
index 000000000..d65279cc8
--- /dev/null
+++ b/src/arch/hsail/insts/branch.cc
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#include "arch/hsail/insts/branch.hh"
+
+#include "gpu-compute/hsail_code.hh"
+
+namespace HsailISA
+{
+    // Decode an unconditional brn: pick the indirect flavor when the
+    // sole operand (the target) is a register, otherwise direct (label).
+    GPUStaticInst*
+    decodeBrn(const Brig::BrigInstBase *ib, const BrigObject *obj)
+    {
+        // Detect direct vs indirect branch by seeing whether we have a
+        // register operand.
+        unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+        const Brig::BrigOperand *reg = obj->getOperand(op_offs);
+
+        if (reg->kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
+            return new BrnIndirectInst(ib, obj);
+        } else {
+            return new BrnDirectInst(ib, obj);
+        }
+    }
+
+    // Decode a conditional cbr: BRIG operand 0 is the condition and
+    // operand 1 the target, so the register test inspects operand 1.
+    GPUStaticInst*
+    decodeCbr(const Brig::BrigInstBase *ib, const BrigObject *obj)
+    {
+        // Detect direct vs indirect branch by seeing whether we have a
+        // second register operand (after the condition).
+        unsigned op_offs = obj->getOperandPtr(ib->operands, 1);
+        const Brig::BrigOperand *reg = obj->getOperand(op_offs);
+
+        if (reg->kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
+            return new CbrIndirectInst(ib, obj);
+        } else {
+            return new CbrDirectInst(ib, obj);
+        }
+    }
+
+    GPUStaticInst*
+    decodeBr(const Brig::BrigInstBase *ib, const BrigObject *obj)
+    {
+        // NOTE(review): br is unconditional, yet operand index 1 is
+        // inspected here (the original comment about "after the
+        // condition" looked copy-pasted from decodeCbr), while
+        // BrInstBase's constructor reads its target from operand 0.
+        // Confirm the intended operand index against the BRIG spec.
+        unsigned op_offs = obj->getOperandPtr(ib->operands, 1);
+        const Brig::BrigOperand *reg = obj->getOperand(op_offs);
+
+        if (reg->kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
+            return new BrIndirectInst(ib, obj);
+        } else {
+            return new BrDirectInst(ib, obj);
+        }
+    }
+} // namespace HsailISA
diff --git a/src/arch/hsail/insts/branch.hh b/src/arch/hsail/insts/branch.hh
new file mode 100644
index 000000000..54ad9a042
--- /dev/null
+++ b/src/arch/hsail/insts/branch.hh
@@ -0,0 +1,442 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#ifndef __ARCH_HSAIL_INSTS_BRANCH_HH__
+#define __ARCH_HSAIL_INSTS_BRANCH_HH__
+
+#include "arch/hsail/insts/gpu_static_inst.hh"
+#include "arch/hsail/operand.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/wavefront.hh"
+
+namespace HsailISA
+{
+
+ // The main difference between a direct branch and an indirect branch
+ // is whether the target is a register or a label, so we can share a
+ // lot of code if we template the base implementation on that type.
+    // Unconditional branch (brn). TargetType is LabelOperand for the
+    // direct flavor or SRegOperand for the indirect one.
+    template<typename TargetType>
+    class BrnInstBase : public HsailGPUStaticInst
+    {
+      public:
+        void generateDisassembly();
+
+        Brig::BrigWidth8_t width;
+        TargetType target;
+
+        BrnInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj)
+           : HsailGPUStaticInst(obj, "brn")
+        {
+            // Fix: o_type was redundantly assigned twice in the original
+            // constructor; one assignment is sufficient.
+            o_type = Enums::OT_BRANCH;
+            width = ((Brig::BrigInstBr*)ib)->width;
+            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+            target.init(op_offs, obj);
+        }
+
+        // brn always branches, so the taken PC is simply the target.
+        uint32_t getTargetPc() override { return target.getTarget(0, 0); }
+
+        bool unconditionalJumpInstruction() override { return true; }
+
+        // The single operand is the branch target; all register-kind
+        // queries therefore delegate to it.
+        bool isVectorRegister(int operandIndex) {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            return target.isVectorRegister();
+        }
+        bool isCondRegister(int operandIndex) {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            return target.isCondRegister();
+        }
+        bool isScalarRegister(int operandIndex) {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            return target.isScalarRegister();
+        }
+
+        // The target is only ever read, never written.
+        bool isSrcOperand(int operandIndex) {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            return true;
+        }
+
+        bool isDstOperand(int operandIndex) {
+            return false;
+        }
+
+        int getOperandSize(int operandIndex) {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            return target.opSize();
+        }
+
+        int getRegisterIndex(int operandIndex) {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            return target.regIndex();
+        }
+
+        int getNumOperands() {
+            return 1;
+        }
+
+        void execute(GPUDynInstPtr gpuDynInst);
+    };
+
+    // Disassemble as "brn[_width(N)] target"; the width clause is
+    // shown only when it differs from the default of 1.
+    template<typename TargetType>
+    void
+    BrnInstBase<TargetType>::generateDisassembly()
+    {
+        std::string widthClause;
+
+        if (width != 1) {
+            widthClause = csprintf("_width(%d)", width);
+        }
+
+        disassembly = csprintf("%s%s %s", opcode, widthClause,
+                               target.disassemble());
+    }
+
+    // Execute an unconditional branch: if the target is exactly the
+    // reconvergence PC on top of the stack, pop the stack entry;
+    // otherwise just redirect the wavefront's PC. Either way the
+    // already-fetched instructions are stale and must be discarded.
+    template<typename TargetType>
+    void
+    BrnInstBase<TargetType>::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *w = gpuDynInst->wavefront();
+
+        if (getTargetPc() == w->rpc()) {
+            w->popFromReconvergenceStack();
+        } else {
+            // Rpc and execution mask remain the same
+            w->pc(getTargetPc());
+        }
+        w->discardFetch();
+    }
+
+    // Direct brn: the target is a label, so no registers are involved.
+    class BrnDirectInst : public BrnInstBase<LabelOperand>
+    {
+      public:
+        BrnDirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj)
+            : BrnInstBase<LabelOperand>(ib, obj)
+        {
+        }
+        int numSrcRegOperands() { return 0; }
+        int numDstRegOperands() { return 0; }
+    };
+
+    // Indirect brn: the target lives in an s-register.
+    class BrnIndirectInst : public BrnInstBase<SRegOperand>
+    {
+      public:
+        BrnIndirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj)
+            : BrnInstBase<SRegOperand>(ib, obj)
+        {
+        }
+        // Relies on bool->int conversion: 1 when the target is a
+        // vector register, 0 otherwise.
+        int numSrcRegOperands() { return target.isVectorRegister(); }
+        int numDstRegOperands() { return 0; }
+    };
+
+ GPUStaticInst* decodeBrn(const Brig::BrigInstBase *ib,
+ const BrigObject *obj);
+
+    // Conditional branch (cbr). TargetType is LabelOperand for the
+    // direct flavor or SRegOperand for the indirect one; the condition
+    // always lives in a c-register.
+    template<typename TargetType>
+    class CbrInstBase : public HsailGPUStaticInst
+    {
+      public:
+        void generateDisassembly();
+
+        Brig::BrigWidth8_t width;
+        CRegOperand cond;
+        TargetType target;
+
+        CbrInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj)
+           : HsailGPUStaticInst(obj, "cbr")
+        {
+            // Fix: o_type was redundantly assigned twice in the original
+            // constructor; one assignment is sufficient.
+            o_type = Enums::OT_BRANCH;
+            width = ((Brig::BrigInstBr *)ib)->width;
+            // BRIG operand order: 0 = condition, 1 = target.
+            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+            cond.init(op_offs, obj);
+            op_offs = obj->getOperandPtr(ib->operands, 1);
+            target.init(op_offs, obj);
+        }
+
+        uint32_t getTargetPc() override { return target.getTarget(0, 0); }
+
+        void execute(GPUDynInstPtr gpuDynInst);
+        // Operand-index convention for the accessors below (note this
+        // differs from the BRIG order used in the constructor):
+        // Target is operand 0, Condition Register is operand 1.
+        bool isVectorRegister(int operandIndex) {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            if (!operandIndex)
+                return target.isVectorRegister();
+            else
+                return false;
+        }
+        bool isCondRegister(int operandIndex) {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            if (!operandIndex)
+                return target.isCondRegister();
+            else
+                return true;
+        }
+        bool isScalarRegister(int operandIndex) {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (!operandIndex)
+                return target.isScalarRegister();
+            else
+                return false;
+        }
+        bool isSrcOperand(int operandIndex) {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex == 0)
+                return true;
+            return false;
+        }
+        // both Condition Register and Target are source operands
+        bool isDstOperand(int operandIndex) {
+            return false;
+        }
+        int getOperandSize(int operandIndex) {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            if (!operandIndex)
+                return target.opSize();
+            else
+                return 1;
+        }
+        int getRegisterIndex(int operandIndex) {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            if (!operandIndex)
+                return target.regIndex();
+            else
+                return -1;
+        }
+
+        // Operands = Target, Condition Register
+        int getNumOperands() {
+            return 2;
+        }
+    };
+
+    // Disassemble as "cbr[_width(N)] cond,target"; the width clause is
+    // shown only when it differs from the default of 1.
+    template<typename TargetType>
+    void
+    CbrInstBase<TargetType>::generateDisassembly()
+    {
+        std::string widthClause;
+
+        if (width != 1) {
+            widthClause = csprintf("_width(%d)", width);
+        }
+
+        disassembly = csprintf("%s%s %s,%s", opcode, widthClause,
+                               cond.disassemble(), target.disassemble());
+    }
+
+    // Execute a conditional branch using the wavefront's reconvergence
+    // stack: the current entry is popped, then up to three entries are
+    // pushed — the immediate post-dominator (reconvergence point), the
+    // not-taken path, and the taken path — each with the subset of the
+    // execution mask that follows it. Pushing the taken path last makes
+    // it the path executed next.
+    template<typename TargetType>
+    void
+    CbrInstBase<TargetType>::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *w = gpuDynInst->wavefront();
+
+        const uint32_t curr_pc = w->pc();
+        const uint32_t curr_rpc = w->rpc();
+        const VectorMask curr_mask = w->execMask();
+
+        /**
+         * TODO: can we move this pop outside the instruction, and
+         * into the wavefront?
+         */
+        w->popFromReconvergenceStack();
+
+        // immediate post-dominator instruction
+        const uint32_t rpc = static_cast<uint32_t>(ipdInstNum());
+        if (curr_rpc != rpc) {
+            w->pushToReconvergenceStack(rpc, curr_rpc, curr_mask);
+        }
+
+        // taken branch: lanes whose condition is true AND that were
+        // active before the branch
+        const uint32_t true_pc = getTargetPc();
+        VectorMask true_mask;
+        for (unsigned int lane = 0; lane < VSZ; ++lane) {
+            true_mask[lane] = cond.get<bool>(w, lane) & curr_mask[lane];
+        }
+
+        // not taken branch: only pushed if some active lane falls
+        // through and the fall-through is not already the
+        // reconvergence point
+        const uint32_t false_pc = curr_pc + 1;
+        assert(true_pc != false_pc);
+        if (false_pc != rpc && true_mask.count() < curr_mask.count()) {
+            VectorMask false_mask = curr_mask & ~true_mask;
+            w->pushToReconvergenceStack(false_pc, rpc, false_mask);
+        }
+
+        if (true_pc != rpc && true_mask.count()) {
+            w->pushToReconvergenceStack(true_pc, rpc, true_mask);
+        }
+        // The pops/pushes above must have moved the PC somewhere.
+        assert(w->pc() != curr_pc);
+        w->discardFetch();
+    }
+
+
+    // Direct cbr: the target is a label.
+    class CbrDirectInst : public CbrInstBase<LabelOperand>
+    {
+      public:
+        CbrDirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj)
+            : CbrInstBase<LabelOperand>(ib, obj)
+        {
+        }
+        // the source operand of a conditional branch is a Condition
+        // Register which is not stored in the VRF
+        // so we do not count it as a source-register operand
+        // even though, formally, it is one.
+        int numSrcRegOperands() { return 0; }
+        int numDstRegOperands() { return 0; }
+    };
+
+    // Indirect cbr: the target lives in an s-register.
+    class CbrIndirectInst : public CbrInstBase<SRegOperand>
+    {
+      public:
+        CbrIndirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj)
+            : CbrInstBase<SRegOperand>(ib, obj)
+        {
+        }
+        // one source operand of the conditional indirect branch is a Condition
+        // register which is not stored in the VRF so we do not count it
+        // as a source-register operand even though, formally, it is one.
+        int numSrcRegOperands() { return target.isVectorRegister(); }
+        int numDstRegOperands() { return 0; }
+    };
+
+ GPUStaticInst* decodeCbr(const Brig::BrigInstBase *ib,
+ const BrigObject *obj);
+
+    // Unconditional branch (br). Unlike brn, the width modifier is an
+    // immediate operand rather than a raw BrigWidth8_t field.
+    template<typename TargetType>
+    class BrInstBase : public HsailGPUStaticInst
+    {
+      public:
+        void generateDisassembly();
+
+        ImmOperand<uint32_t> width;
+        TargetType target;
+
+        BrInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj)
+           : HsailGPUStaticInst(obj, "br")
+        {
+            // Fix: o_type was redundantly assigned twice in the original
+            // constructor; one assignment is sufficient.
+            o_type = Enums::OT_BRANCH;
+            width.init(((Brig::BrigInstBr *)ib)->width, obj);
+            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+            target.init(op_offs, obj);
+        }
+
+        // br always branches, so the taken PC is simply the target.
+        uint32_t getTargetPc() override { return target.getTarget(0, 0); }
+
+        bool unconditionalJumpInstruction() override { return true; }
+
+        void execute(GPUDynInstPtr gpuDynInst);
+        // The single operand is the branch target; all register-kind
+        // queries delegate to it.
+        bool isVectorRegister(int operandIndex) {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            return target.isVectorRegister();
+        }
+        bool isCondRegister(int operandIndex) {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            return target.isCondRegister();
+        }
+        bool isScalarRegister(int operandIndex) {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            return target.isScalarRegister();
+        }
+        bool isSrcOperand(int operandIndex) {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return true;
+        }
+        bool isDstOperand(int operandIndex) { return false; }
+        int getOperandSize(int operandIndex) {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            return target.opSize();
+        }
+        int getRegisterIndex(int operandIndex) {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            return target.regIndex();
+        }
+        int getNumOperands() { return 1; }
+    };
+
+    // Disassemble as "br[_width(N)] target"; the width clause is shown
+    // only when the immediate differs from the default of 1.
+    template<typename TargetType>
+    void
+    BrInstBase<TargetType>::generateDisassembly()
+    {
+        std::string widthClause;
+
+        if (width.bits != 1) {
+            widthClause = csprintf("_width(%d)", width.bits);
+        }
+
+        disassembly = csprintf("%s%s %s", opcode, widthClause,
+                               target.disassemble());
+    }
+
+    // Execute an unconditional branch: pop the reconvergence stack if
+    // the target is the current reconvergence PC, otherwise redirect
+    // the wavefront's PC; fetched-ahead instructions are discarded.
+    template<typename TargetType>
+    void
+    BrInstBase<TargetType>::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *w = gpuDynInst->wavefront();
+
+        if (getTargetPc() == w->rpc()) {
+            w->popFromReconvergenceStack();
+        } else {
+            // Rpc and execution mask remain the same
+            w->pc(getTargetPc());
+        }
+        w->discardFetch();
+    }
+
+    // Direct br: the target is a label, so no registers are involved.
+    class BrDirectInst : public BrInstBase<LabelOperand>
+    {
+      public:
+        BrDirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj)
+            : BrInstBase<LabelOperand>(ib, obj)
+        {
+        }
+
+        int numSrcRegOperands() { return 0; }
+        int numDstRegOperands() { return 0; }
+    };
+
+    // Indirect br: the target lives in an s-register.
+    class BrIndirectInst : public BrInstBase<SRegOperand>
+    {
+      public:
+        BrIndirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj)
+            : BrInstBase<SRegOperand>(ib, obj)
+        {
+        }
+        // Relies on bool->int conversion: 1 when the target is a
+        // vector register, 0 otherwise.
+        int numSrcRegOperands() { return target.isVectorRegister(); }
+        int numDstRegOperands() { return 0; }
+    };
+
+ GPUStaticInst* decodeBr(const Brig::BrigInstBase *ib,
+ const BrigObject *obj);
+} // namespace HsailISA
+
+#endif // __ARCH_HSAIL_INSTS_BRANCH_HH__
diff --git a/src/arch/hsail/insts/decl.hh b/src/arch/hsail/insts/decl.hh
new file mode 100644
index 000000000..e2da501b9
--- /dev/null
+++ b/src/arch/hsail/insts/decl.hh
@@ -0,0 +1,1106 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#ifndef __ARCH_HSAIL_INSTS_DECL_HH__
+#define __ARCH_HSAIL_INSTS_DECL_HH__
+
+#include <cmath>
+
+#include "arch/hsail/generic_types.hh"
+#include "arch/hsail/insts/gpu_static_inst.hh"
+#include "arch/hsail/operand.hh"
+#include "debug/HSAIL.hh"
+#include "enums/OpType.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/shader.hh"
+
+namespace HsailISA
+{
+    // Pairs the operand class used when a value appears as a
+    // destination with the one used when it appears as a source
+    // (sources may also be immediates).
+    template<typename _DestOperand, typename _SrcOperand>
+    class HsailOperandType
+    {
+      public:
+        typedef _DestOperand DestOperand;
+        typedef _SrcOperand SrcOperand;
+    };
+
+ typedef HsailOperandType<CRegOperand, CRegOrImmOperand> CRegOperandType;
+ typedef HsailOperandType<SRegOperand, SRegOrImmOperand> SRegOperandType;
+ typedef HsailOperandType<DRegOperand, DRegOrImmOperand> DRegOperandType;
+
+    // The IsBits parameter serves only to disambiguate the B* types from
+    // the U* types, which otherwise would be identical (and
+    // indistinguishable).
+    template<typename _OperandType, typename _CType, Enums::MemType _memType,
+             vgpr_type _vgprType, int IsBits=0>
+    class HsailDataType
+    {
+      public:
+        typedef _OperandType OperandType;
+        typedef _CType CType;
+        static const Enums::MemType memType = _memType;
+        static const vgpr_type vgprType = _vgprType;
+        // Printable type suffix; defined per specialization.
+        static const char *label;
+    };
+
+ typedef HsailDataType<CRegOperandType, bool, Enums::M_U8, VT_32, 1> B1;
+ typedef HsailDataType<SRegOperandType, uint8_t, Enums::M_U8, VT_32, 1> B8;
+
+ typedef HsailDataType<SRegOperandType, uint16_t,
+ Enums::M_U16, VT_32, 1> B16;
+
+ typedef HsailDataType<SRegOperandType, uint32_t,
+ Enums::M_U32, VT_32, 1> B32;
+
+ typedef HsailDataType<DRegOperandType, uint64_t,
+ Enums::M_U64, VT_64, 1> B64;
+
+ typedef HsailDataType<SRegOperandType, int8_t, Enums::M_S8, VT_32> S8;
+ typedef HsailDataType<SRegOperandType, int16_t, Enums::M_S16, VT_32> S16;
+ typedef HsailDataType<SRegOperandType, int32_t, Enums::M_S32, VT_32> S32;
+ typedef HsailDataType<DRegOperandType, int64_t, Enums::M_S64, VT_64> S64;
+
+ typedef HsailDataType<SRegOperandType, uint8_t, Enums::M_U8, VT_32> U8;
+ typedef HsailDataType<SRegOperandType, uint16_t, Enums::M_U16, VT_32> U16;
+ typedef HsailDataType<SRegOperandType, uint32_t, Enums::M_U32, VT_32> U32;
+ typedef HsailDataType<DRegOperandType, uint64_t, Enums::M_U64, VT_64> U64;
+
+ typedef HsailDataType<SRegOperandType, float, Enums::M_F32, VT_32> F32;
+ typedef HsailDataType<DRegOperandType, double, Enums::M_F64, VT_64> F64;
+
+    // Common base for instructions with one destination and
+    // NumSrcOperands sources of a uniform operand type. Operand-index
+    // convention for the accessors: indices 0..NumSrcOperands-1 are the
+    // sources, index NumSrcOperands is the destination.
+    template<typename DestOperandType, typename SrcOperandType,
+             int NumSrcOperands>
+    class CommonInstBase : public HsailGPUStaticInst
+    {
+      protected:
+        typename DestOperandType::DestOperand dest;
+        typename SrcOperandType::SrcOperand src[NumSrcOperands];
+
+        // Render as "opcode<suffix> dest,src0,...,srcN-1".
+        void
+        generateDisassembly()
+        {
+            disassembly = csprintf("%s%s %s", opcode, opcode_suffix(),
+                                   dest.disassemble());
+
+            for (int i = 0; i < NumSrcOperands; ++i) {
+                disassembly += ",";
+                disassembly += src[i].disassemble();
+            }
+        }
+
+        // Type suffix (supplied by the derived class).
+        virtual std::string opcode_suffix() = 0;
+
+      public:
+        // BRIG operand order: 0 is the destination, 1..NumSrcOperands
+        // are the sources.
+        CommonInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
+                       const char *opcode)
+            : HsailGPUStaticInst(obj, opcode)
+        {
+            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+
+            dest.init(op_offs, obj);
+
+            for (int i = 0; i < NumSrcOperands; ++i) {
+                op_offs = obj->getOperandPtr(ib->operands, i + 1);
+                src[i].init(op_offs, obj);
+            }
+        }
+
+        bool isVectorRegister(int operandIndex) {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex < NumSrcOperands)
+                return src[operandIndex].isVectorRegister();
+            else
+                return dest.isVectorRegister();
+        }
+        bool isCondRegister(int operandIndex) {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex < NumSrcOperands)
+                return src[operandIndex].isCondRegister();
+            else
+                return dest.isCondRegister();
+        }
+        bool isScalarRegister(int operandIndex) {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex < NumSrcOperands)
+                return src[operandIndex].isScalarRegister();
+            else
+                return dest.isScalarRegister();
+        }
+        bool isSrcOperand(int operandIndex) {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex < NumSrcOperands)
+                return true;
+            return false;
+        }
+
+        bool isDstOperand(int operandIndex) {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex >= NumSrcOperands)
+                return true;
+            return false;
+        }
+        int getOperandSize(int operandIndex) {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex < NumSrcOperands)
+                return src[operandIndex].opSize();
+            else
+                return dest.opSize();
+        }
+        int getRegisterIndex(int operandIndex) {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+
+            if (operandIndex < NumSrcOperands)
+                return src[operandIndex].regIndex();
+            else
+                return dest.regIndex();
+        }
+        // Counts only sources that live in vector registers (labels,
+        // immediates, etc. are excluded).
+        int numSrcRegOperands() {
+            int operands = 0;
+            for (int i = 0; i < NumSrcOperands; i++) {
+                if (src[i].isVectorRegister() == true) {
+                    operands++;
+                }
+            }
+            return operands;
+        }
+        int numDstRegOperands() { return dest.isVectorRegister(); }
+        int getNumOperands() { return NumSrcOperands + 1; }
+    };
+
+    // Arithmetic instruction whose destination and sources all share
+    // DataType's operand type; the disassembly suffix is the data
+    // type's label (e.g. "_u32").
+    template<typename DataType, int NumSrcOperands>
+    class ArithInst : public CommonInstBase<typename DataType::OperandType,
+                                            typename DataType::OperandType,
+                                            NumSrcOperands>
+    {
+      public:
+        std::string opcode_suffix() { return csprintf("_%s", DataType::label); }
+
+        ArithInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+                  const char *opcode)
+            : CommonInstBase<typename DataType::OperandType,
+                             typename DataType::OperandType,
+                             NumSrcOperands>(ib, obj, opcode)
+        {
+        }
+    };
+
    // Base class for instructions whose destination and three sources may
    // each use a different operand type. Operand index mapping for the
    // query interface: 0 -> src0, 1 -> src1, 2 -> src2, 3 -> dest.
    template<typename DestOperandType, typename Src0OperandType,
             typename Src1OperandType, typename Src2OperandType>
    class ThreeNonUniformSourceInstBase : public HsailGPUStaticInst
    {
      protected:
        typename DestOperandType::DestOperand dest;
        typename Src0OperandType::SrcOperand src0;
        typename Src1OperandType::SrcOperand src1;
        typename Src2OperandType::SrcOperand src2;

        // "opcode dest,src0,src1,src2"
        void
        generateDisassembly()
        {
            disassembly = csprintf("%s %s,%s,%s,%s", opcode, dest.disassemble(),
                                   src0.disassemble(), src1.disassemble(),
                                   src2.disassemble());
        }

      public:
        ThreeNonUniformSourceInstBase(const Brig::BrigInstBase *ib,
                                      const BrigObject *obj,
                                      const char *opcode)
            : HsailGPUStaticInst(obj, opcode)
        {
            // BRIG stores the destination first, then the three sources
            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
            dest.init(op_offs, obj);

            op_offs = obj->getOperandPtr(ib->operands, 1);
            src0.init(op_offs, obj);

            op_offs = obj->getOperandPtr(ib->operands, 2);
            src1.init(op_offs, obj);

            op_offs = obj->getOperandPtr(ib->operands, 3);
            src2.init(op_offs, obj);
        }

        bool isVectorRegister(int operandIndex) {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (!operandIndex)
                return src0.isVectorRegister();
            else if (operandIndex == 1)
                return src1.isVectorRegister();
            else if (operandIndex == 2)
                return src2.isVectorRegister();
            else
                return dest.isVectorRegister();
        }
        bool isCondRegister(int operandIndex) {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (!operandIndex)
                return src0.isCondRegister();
            else if (operandIndex == 1)
                return src1.isCondRegister();
            else if (operandIndex == 2)
                return src2.isCondRegister();
            else
                return dest.isCondRegister();
        }
        bool isScalarRegister(int operandIndex) {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (!operandIndex)
                return src0.isScalarRegister();
            else if (operandIndex == 1)
                return src1.isScalarRegister();
            else if (operandIndex == 2)
                return src2.isScalarRegister();
            else
                return dest.isScalarRegister();
        }
        // indices 0..2 are sources, index 3 is the destination
        bool isSrcOperand(int operandIndex) {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex < 3)
                return true;
            else
                return false;
        }
        bool isDstOperand(int operandIndex) {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex >= 3)
                return true;
            else
                return false;
        }
        int getOperandSize(int operandIndex) {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (!operandIndex)
                return src0.opSize();
            else if (operandIndex == 1)
                return src1.opSize();
            else if (operandIndex == 2)
                return src2.opSize();
            else
                return dest.opSize();
        }
        int getRegisterIndex(int operandIndex) {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (!operandIndex)
                return src0.regIndex();
            else if (operandIndex == 1)
                return src1.regIndex();
            else if (operandIndex == 2)
                return src2.regIndex();
            else
                return dest.regIndex();
        }

        // number of sources held in vector registers
        int numSrcRegOperands() {
            int operands = 0;
            if (src0.isVectorRegister() == true) {
                operands++;
            }
            if (src1.isVectorRegister() == true) {
                operands++;
            }
            if (src2.isVectorRegister() == true) {
                operands++;
            }
            return operands;
        }
        // 1 when the destination is a vector register, 0 otherwise
        int numDstRegOperands() { return dest.isVectorRegister(); }
        int getNumOperands() { return 4; }
    };
+
+ template<typename DestDataType, typename Src0DataType,
+ typename Src1DataType, typename Src2DataType>
+ class ThreeNonUniformSourceInst :
+ public ThreeNonUniformSourceInstBase<typename DestDataType::OperandType,
+ typename Src0DataType::OperandType,
+ typename Src1DataType::OperandType,
+ typename Src2DataType::OperandType>
+ {
+ public:
+ typedef typename DestDataType::CType DestCType;
+ typedef typename Src0DataType::CType Src0CType;
+ typedef typename Src1DataType::CType Src1CType;
+ typedef typename Src2DataType::CType Src2CType;
+
+ ThreeNonUniformSourceInst(const Brig::BrigInstBase *ib,
+ const BrigObject *obj, const char *opcode)
+ : ThreeNonUniformSourceInstBase<typename DestDataType::OperandType,
+ typename Src0DataType::OperandType,
+ typename Src1DataType::OperandType,
+ typename Src2DataType::OperandType>(ib,
+ obj, opcode)
+ {
+ }
+ };
+
+ template<typename DataType>
+ class CmovInst : public ThreeNonUniformSourceInst<DataType, B1,
+ DataType, DataType>
+ {
+ public:
+ CmovInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *opcode)
+ : ThreeNonUniformSourceInst<DataType, B1, DataType,
+ DataType>(ib, obj, opcode)
+ {
+ }
+ };
+
+ template<typename DataType>
+ class ExtractInsertInst : public ThreeNonUniformSourceInst<DataType,
+ DataType, U32,
+ U32>
+ {
+ public:
+ ExtractInsertInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *opcode)
+ : ThreeNonUniformSourceInst<DataType, DataType, U32,
+ U32>(ib, obj, opcode)
+ {
+ }
+ };
+
+ template<typename DestOperandType, typename Src0OperandType,
+ typename Src1OperandType>
+ class TwoNonUniformSourceInstBase : public HsailGPUStaticInst
+ {
+ protected:
+ typename DestOperandType::DestOperand dest;
+ typename Src0OperandType::SrcOperand src0;
+ typename Src1OperandType::SrcOperand src1;
+
+ void
+ generateDisassembly()
+ {
+ disassembly = csprintf("%s %s,%s,%s", opcode, dest.disassemble(),
+ src0.disassemble(), src1.disassemble());
+ }
+
+
+ public:
+ TwoNonUniformSourceInstBase(const Brig::BrigInstBase *ib,
+ const BrigObject *obj, const char *opcode)
+ : HsailGPUStaticInst(obj, opcode)
+ {
+ unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+ dest.init(op_offs, obj);
+
+ op_offs = obj->getOperandPtr(ib->operands, 1);
+ src0.init(op_offs, obj);
+
+ op_offs = obj->getOperandPtr(ib->operands, 2);
+ src1.init(op_offs, obj);
+ }
+ bool isVectorRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (!operandIndex)
+ return src0.isVectorRegister();
+ else if (operandIndex == 1)
+ return src1.isVectorRegister();
+ else
+ return dest.isVectorRegister();
+ }
+ bool isCondRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (!operandIndex)
+ return src0.isCondRegister();
+ else if (operandIndex == 1)
+ return src1.isCondRegister();
+ else
+ return dest.isCondRegister();
+ }
+ bool isScalarRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (!operandIndex)
+ return src0.isScalarRegister();
+ else if (operandIndex == 1)
+ return src1.isScalarRegister();
+ else
+ return dest.isScalarRegister();
+ }
+ bool isSrcOperand(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (operandIndex < 2)
+ return true;
+ else
+ return false;
+ }
+ bool isDstOperand(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (operandIndex >= 2)
+ return true;
+ else
+ return false;
+ }
+ int getOperandSize(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (!operandIndex)
+ return src0.opSize();
+ else if (operandIndex == 1)
+ return src1.opSize();
+ else
+ return dest.opSize();
+ }
+ int getRegisterIndex(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (!operandIndex)
+ return src0.regIndex();
+ else if (operandIndex == 1)
+ return src1.regIndex();
+ else
+ return dest.regIndex();
+ }
+
+ int numSrcRegOperands() {
+ int operands = 0;
+ if (src0.isVectorRegister() == true) {
+ operands++;
+ }
+ if (src1.isVectorRegister() == true) {
+ operands++;
+ }
+ return operands;
+ }
+ int numDstRegOperands() { return dest.isVectorRegister(); }
+ int getNumOperands() { return 3; }
+ };
+
+ template<typename DestDataType, typename Src0DataType,
+ typename Src1DataType>
+ class TwoNonUniformSourceInst :
+ public TwoNonUniformSourceInstBase<typename DestDataType::OperandType,
+ typename Src0DataType::OperandType,
+ typename Src1DataType::OperandType>
+ {
+ public:
+ typedef typename DestDataType::CType DestCType;
+ typedef typename Src0DataType::CType Src0CType;
+ typedef typename Src1DataType::CType Src1CType;
+
+ TwoNonUniformSourceInst(const Brig::BrigInstBase *ib,
+ const BrigObject *obj, const char *opcode)
+ : TwoNonUniformSourceInstBase<typename DestDataType::OperandType,
+ typename Src0DataType::OperandType,
+ typename Src1DataType::OperandType>(ib,
+ obj, opcode)
+ {
+ }
+ };
+
+ // helper function for ClassInst
+ template<typename T>
+ bool
+ fpclassify(T src0, uint32_t src1)
+ {
+ int fpclass = std::fpclassify(src0);
+
+ if ((src1 & 0x3) && (fpclass == FP_NAN)) {
+ return true;
+ }
+
+ if (src0 <= -0.0) {
+ if ((src1 & 0x4) && fpclass == FP_INFINITE)
+ return true;
+ if ((src1 & 0x8) && fpclass == FP_NORMAL)
+ return true;
+ if ((src1 & 0x10) && fpclass == FP_SUBNORMAL)
+ return true;
+ if ((src1 & 0x20) && fpclass == FP_ZERO)
+ return true;
+ } else {
+ if ((src1 & 0x40) && fpclass == FP_ZERO)
+ return true;
+ if ((src1 & 0x80) && fpclass == FP_SUBNORMAL)
+ return true;
+ if ((src1 & 0x100) && fpclass == FP_NORMAL)
+ return true;
+ if ((src1 & 0x200) && fpclass == FP_INFINITE)
+ return true;
+ }
+ return false;
+ }
+
+ template<typename DataType>
+ class ClassInst : public TwoNonUniformSourceInst<B1, DataType, U32>
+ {
+ public:
+ ClassInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *opcode)
+ : TwoNonUniformSourceInst<B1, DataType, U32>(ib, obj, opcode)
+ {
+ }
+ };
+
+ template<typename DataType>
+ class ShiftInst : public TwoNonUniformSourceInst<DataType, DataType, U32>
+ {
+ public:
+ ShiftInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *opcode)
+ : TwoNonUniformSourceInst<DataType, DataType, U32>(ib, obj, opcode)
+ {
+ }
+ };
+
+ // helper function for CmpInst
+ template<typename T>
+ bool
+ compare(T src0, T src1, Brig::BrigCompareOperation cmpOp)
+ {
+ using namespace Brig;
+
+ switch (cmpOp) {
+ case BRIG_COMPARE_EQ:
+ case BRIG_COMPARE_EQU:
+ case BRIG_COMPARE_SEQ:
+ case BRIG_COMPARE_SEQU:
+ return (src0 == src1);
+
+ case BRIG_COMPARE_NE:
+ case BRIG_COMPARE_NEU:
+ case BRIG_COMPARE_SNE:
+ case BRIG_COMPARE_SNEU:
+ return (src0 != src1);
+
+ case BRIG_COMPARE_LT:
+ case BRIG_COMPARE_LTU:
+ case BRIG_COMPARE_SLT:
+ case BRIG_COMPARE_SLTU:
+ return (src0 < src1);
+
+ case BRIG_COMPARE_LE:
+ case BRIG_COMPARE_LEU:
+ case BRIG_COMPARE_SLE:
+ case BRIG_COMPARE_SLEU:
+ return (src0 <= src1);
+
+ case BRIG_COMPARE_GT:
+ case BRIG_COMPARE_GTU:
+ case BRIG_COMPARE_SGT:
+ case BRIG_COMPARE_SGTU:
+ return (src0 > src1);
+
+ case BRIG_COMPARE_GE:
+ case BRIG_COMPARE_GEU:
+ case BRIG_COMPARE_SGE:
+ case BRIG_COMPARE_SGEU:
+ return (src0 >= src1);
+
+ case BRIG_COMPARE_NUM:
+ case BRIG_COMPARE_SNUM:
+ return (src0 == src0) || (src1 == src1);
+
+ case BRIG_COMPARE_NAN:
+ case BRIG_COMPARE_SNAN:
+ return (src0 != src0) || (src1 != src1);
+
+ default:
+ fatal("Bad cmpOp value %d\n", (int)cmpOp);
+ }
+ }
+
+ template<typename T>
+ int32_t
+ firstbit(T src0)
+ {
+ if (!src0)
+ return -1;
+
+ //handle positive and negative numbers
+ T tmp = (src0 < 0) ? (~src0) : (src0);
+
+ //the starting pos is MSB
+ int pos = 8 * sizeof(T) - 1;
+ int cnt = 0;
+
+ //search the first bit set to 1
+ while (!(tmp & (1 << pos))) {
+ ++cnt;
+ --pos;
+ }
+ return cnt;
+ }
+
+ const char* cmpOpToString(Brig::BrigCompareOperation cmpOp);
+
+ template<typename DestOperandType, typename SrcOperandType>
+ class CmpInstBase : public CommonInstBase<DestOperandType, SrcOperandType,
+ 2>
+ {
+ protected:
+ Brig::BrigCompareOperation cmpOp;
+
+ public:
+ CmpInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *_opcode)
+ : CommonInstBase<DestOperandType, SrcOperandType, 2>(ib, obj,
+ _opcode)
+ {
+ assert(ib->base.kind == Brig::BRIG_KIND_INST_CMP);
+ Brig::BrigInstCmp *i = (Brig::BrigInstCmp*)ib;
+ cmpOp = (Brig::BrigCompareOperation)i->compare;
+ }
+ };
+
+ template<typename DestDataType, typename SrcDataType>
+ class CmpInst : public CmpInstBase<typename DestDataType::OperandType,
+ typename SrcDataType::OperandType>
+ {
+ public:
+ std::string
+ opcode_suffix()
+ {
+ return csprintf("_%s_%s_%s", cmpOpToString(this->cmpOp),
+ DestDataType::label, SrcDataType::label);
+ }
+
+ CmpInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *_opcode)
+ : CmpInstBase<typename DestDataType::OperandType,
+ typename SrcDataType::OperandType>(ib, obj, _opcode)
+ {
+ }
+ };
+
+ template<typename DestDataType, typename SrcDataType>
+ class CvtInst : public CommonInstBase<typename DestDataType::OperandType,
+ typename SrcDataType::OperandType, 1>
+ {
+ public:
+ std::string opcode_suffix()
+ {
+ return csprintf("_%s_%s", DestDataType::label, SrcDataType::label);
+ }
+
+ CvtInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *_opcode)
+ : CommonInstBase<typename DestDataType::OperandType,
+ typename SrcDataType::OperandType,
+ 1>(ib, obj, _opcode)
+ {
+ }
+ };
+
    // Base class for special instructions that reference no operands at
    // all (e.g. ret, barrier, memfence); every operand query reports
    // "nothing here".
    class SpecialInstNoSrcNoDest : public HsailGPUStaticInst
    {
      public:
        SpecialInstNoSrcNoDest(const Brig::BrigInstBase *ib,
                               const BrigObject *obj, const char *_opcode)
            : HsailGPUStaticInst(obj, _opcode)
        {
        }

        bool isVectorRegister(int operandIndex) { return false; }
        bool isCondRegister(int operandIndex) { return false; }
        bool isScalarRegister(int operandIndex) { return false; }
        bool isSrcOperand(int operandIndex) { return false; }
        bool isDstOperand(int operandIndex) { return false; }
        int getOperandSize(int operandIndex) { return 0; }
        // -1: no register backs any operand index
        int getRegisterIndex(int operandIndex) { return -1; }

        int numSrcRegOperands() { return 0; }
        int numDstRegOperands() { return 0; }
        int getNumOperands() { return 0; }
    };
+
    // Base class for special instructions that write one destination and
    // read no sources; the destination is the single exposed operand.
    template<typename DestOperandType>
    class SpecialInstNoSrcBase : public HsailGPUStaticInst
    {
      protected:
        typename DestOperandType::DestOperand dest;

        // "opcode dest"
        void generateDisassembly()
        {
            disassembly = csprintf("%s %s", opcode, dest.disassemble());
        }

      public:
        SpecialInstNoSrcBase(const Brig::BrigInstBase *ib,
                             const BrigObject *obj, const char *_opcode)
            : HsailGPUStaticInst(obj, _opcode)
        {
            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
            dest.init(op_offs, obj);
        }

        // every valid operand index refers to the destination
        bool isVectorRegister(int operandIndex) {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return dest.isVectorRegister();
        }
        bool isCondRegister(int operandIndex) {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return dest.isCondRegister();
        }
        bool isScalarRegister(int operandIndex) {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return dest.isScalarRegister();
        }
        bool isSrcOperand(int operandIndex) { return false; }
        bool isDstOperand(int operandIndex) { return true; }
        int getOperandSize(int operandIndex) {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return dest.opSize();
        }
        int getRegisterIndex(int operandIndex) {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return dest.regIndex();
        }
        int numSrcRegOperands() { return 0; }
        // 1 when the destination is a vector register, 0 otherwise
        int numDstRegOperands() { return dest.isVectorRegister(); }
        int getNumOperands() { return 1; }
    };
+
+ template<typename DestDataType>
+ class SpecialInstNoSrc :
+ public SpecialInstNoSrcBase<typename DestDataType::OperandType>
+ {
+ public:
+ typedef typename DestDataType::CType DestCType;
+
+ SpecialInstNoSrc(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *_opcode)
+ : SpecialInstNoSrcBase<typename DestDataType::OperandType>(ib, obj,
+ _opcode)
+ {
+ }
+ };
+
    // Base class for special instructions that write one destination and
    // take a single immediate source (held as an int).
    template<typename DestOperandType>
    class SpecialInst1SrcBase : public HsailGPUStaticInst
    {
      protected:
        typedef int SrcCType; // used in execute() template

        typename DestOperandType::DestOperand dest;
        ImmOperand<SrcCType> src0;

        // "opcode dest,src0"
        void
        generateDisassembly()
        {
            disassembly = csprintf("%s %s,%s", opcode, dest.disassemble(),
                                   src0.disassemble());
        }

      public:
        SpecialInst1SrcBase(const Brig::BrigInstBase *ib,
                            const BrigObject *obj, const char *_opcode)
            : HsailGPUStaticInst(obj, _opcode)
        {
            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
            dest.init(op_offs, obj);

            op_offs = obj->getOperandPtr(ib->operands, 1);
            src0.init(op_offs, obj);
        }
        // NOTE(review): only the destination is exposed via the operand
        // query interface; the immediate source is not counted by
        // getNumOperands() -- presumably because an immediate never
        // occupies a register. Confirm against register-usage consumers.
        bool isVectorRegister(int operandIndex) {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return dest.isVectorRegister();
        }
        bool isCondRegister(int operandIndex) {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return dest.isCondRegister();
        }
        bool isScalarRegister(int operandIndex) {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return dest.isScalarRegister();
        }
        bool isSrcOperand(int operandIndex) { return false; }
        bool isDstOperand(int operandIndex) { return true; }
        int getOperandSize(int operandIndex) {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return dest.opSize();
        }
        int getRegisterIndex(int operandIndex) {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return dest.regIndex();
        }
        int numSrcRegOperands() { return 0; }
        // 1 when the destination is a vector register, 0 otherwise
        int numDstRegOperands() { return dest.isVectorRegister(); }
        int getNumOperands() { return 1; }
    };
+
+ template<typename DestDataType>
+ class SpecialInst1Src :
+ public SpecialInst1SrcBase<typename DestDataType::OperandType>
+ {
+ public:
+ typedef typename DestDataType::CType DestCType;
+
+ SpecialInst1Src(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *_opcode)
+ : SpecialInst1SrcBase<typename DestDataType::OperandType>(ib, obj,
+ _opcode)
+ {
+ }
+ };
+
    // HSAIL ret instruction: no operands; tagged with the OT_RET
    // operation type for the pipeline.
    class Ret : public SpecialInstNoSrcNoDest
    {
      public:
        typedef SpecialInstNoSrcNoDest Base;

        Ret(const Brig::BrigInstBase *ib, const BrigObject *obj)
            : Base(ib, obj, "ret")
        {
            o_type = Enums::OT_RET;
        }

        void execute(GPUDynInstPtr gpuDynInst);
    };
+
    // HSAIL barrier instruction: no register operands; records the BRIG
    // width modifier from the branch-format instruction encoding.
    class Barrier : public SpecialInstNoSrcNoDest
    {
      public:
        typedef SpecialInstNoSrcNoDest Base;
        // BRIG width modifier -- NOTE(review): presumably the number of
        // work-items participating; confirm against the HSAIL spec.
        uint8_t width;

        Barrier(const Brig::BrigInstBase *ib, const BrigObject *obj)
            : Base(ib, obj, "barrier")
        {
            o_type = Enums::OT_BARRIER;
            // barriers are encoded using the branch instruction format
            assert(ib->base.kind == Brig::BRIG_KIND_INST_BR);
            width = (uint8_t)((Brig::BrigInstBr*)ib)->width;
        }

        void execute(GPUDynInstPtr gpuDynInst);
    };
+
    // HSAIL memfence instruction. Decodes the memory order and the
    // per-segment (global/group/image) memory scopes from BRIG, and maps
    // the scope combination onto an operation type (o_type) used to route
    // the fence at execute time.
    class MemFence : public SpecialInstNoSrcNoDest
    {
      public:
        typedef SpecialInstNoSrcNoDest Base;

        Brig::BrigMemoryOrder memFenceMemOrder;
        Brig::BrigMemoryScope memFenceScopeSegGroup;
        Brig::BrigMemoryScope memFenceScopeSegGlobal;
        Brig::BrigMemoryScope memFenceScopeSegImage;

        MemFence(const Brig::BrigInstBase *ib, const BrigObject *obj)
            : Base(ib, obj, "memfence")
        {
            assert(ib->base.kind == Brig::BRIG_KIND_INST_MEM_FENCE);

            memFenceScopeSegGlobal = (Brig::BrigMemoryScope)
                ((Brig::BrigInstMemFence*)ib)->globalSegmentMemoryScope;

            memFenceScopeSegGroup = (Brig::BrigMemoryScope)
                ((Brig::BrigInstMemFence*)ib)->groupSegmentMemoryScope;

            memFenceScopeSegImage = (Brig::BrigMemoryScope)
                ((Brig::BrigInstMemFence*)ib)->imageSegmentMemoryScope;

            memFenceMemOrder = (Brig::BrigMemoryOrder)
                ((Brig::BrigInstMemFence*)ib)->memoryOrder;

            // set o_type based on scopes: a fence must name at least one
            // of the global or group segments (image scope is recorded
            // but does not influence o_type here)
            if (memFenceScopeSegGlobal != Brig::BRIG_MEMORY_SCOPE_NONE &&
                memFenceScopeSegGroup != Brig::BRIG_MEMORY_SCOPE_NONE) {
                o_type = Enums::OT_BOTH_MEMFENCE;
            } else if (memFenceScopeSegGlobal != Brig::BRIG_MEMORY_SCOPE_NONE) {
                o_type = Enums::OT_GLOBAL_MEMFENCE;
            } else if (memFenceScopeSegGroup != Brig::BRIG_MEMORY_SCOPE_NONE) {
                o_type = Enums::OT_SHARED_MEMFENCE;
            } else {
                fatal("MemFence constructor: bad scope specifiers\n");
            }
        }

        // Issue the fence into the memory system via the compute unit.
        void
        initiateAcc(GPUDynInstPtr gpuDynInst)
        {
            Wavefront *wave = gpuDynInst->wavefront();
            wave->computeUnit->injectGlobalMemFence(gpuDynInst);
        }

        void
        execute(GPUDynInstPtr gpuDynInst)
        {
            Wavefront *w = gpuDynInst->wavefront();
            // 2 cases:
            //   * memfence to a sequentially consistent memory (e.g., LDS).
            //     These can be handled as no-ops.
            //   * memfence to a relaxed consistency cache (e.g., Hermes, Viper,
            //     etc.). We send a packet, tagged with the memory order and
            //     scope, and let the GPU coalescer handle it.

            if (o_type == Enums::OT_GLOBAL_MEMFENCE ||
                o_type == Enums::OT_BOTH_MEMFENCE) {
                // stamp the dynamic instruction with its issue context
                gpuDynInst->simdId = w->simdId;
                gpuDynInst->wfSlotId = w->wfSlotId;
                gpuDynInst->wfDynId = w->wfDynId;
                gpuDynInst->kern_id = w->kern_id;
                gpuDynInst->cu_id = w->computeUnit->cu_id;

                gpuDynInst->memoryOrder =
                    getGenericMemoryOrder(memFenceMemOrder);
                gpuDynInst->scope =
                    getGenericMemoryScope(memFenceScopeSegGlobal);
                gpuDynInst->useContinuation = false;
                GlobalMemPipeline* gmp = &(w->computeUnit->globalMemoryPipe);
                gmp->getGMReqFIFO().push(gpuDynInst);

                // NOTE(review): the fence appears to have been counted as
                // both a pending read and a pending write at issue; undo
                // both and track it as one outstanding request -- confirm
                // against the Wavefront issue bookkeeping.
                w->wr_gm_reqs_in_pipe--;
                w->rd_gm_reqs_in_pipe--;
                w->mem_reqs_in_pipe--;
                w->outstanding_reqs++;
            } else if (o_type == Enums::OT_SHARED_MEMFENCE) {
                // no-op
            } else {
                fatal("MemFence execute: bad o_type\n");
            }
        }
    };
+
    // HSAIL call instruction. Native HSAIL function calls are not
    // implemented; calls whose target name contains "__gem5_hsail_op" are
    // treated as pseudo instructions and dispatched to the Magic* hooks.
    class Call : public HsailGPUStaticInst
    {
      public:
        // private helper functions
        void calcAddr(Wavefront* w, GPUDynInstPtr m);

        // "call src0 (src1)" or "call src0 (dest) (src1)" when there are
        // return arguments
        void
        generateDisassembly()
        {
            if (dest.disassemble() == "") {
                disassembly = csprintf("%s %s (%s)", opcode, src0.disassemble(),
                                       src1.disassemble());
            } else {
                disassembly = csprintf("%s %s (%s) (%s)", opcode,
                                       src0.disassemble(), dest.disassemble(),
                                       src1.disassemble());
            }
        }

        // true when the call target is a gem5 pseudo-op rather than a
        // real HSAIL function
        bool
        isPseudoOp()
        {
            std::string func_name = src0.disassemble();
            if (func_name.find("__gem5_hsail_op") != std::string::npos) {
                return true;
            }
            return false;
        }

        // member variables
        ListOperand dest;          // return-argument list
        FunctionRefOperand src0;   // called function
        ListOperand src1;          // input-argument list
        HsailCode *func_ptr;       // resolved code; null for pseudo-ops

        // exec function for pseudo instructions mapped on top of call opcode
        void execPseudoInst(Wavefront *w, GPUDynInstPtr gpuDynInst);

        // user-defined pseudo instructions
        void MagicPrintLane(Wavefront *w);
        void MagicPrintLane64(Wavefront *w);
        void MagicPrintWF32(Wavefront *w);
        void MagicPrintWF64(Wavefront *w);
        void MagicPrintWFFloat(Wavefront *w);
        void MagicSimBreak(Wavefront *w);
        void MagicPrefixSum(Wavefront *w);
        void MagicReduction(Wavefront *w);
        void MagicMaskLower(Wavefront *w);
        void MagicMaskUpper(Wavefront *w);
        void MagicJoinWFBar(Wavefront *w);
        void MagicWaitWFBar(Wavefront *w);
        void MagicPanic(Wavefront *w);

        void MagicAtomicNRAddGlobalU32Reg(Wavefront *w,
                                          GPUDynInstPtr gpuDynInst);

        void MagicAtomicNRAddGroupU32Reg(Wavefront *w,
                                         GPUDynInstPtr gpuDynInst);

        void MagicLoadGlobalU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst);

        void MagicXactCasLd(Wavefront *w);
        void MagicMostSigThread(Wavefront *w);
        void MagicMostSigBroadcast(Wavefront *w);

        void MagicPrintWF32ID(Wavefront *w);
        void MagicPrintWFID64(Wavefront *w);

        Call(const Brig::BrigInstBase *ib, const BrigObject *obj)
            : HsailGPUStaticInst(obj, "call")
        {
            // BRIG operand order: return args, function ref, input args
            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
            dest.init(op_offs, obj);
            op_offs = obj->getOperandPtr(ib->operands, 1);
            src0.init(op_offs, obj);

            func_ptr = nullptr;
            std::string func_name = src0.disassemble();
            // real (non-pseudo) targets must resolve at decode time
            if (!isPseudoOp()) {
                func_ptr = dynamic_cast<HsailCode*>(obj->
                                                    getFunction(func_name));

                if (!func_ptr)
                    fatal("call::exec cannot find function: %s\n", func_name);
            }

            op_offs = obj->getOperandPtr(ib->operands, 2);
            src1.init(op_offs, obj);
        }

        // the argument lists are not exposed via the register-operand
        // query interface
        bool isVectorRegister(int operandIndex) { return false; }
        bool isCondRegister(int operandIndex) { return false; }
        bool isScalarRegister(int operandIndex) { return false; }
        bool isSrcOperand(int operandIndex) { return false; }
        bool isDstOperand(int operandIndex) { return false; }
        int getOperandSize(int operandIndex) { return 0; }
        int getRegisterIndex(int operandIndex) { return -1; }

        void
        execute(GPUDynInstPtr gpuDynInst)
        {
            Wavefront *w = gpuDynInst->wavefront();

            std::string func_name = src0.disassemble();
            if (isPseudoOp()) {
                execPseudoInst(w, gpuDynInst);
            } else {
                fatal("Native HSAIL functions are not yet implemented: %s\n",
                      func_name);
            }
        }
        int numSrcRegOperands() { return 0; }
        int numDstRegOperands() { return 0; }
        int getNumOperands() { return 2; }
    };
+
+ template<typename T> T heynot(T arg) { return ~arg; }
+ template<> inline bool heynot<bool>(bool arg) { return !arg; }
+} // namespace HsailISA
+
+#endif // __ARCH_HSAIL_INSTS_DECL_HH__
diff --git a/src/arch/hsail/insts/gpu_static_inst.cc b/src/arch/hsail/insts/gpu_static_inst.cc
new file mode 100644
index 000000000..bbaeb13e6
--- /dev/null
+++ b/src/arch/hsail/insts/gpu_static_inst.cc
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#include "arch/hsail/insts/gpu_static_inst.hh"
+
+#include "gpu-compute/brig_object.hh"
+
+namespace HsailISA
+{
    // Record the code object currently being parsed (obj->currentCode) so
    // the instruction can later refer back to its enclosing HSAIL code.
    HsailGPUStaticInst::HsailGPUStaticInst(const BrigObject *obj,
                                           const std::string &opcode)
        : GPUStaticInst(opcode), hsailCode(obj->currentCode)
    {
    }
+
    // Default disassembly is just the opcode text; subclasses with
    // operands override this to append their operand strings.
    void
    HsailGPUStaticInst::generateDisassembly()
    {
        disassembly = opcode;
    }
+
    // Return the cached disassembly string, generating it lazily on the
    // first request.
    const std::string&
    HsailGPUStaticInst::disassemble()
    {
        if (disassembly.empty()) {
            generateDisassembly();
            assert(!disassembly.empty());
        }

        return disassembly;
    }
+} // namespace HsailISA
diff --git a/src/arch/hsail/insts/gpu_static_inst.hh b/src/arch/hsail/insts/gpu_static_inst.hh
new file mode 100644
index 000000000..29aab1f70
--- /dev/null
+++ b/src/arch/hsail/insts/gpu_static_inst.hh
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#ifndef __ARCH_HSAIL_INSTS_GPU_STATIC_INST_HH__
+#define __ARCH_HSAIL_INSTS_GPU_STATIC_INST_HH__
+
+/*
+ * @file gpu_static_inst.hh
+ *
+ * Defines the base class representing HSAIL GPU static instructions.
+ */
+
+#include "gpu-compute/gpu_static_inst.hh"
+
+class BrigObject;
+class HsailCode;
+
+namespace HsailISA
+{
    // Base class for all HSAIL static instructions: adds lazy disassembly
    // caching and a back-reference to the HSAIL code object the
    // instruction was decoded from.
    class HsailGPUStaticInst : public GPUStaticInst
    {
      public:
        HsailGPUStaticInst(const BrigObject *obj, const std::string &opcode);
        void generateDisassembly();
        const std::string &disassemble();
        // fixed 4-byte size reported for every HSAIL instruction
        uint32_t instSize() { return 4; }

      protected:
        // the code object this instruction belongs to
        HsailCode *hsailCode;
    };
+} // namespace HsailISA
+
+#endif // __ARCH_HSAIL_INSTS_GPU_STATIC_INST_HH__
diff --git a/src/arch/hsail/insts/main.cc b/src/arch/hsail/insts/main.cc
new file mode 100644
index 000000000..4e70bf46a
--- /dev/null
+++ b/src/arch/hsail/insts/main.cc
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#include "arch/hsail/insts/decl.hh"
+#include "debug/GPUExec.hh"
+#include "gpu-compute/dispatcher.hh"
+#include "gpu-compute/simple_pool_manager.hh"
+
+namespace HsailISA
+{
+    // Textual labels for the HSAIL data types (b = bit, s = signed int,
+    // u = unsigned int, f = floating point; the suffix is the bit width).
+    template<> const char *B1::label = "b1";
+    template<> const char *B8::label = "b8";
+    template<> const char *B16::label = "b16";
+    template<> const char *B32::label = "b32";
+    template<> const char *B64::label = "b64";
+
+    template<> const char *S8::label = "s8";
+    template<> const char *S16::label = "s16";
+    template<> const char *S32::label = "s32";
+    template<> const char *S64::label = "s64";
+
+    template<> const char *U8::label = "u8";
+    template<> const char *U16::label = "u16";
+    template<> const char *U32::label = "u32";
+    template<> const char *U64::label = "u64";
+
+    template<> const char *F32::label = "f32";
+    template<> const char *F64::label = "f64";
+
+    // Return the HSAIL mnemonic for a BRIG compare operation, or
+    // "unknown" for any value not covered below.
+    const char*
+    cmpOpToString(Brig::BrigCompareOperation cmpOp)
+    {
+        using namespace Brig;
+
+        switch (cmpOp) {
+          case BRIG_COMPARE_EQ:   return "eq";
+          case BRIG_COMPARE_NE:   return "ne";
+          case BRIG_COMPARE_LT:   return "lt";
+          case BRIG_COMPARE_LE:   return "le";
+          case BRIG_COMPARE_GT:   return "gt";
+          case BRIG_COMPARE_GE:   return "ge";
+          case BRIG_COMPARE_EQU:  return "equ";
+          case BRIG_COMPARE_NEU:  return "neu";
+          case BRIG_COMPARE_LTU:  return "ltu";
+          case BRIG_COMPARE_LEU:  return "leu";
+          case BRIG_COMPARE_GTU:  return "gtu";
+          case BRIG_COMPARE_GEU:  return "geu";
+          case BRIG_COMPARE_NUM:  return "num";
+          case BRIG_COMPARE_NAN:  return "nan";
+          case BRIG_COMPARE_SEQ:  return "seq";
+          case BRIG_COMPARE_SNE:  return "sne";
+          case BRIG_COMPARE_SLT:  return "slt";
+          case BRIG_COMPARE_SLE:  return "sle";
+          case BRIG_COMPARE_SGT:  return "sgt";
+          case BRIG_COMPARE_SGE:  return "sge";
+          case BRIG_COMPARE_SGEU: return "sgeu";
+          case BRIG_COMPARE_SEQU: return "sequ";
+          case BRIG_COMPARE_SNEU: return "sneu";
+          case BRIG_COMPARE_SLTU: return "sltu";
+          case BRIG_COMPARE_SLEU: return "sleu";
+          case BRIG_COMPARE_SNUM: return "snum";
+          case BRIG_COMPARE_SNAN: return "snan";
+          case BRIG_COMPARE_SGTU: return "sgtu";
+          default:                return "unknown";
+        }
+    }
+
+    // Retire the work-items covered by the current predicate mask; when
+    // the whole wavefront has completed, release its LDS reference and
+    // vector registers, and either fence (last WG member) or let the
+    // dispatcher schedule more work.
+    void
+    Ret::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *w = gpuDynInst->wavefront();
+
+        const VectorMask &mask = w->get_pred();
+
+        // mask off completed work-items
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (mask[lane]) {
+                w->init_mask[lane] = 0;
+            }
+
+        }
+
+        // delete extra instructions fetched for completed work-items
+        w->instructionBuffer.erase(w->instructionBuffer.begin() + 1,
+                                   w->instructionBuffer.end());
+        if (w->pendingFetch) {
+            w->dropFetch = true;
+        }
+
+        // if all work-items have completed, then wave-front is done
+        if (w->init_mask.none()) {
+            w->status = Wavefront::S_STOPPED;
+
+            // one fewer wavefront of this work-group still holds the LDS
+            int32_t refCount = w->computeUnit->getLds().
+                                   decreaseRefCounter(w->dispatchid, w->wg_id);
+
+            DPRINTF(GPUExec, "CU%d: decrease ref ctr WG[%d] to [%d]\n",
+                    w->computeUnit->cu_id, w->wg_id, refCount);
+
+            // free the vector registers of the completed wavefront
+            w->computeUnit->vectorRegsReserved[w->simdId] -=
+                w->reservedVectorRegs;
+
+            assert(w->computeUnit->vectorRegsReserved[w->simdId] >= 0);
+
+            // the VRF region wraps modulo the physical register count
+            uint32_t endIndex = (w->startVgprIndex +
+                                 w->reservedVectorRegs - 1) %
+                w->computeUnit->vrf[w->simdId]->numRegs();
+
+            w->computeUnit->vrf[w->simdId]->manager->
+                freeRegion(w->startVgprIndex, endIndex);
+
+            w->reservedVectorRegs = 0;
+            w->startVgprIndex = 0;
+            w->computeUnit->completedWfs++;
+
+            DPRINTF(GPUExec, "Doing return for CU%d: WF[%d][%d][%d]\n",
+                    w->computeUnit->cu_id, w->simdId, w->wfSlotId, w->wfDynId);
+
+            if (!refCount) {
+                // Notify Memory System of Kernel Completion
+                // Kernel End = isKernel + isRelease
+                w->status = Wavefront::S_RETURNING;
+                GPUDynInstPtr local_mempacket = gpuDynInst;
+                local_mempacket->memoryOrder = Enums::MEMORY_ORDER_SC_RELEASE;
+                local_mempacket->scope = Enums::MEMORY_SCOPE_SYSTEM;
+                local_mempacket->useContinuation = false;
+                local_mempacket->simdId = w->simdId;
+                local_mempacket->wfSlotId = w->wfSlotId;
+                local_mempacket->wfDynId = w->wfDynId;
+                w->computeUnit->injectGlobalMemFence(local_mempacket, true);
+            } else {
+                // other wavefronts of the work-group are still running;
+                // give the dispatcher a chance to issue more work
+                w->computeUnit->shader->dispatcher->scheduleDispatch();
+            }
+        }
+    }
+
+    // Mark the wavefront as having arrived at the work-group barrier;
+    // it stalls until the barrier logic releases it.
+    void
+    Barrier::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+
+        // a wavefront must not arrive at the same barrier twice
+        assert(wf->barrier_cnt == wf->old_barrier_cnt);
+
+        wf->barrier_cnt = wf->old_barrier_cnt + 1;
+        wf->stalledAtBarrier = true;
+    }
+} // namespace HsailISA
diff --git a/src/arch/hsail/insts/mem.cc b/src/arch/hsail/insts/mem.cc
new file mode 100644
index 000000000..97d4c902b
--- /dev/null
+++ b/src/arch/hsail/insts/mem.cc
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#include "arch/hsail/insts/mem.hh"
+
+#include "arch/hsail/Brig.h"
+#include "enums/OpType.hh"
+
+using namespace Brig;
+
+namespace HsailISA
+{
+ const char* atomicOpToString(BrigAtomicOperation brigOp);
+
+    // Translate a BRIG atomic opcode/operation pair into the gem5
+    // MemOpType enum. BRIG_OPCODE_ATOMIC maps to the returning (MO_A*)
+    // variants, BRIG_OPCODE_ATOMICNORET to the non-returning (MO_ANR*)
+    // variants; anything else is a fatal decode error.
+    Enums::MemOpType
+    brigAtomicToMemOpType(BrigOpcode brigOpCode, BrigAtomicOperation brigOp)
+    {
+        if (brigOpCode == Brig::BRIG_OPCODE_ATOMIC) {
+            switch (brigOp) {
+              case BRIG_ATOMIC_AND:
+                return Enums::MO_AAND;
+              case BRIG_ATOMIC_OR:
+                return Enums::MO_AOR;
+              case BRIG_ATOMIC_XOR:
+                return Enums::MO_AXOR;
+              case BRIG_ATOMIC_CAS:
+                return Enums::MO_ACAS;
+              case BRIG_ATOMIC_EXCH:
+                return Enums::MO_AEXCH;
+              case BRIG_ATOMIC_ADD:
+                return Enums::MO_AADD;
+              case BRIG_ATOMIC_WRAPINC:
+                return Enums::MO_AINC;
+              case BRIG_ATOMIC_WRAPDEC:
+                return Enums::MO_ADEC;
+              case BRIG_ATOMIC_MIN:
+                return Enums::MO_AMIN;
+              case BRIG_ATOMIC_MAX:
+                return Enums::MO_AMAX;
+              case BRIG_ATOMIC_SUB:
+                return Enums::MO_ASUB;
+              default:
+                fatal("Bad BrigAtomicOperation code %d\n", brigOp);
+            }
+        } else if (brigOpCode == Brig::BRIG_OPCODE_ATOMICNORET) {
+            switch (brigOp) {
+              case BRIG_ATOMIC_AND:
+                return Enums::MO_ANRAND;
+              case BRIG_ATOMIC_OR:
+                return Enums::MO_ANROR;
+              case BRIG_ATOMIC_XOR:
+                return Enums::MO_ANRXOR;
+              case BRIG_ATOMIC_CAS:
+                return Enums::MO_ANRCAS;
+              case BRIG_ATOMIC_EXCH:
+                return Enums::MO_ANREXCH;
+              case BRIG_ATOMIC_ADD:
+                return Enums::MO_ANRADD;
+              case BRIG_ATOMIC_WRAPINC:
+                return Enums::MO_ANRINC;
+              case BRIG_ATOMIC_WRAPDEC:
+                return Enums::MO_ANRDEC;
+              case BRIG_ATOMIC_MIN:
+                return Enums::MO_ANRMIN;
+              case BRIG_ATOMIC_MAX:
+                return Enums::MO_ANRMAX;
+              case BRIG_ATOMIC_SUB:
+                return Enums::MO_ANRSUB;
+              default:
+                fatal("Bad BrigAtomicOperation code %d\n", brigOp);
+            }
+        } else {
+            fatal("Bad BrigAtomicOpcode %d\n", brigOpCode);
+        }
+    }
+
+    // Return the textual name of a BRIG atomic operation, or "unknown"
+    // for any value not covered below.
+    const char*
+    atomicOpToString(BrigAtomicOperation brigOp)
+    {
+        switch (brigOp) {
+          case BRIG_ATOMIC_AND:     return "and";
+          case BRIG_ATOMIC_OR:      return "or";
+          case BRIG_ATOMIC_XOR:     return "xor";
+          case BRIG_ATOMIC_CAS:     return "cas";
+          case BRIG_ATOMIC_EXCH:    return "exch";
+          case BRIG_ATOMIC_ADD:     return "add";
+          case BRIG_ATOMIC_WRAPINC: return "inc";
+          case BRIG_ATOMIC_WRAPDEC: return "dec";
+          case BRIG_ATOMIC_MIN:     return "min";
+          case BRIG_ATOMIC_MAX:     return "max";
+          case BRIG_ATOMIC_SUB:     return "sub";
+          default:                  return "unknown";
+        }
+    }
+} // namespace HsailISA
diff --git a/src/arch/hsail/insts/mem.hh b/src/arch/hsail/insts/mem.hh
new file mode 100644
index 000000000..d3ce76dee
--- /dev/null
+++ b/src/arch/hsail/insts/mem.hh
@@ -0,0 +1,1629 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#ifndef __ARCH_HSAIL_INSTS_MEM_HH__
+#define __ARCH_HSAIL_INSTS_MEM_HH__
+
+#include "arch/hsail/insts/decl.hh"
+#include "arch/hsail/insts/gpu_static_inst.hh"
+#include "arch/hsail/operand.hh"
+
+namespace HsailISA
+{
+    // Bookkeeping mixin for memory instructions: records the access
+    // size (derived from the memory data type) and the address operand.
+    class MemInst
+    {
+      public:
+        MemInst() : size(0), addr_operand(nullptr) { }
+
+        MemInst(Enums::MemType m_type) : addr_operand(nullptr)
+        {
+            // access size in bytes, from the width of the memory type
+            switch (m_type) {
+              case Enums::M_U64:
+              case Enums::M_S64:
+              case Enums::M_F64:
+                size = 8;
+                break;
+              case Enums::M_U32:
+              case Enums::M_S32:
+              case Enums::M_F32:
+                size = 4;
+                break;
+              case Enums::M_U16:
+              case Enums::M_S16:
+              case Enums::M_F16:
+                size = 2;
+                break;
+              default:
+                size = 1;
+                break;
+            }
+        }
+
+        void
+        init_addr(AddrOperandBase *_addr_operand)
+        {
+            addr_operand = _addr_operand;
+        }
+
+      private:
+        int size;                      // access size in bytes
+        AddrOperandBase *addr_operand; // set via init_addr()
+
+      public:
+        int getMemOperandSize() { return size; }
+        AddrOperandBase *getAddressOperand() { return addr_operand; }
+    };
+
+    // Base class for lda (load address) instructions. Operand 0 is the
+    // destination register, operand 1 the address expression; the
+    // operand-index accessors below follow that convention.
+    template<typename DestOperandType, typename AddrOperandType>
+    class LdaInstBase : public HsailGPUStaticInst
+    {
+      public:
+        typename DestOperandType::DestOperand dest;
+        AddrOperandType addr;
+
+        LdaInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
+                    const char *_opcode)
+           : HsailGPUStaticInst(obj, _opcode)
+        {
+            using namespace Brig;
+
+            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+            dest.init(op_offs, obj);
+            op_offs = obj->getOperandPtr(ib->operands, 1);
+            addr.init(op_offs, obj);
+        }
+
+        // 1 if the address uses a vector register, else 0
+        int numSrcRegOperands() { return(this->addr.isVectorRegister()); }
+        int numDstRegOperands() { return dest.isVectorRegister(); }
+        bool isVectorRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return((operandIndex == 0) ? dest.isVectorRegister() :
+                   this->addr.isVectorRegister());
+        }
+        bool isCondRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return((operandIndex == 0) ? dest.isCondRegister() :
+                   this->addr.isCondRegister());
+        }
+        bool isScalarRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return((operandIndex == 0) ? dest.isScalarRegister() :
+                   this->addr.isScalarRegister());
+        }
+        bool isSrcOperand(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            // only the address (operand index > 0) can be a source
+            if (operandIndex > 0)
+                return(this->addr.isVectorRegister());
+            return false;
+        }
+        bool isDstOperand(int operandIndex) {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return(operandIndex == 0);
+        }
+        int getOperandSize(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return((operandIndex == 0) ? dest.opSize() :
+                   this->addr.opSize());
+        }
+        int getRegisterIndex(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return((operandIndex == 0) ? dest.regIndex() :
+                   this->addr.regIndex());
+        }
+        // the address only counts as an operand when it is a register
+        int getNumOperands()
+        {
+            if (this->addr.isVectorRegister())
+                return 2;
+            return 1;
+        }
+    };
+
+    // Concrete lda instruction; combines the operand bookkeeping of
+    // LdaInstBase with the MemInst address bookkeeping.
+    template<typename DestDataType, typename AddrOperandType>
+    class LdaInst :
+        public LdaInstBase<typename DestDataType::OperandType, AddrOperandType>,
+        public MemInst
+    {
+      public:
+        void generateDisassembly();
+
+        LdaInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+                        const char *_opcode)
+            : LdaInstBase<typename DestDataType::OperandType,
+                          AddrOperandType>(ib, obj, _opcode)
+        {
+            // register the address operand with the MemInst mixin
+            init_addr(&this->addr);
+        }
+
+        void execute(GPUDynInstPtr gpuDynInst);
+    };
+
+    // Decode an lda instruction: inspect the address operand (operand 1)
+    // and instantiate the LdaInst specialization matching its kind.
+    template<typename DataType>
+    GPUStaticInst*
+    decodeLda(const Brig::BrigInstBase *ib, const BrigObject *obj)
+    {
+        unsigned op_offs = obj->getOperandPtr(ib->operands, 1);
+        BrigRegOperandInfo regDataType = findRegDataType(op_offs, obj);
+
+        if (regDataType.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
+            return new LdaInst<DataType, NoRegAddrOperand>(ib, obj, "ldas");
+        } else if (regDataType.kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
+            // V2/V4 not allowed
+            switch (regDataType.regKind) {
+              case Brig::BRIG_REGISTER_KIND_SINGLE:
+                return new LdaInst<DataType, SRegAddrOperand>(ib, obj, "ldas");
+              case Brig::BRIG_REGISTER_KIND_DOUBLE:
+                return new LdaInst<DataType, DRegAddrOperand>(ib, obj, "ldas");
+              default:
+                // report the register kind we switched on (previously
+                // printed regDataType.type, which is the data type and
+                // not what the switch examined; decodeLd2 prints regKind)
+                fatal("Bad ldas register operand type %d\n",
+                      regDataType.regKind);
+            }
+        } else {
+            fatal("Bad ldas register operand kind %d\n", regDataType.kind);
+        }
+    }
+
+    // Base class for ld and atomic-load instructions: holds the decoded
+    // segment, memory order/scope, width and equivalence class, and the
+    // destination/address operands. Operand 0 is the destination,
+    // operand 1 the address.
+    template<typename MemOperandType, typename DestOperandType,
+             typename AddrOperandType>
+    class LdInstBase : public HsailGPUStaticInst
+    {
+      public:
+        Brig::BrigWidth8_t width;
+        typename DestOperandType::DestOperand dest;
+        AddrOperandType addr;
+
+        Brig::BrigSegment segment;
+        Brig::BrigMemoryOrder memoryOrder;
+        Brig::BrigMemoryScope memoryScope;
+        unsigned int equivClass;
+        // kernarg and arg segment loads are argument loads
+        bool isArgLoad()
+        {
+            return segment == Brig::BRIG_SEGMENT_KERNARG ||
+                   segment == Brig::BRIG_SEGMENT_ARG;
+        }
+        // decode a plain BRIG_OPCODE_LD: segment/width/equivClass come
+        // from the BrigInstMem encoding; no memory order/scope applies
+        void
+        initLd(const Brig::BrigInstBase *ib, const BrigObject *obj,
+               const char *_opcode)
+        {
+            using namespace Brig;
+
+            const BrigInstMem *ldst = (const BrigInstMem*)ib;
+
+            segment = (BrigSegment)ldst->segment;
+            memoryOrder = BRIG_MEMORY_ORDER_NONE;
+            memoryScope = BRIG_MEMORY_SCOPE_NONE;
+            equivClass = ldst->equivClass;
+
+            // map the BRIG segment onto the model's operation type
+            switch (segment) {
+              case BRIG_SEGMENT_GLOBAL:
+                o_type = Enums::OT_GLOBAL_READ;
+                break;
+
+              case BRIG_SEGMENT_GROUP:
+                o_type = Enums::OT_SHARED_READ;
+                break;
+
+              case BRIG_SEGMENT_PRIVATE:
+                o_type = Enums::OT_PRIVATE_READ;
+                break;
+
+              case BRIG_SEGMENT_READONLY:
+                o_type = Enums::OT_READONLY_READ;
+                break;
+
+              case BRIG_SEGMENT_SPILL:
+                o_type = Enums::OT_SPILL_READ;
+                break;
+
+              case BRIG_SEGMENT_FLAT:
+                o_type = Enums::OT_FLAT_READ;
+                break;
+
+              case BRIG_SEGMENT_KERNARG:
+                o_type = Enums::OT_KERN_READ;
+                break;
+
+              case BRIG_SEGMENT_ARG:
+                o_type = Enums::OT_ARG;
+                break;
+
+              default:
+                panic("Ld: segment %d not supported\n", segment);
+            }
+
+            width = ldst->width;
+            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+            const Brig::BrigOperand *brigOp = obj->getOperand(op_offs);
+            // the destination is only initialized when it is a register
+            if (brigOp->kind == BRIG_KIND_OPERAND_REGISTER)
+                dest.init(op_offs, obj);
+
+            op_offs = obj->getOperandPtr(ib->operands, 1);
+            addr.init(op_offs, obj);
+        }
+
+        // decode an atomic used as a load: memory order and scope come
+        // from the BrigInstAtomic encoding; width is fixed to 1
+        void
+        initAtomicLd(const Brig::BrigInstBase *ib, const BrigObject *obj,
+                     const char *_opcode)
+        {
+            using namespace Brig;
+
+            const BrigInstAtomic *at = (const BrigInstAtomic*)ib;
+
+            segment = (BrigSegment)at->segment;
+            memoryOrder = (BrigMemoryOrder)at->memoryOrder;
+            memoryScope = (BrigMemoryScope)at->memoryScope;
+            equivClass = 0;
+
+            // map the BRIG segment onto the model's operation type
+            switch (segment) {
+              case BRIG_SEGMENT_GLOBAL:
+                o_type = Enums::OT_GLOBAL_READ;
+                break;
+
+              case BRIG_SEGMENT_GROUP:
+                o_type = Enums::OT_SHARED_READ;
+                break;
+
+              case BRIG_SEGMENT_PRIVATE:
+                o_type = Enums::OT_PRIVATE_READ;
+                break;
+
+              case BRIG_SEGMENT_READONLY:
+                o_type = Enums::OT_READONLY_READ;
+                break;
+
+              case BRIG_SEGMENT_SPILL:
+                o_type = Enums::OT_SPILL_READ;
+                break;
+
+              case BRIG_SEGMENT_FLAT:
+                o_type = Enums::OT_FLAT_READ;
+                break;
+
+              case BRIG_SEGMENT_KERNARG:
+                o_type = Enums::OT_KERN_READ;
+                break;
+
+              case BRIG_SEGMENT_ARG:
+                o_type = Enums::OT_ARG;
+                break;
+
+              default:
+                panic("Ld: segment %d not supported\n", segment);
+            }
+
+            width = BRIG_WIDTH_1;
+            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+            const Brig::BrigOperand *brigOp = obj->getOperand(op_offs);
+
+            // the destination is only initialized when it is a register
+            if (brigOp->kind == BRIG_KIND_OPERAND_REGISTER)
+                dest.init(op_offs, obj);
+
+            op_offs = obj->getOperandPtr(ib->operands,1);
+            addr.init(op_offs, obj);
+        }
+
+        LdInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
+                   const char *_opcode)
+           : HsailGPUStaticInst(obj, _opcode)
+        {
+            using namespace Brig;
+
+            // dispatch on the opcode: plain load vs. atomic load
+            if (ib->opcode == BRIG_OPCODE_LD) {
+                initLd(ib, obj, _opcode);
+            } else {
+                initAtomicLd(ib, obj, _opcode);
+            }
+        }
+
+        // 1 if the address uses a vector register, else 0
+        int numSrcRegOperands() { return(this->addr.isVectorRegister()); }
+        int numDstRegOperands() { return dest.isVectorRegister(); }
+        // the address only counts as an operand when it is a register
+        int getNumOperands()
+        {
+            if (this->addr.isVectorRegister())
+                return 2;
+            else
+                return 1;
+        }
+        bool isVectorRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return((operandIndex == 0) ? dest.isVectorRegister() :
+                   this->addr.isVectorRegister());
+        }
+        bool isCondRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return((operandIndex == 0) ? dest.isCondRegister() :
+                   this->addr.isCondRegister());
+        }
+        bool isScalarRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return((operandIndex == 0) ? dest.isScalarRegister() :
+                   this->addr.isScalarRegister());
+        }
+        bool isSrcOperand(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            // only the address (operand index > 0) can be a source
+            if (operandIndex > 0)
+                return(this->addr.isVectorRegister());
+            return false;
+        }
+        bool isDstOperand(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return(operandIndex == 0);
+        }
+        int getOperandSize(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return((operandIndex == 0) ? dest.opSize() :
+                   this->addr.opSize());
+        }
+        int getRegisterIndex(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return((operandIndex == 0) ? dest.regIndex() :
+                   this->addr.regIndex());
+        }
+    };
+
+    // Concrete ld instruction. Supports vector loads of up to four
+    // destination registers (dest_vect); when num_dest_operands == 1,
+    // the single destination lives in LdInstBase::dest instead.
+    template<typename MemDataType, typename DestDataType,
+             typename AddrOperandType>
+    class LdInst :
+        public LdInstBase<typename MemDataType::CType,
+                          typename DestDataType::OperandType, AddrOperandType>,
+        public MemInst
+    {
+        typename DestDataType::OperandType::DestOperand dest_vect[4];
+        uint16_t num_dest_operands;
+        void generateDisassembly();
+
+      public:
+        LdInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+               const char *_opcode)
+            : LdInstBase<typename MemDataType::CType,
+                         typename DestDataType::OperandType,
+                         AddrOperandType>(ib, obj, _opcode),
+              MemInst(MemDataType::memType)
+        {
+            init_addr(&this->addr);
+
+            unsigned op_offs = obj->getOperandPtr(ib->operands,0);
+            const Brig::BrigOperand *brigOp = obj->getOperand(op_offs);
+
+            // an operand list means a vector load; the element count is
+            // the byte size of the list divided by 4 (bytes per entry)
+            if (brigOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
+                const Brig::BrigOperandOperandList *brigRegVecOp =
+                    (const Brig::BrigOperandOperandList*)brigOp;
+
+                num_dest_operands =
+                    *((unsigned*)obj->getData(brigRegVecOp->elements)) / 4;
+
+                assert(num_dest_operands <= 4);
+            } else {
+                num_dest_operands = 1;
+            }
+
+            if (num_dest_operands > 1) {
+                assert(brigOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST);
+
+                for (int i = 0; i < num_dest_operands; ++i) {
+                    dest_vect[i].init_from_vect(op_offs, obj, i);
+                }
+            }
+        }
+
+        // Issue the memory accesses for this load: one read per active
+        // lane per destination operand. Group-segment loads read the LDS
+        // chunk directly; everything else builds a Request/Packet and
+        // sends it through the compute unit.
+        void
+        initiateAcc(GPUDynInstPtr gpuDynInst) override
+        {
+            typedef typename MemDataType::CType c0;
+
+            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;
+
+            // per-lane outstanding-operand counts, only needed for
+            // multi-destination (vector) loads
+            if (num_dest_operands > 1) {
+                for (int i = 0; i < VSZ; ++i)
+                    if (gpuDynInst->exec_mask[i])
+                        gpuDynInst->statusVector.push_back(num_dest_operands);
+                    else
+                        gpuDynInst->statusVector.push_back(0);
+            }
+
+            for (int k = 0; k < num_dest_operands; ++k) {
+
+                c0 *d = &((c0*)gpuDynInst->d_data)[k * VSZ];
+
+                for (int i = 0; i < VSZ; ++i) {
+                    if (gpuDynInst->exec_mask[i]) {
+                        Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0);
+
+                        if (isLocalMem()) {
+                            // load from shared memory
+                            *d = gpuDynInst->wavefront()->ldsChunk->
+                                read<c0>(vaddr);
+                        } else {
+                            Request *req = new Request(0, vaddr, sizeof(c0), 0,
+                                           gpuDynInst->computeUnit()->masterId(),
+                                           0, gpuDynInst->wfDynId, i);
+
+                            gpuDynInst->setRequestFlags(req);
+                            PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
+                            pkt->dataStatic(d);
+
+                            if (gpuDynInst->computeUnit()->shader->
+                                separate_acquire_release &&
+                                gpuDynInst->memoryOrder ==
+                                Enums::MEMORY_ORDER_SC_ACQUIRE) {
+                                // if this load has acquire semantics,
+                                // set the response continuation function
+                                // to perform an Acquire request
+                                gpuDynInst->execContinuation =
+                                    &GPUStaticInst::execLdAcq;
+
+                                gpuDynInst->useContinuation = true;
+                            } else {
+                                // the request will be finished when
+                                // the load completes
+                                gpuDynInst->useContinuation = false;
+                            }
+                            // translation is performed in sendRequest()
+                            gpuDynInst->computeUnit()->sendRequest(gpuDynInst,
+                                                                   i, pkt);
+                        }
+                    }
+                    ++d;
+                }
+            }
+
+            gpuDynInst->updateStats();
+        }
+
+      private:
+        void
+        execLdAcq(GPUDynInstPtr gpuDynInst) override
+        {
+            // after the load has complete and if the load has acquire
+            // semantics, issue an acquire request.
+            if (!isLocalMem()) {
+                if (gpuDynInst->computeUnit()->shader->separate_acquire_release
+                    && gpuDynInst->memoryOrder ==
+                    Enums::MEMORY_ORDER_SC_ACQUIRE) {
+                    gpuDynInst->statusBitVector = VectorMask(1);
+                    gpuDynInst->useContinuation = false;
+                    // create request
+                    Request *req = new Request(0, 0, 0, 0,
+                                   gpuDynInst->computeUnit()->masterId(),
+                                   0, gpuDynInst->wfDynId, -1);
+                    req->setFlags(Request::ACQUIRE);
+                    gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req);
+                }
+            }
+        }
+
+      public:
+        // group-segment accesses target the LDS (local/shared memory)
+        bool
+        isLocalMem() const override
+        {
+            return this->segment == Brig::BRIG_SEGMENT_GROUP;
+        }
+
+        // In the accessors below, the last operand index refers to the
+        // address (when it is a register); earlier indices refer to the
+        // destination operand(s).
+        bool isVectorRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if ((num_dest_operands != getNumOperands()) &&
+                (operandIndex == (getNumOperands()-1)))
+                return(this->addr.isVectorRegister());
+            if (num_dest_operands > 1) {
+                return dest_vect[operandIndex].isVectorRegister();
+            }
+            else if (num_dest_operands == 1) {
+                return LdInstBase<typename MemDataType::CType,
+                       typename DestDataType::OperandType,
+                       AddrOperandType>::dest.isVectorRegister();
+            }
+            return false;
+        }
+        bool isCondRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if ((num_dest_operands != getNumOperands()) &&
+                (operandIndex == (getNumOperands()-1)))
+                return(this->addr.isCondRegister());
+            if (num_dest_operands > 1)
+                return dest_vect[operandIndex].isCondRegister();
+            else if (num_dest_operands == 1)
+                return LdInstBase<typename MemDataType::CType,
+                       typename DestDataType::OperandType,
+                       AddrOperandType>::dest.isCondRegister();
+            return false;
+        }
+        bool isScalarRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if ((num_dest_operands != getNumOperands()) &&
+                (operandIndex == (getNumOperands()-1)))
+                return(this->addr.isScalarRegister());
+            if (num_dest_operands > 1)
+                return dest_vect[operandIndex].isScalarRegister();
+            else if (num_dest_operands == 1)
+                return LdInstBase<typename MemDataType::CType,
+                       typename DestDataType::OperandType,
+                       AddrOperandType>::dest.isScalarRegister();
+            return false;
+        }
+        bool isSrcOperand(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            // only the (register) address operand is a source
+            if ((num_dest_operands != getNumOperands()) &&
+                (operandIndex == (getNumOperands()-1)))
+                return(this->addr.isVectorRegister());
+            return false;
+        }
+        bool isDstOperand(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if ((num_dest_operands != getNumOperands()) &&
+                (operandIndex == (getNumOperands()-1)))
+                return false;
+            return true;
+        }
+        int getOperandSize(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if ((num_dest_operands != getNumOperands()) &&
+                (operandIndex == (getNumOperands()-1)))
+                return(this->addr.opSize());
+            if (num_dest_operands > 1)
+                return(dest_vect[operandIndex].opSize());
+            else if (num_dest_operands == 1)
+                return(LdInstBase<typename MemDataType::CType,
+                       typename DestDataType::OperandType,
+                       AddrOperandType>::dest.opSize());
+            return 0;
+        }
+        int getRegisterIndex(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if ((num_dest_operands != getNumOperands()) &&
+                (operandIndex == (getNumOperands()-1)))
+                return(this->addr.regIndex());
+            if (num_dest_operands > 1)
+                return(dest_vect[operandIndex].regIndex());
+            else if (num_dest_operands == 1)
+                return(LdInstBase<typename MemDataType::CType,
+                       typename DestDataType::OperandType,
+                       AddrOperandType>::dest.regIndex());
+            return -1;
+        }
+        int getNumOperands()
+        {
+            if (this->addr.isVectorRegister() || this->addr.isScalarRegister())
+                return(num_dest_operands+1);
+            else
+                return(num_dest_operands);
+        }
+        void execute(GPUDynInstPtr gpuDynInst);
+    };
+
+    // Second-stage ld decode: inspect the address operand (operand 1)
+    // and instantiate the LdInst specialization matching its kind.
+    template<typename MemDT, typename DestDT>
+    GPUStaticInst*
+    decodeLd2(const Brig::BrigInstBase *ib, const BrigObject *obj)
+    {
+        unsigned opOffset = obj->getOperandPtr(ib->operands,1);
+        BrigRegOperandInfo addrInfo = findRegDataType(opOffset, obj);
+
+        const bool regForm =
+            addrInfo.kind == Brig::BRIG_KIND_OPERAND_REGISTER ||
+            addrInfo.kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST;
+
+        if (regForm) {
+            switch (addrInfo.regKind) {
+              case Brig::BRIG_REGISTER_KIND_SINGLE:
+                return new LdInst<MemDT, DestDT,
+                                  SRegAddrOperand>(ib, obj, "ld");
+              case Brig::BRIG_REGISTER_KIND_DOUBLE:
+                return new LdInst<MemDT, DestDT,
+                                  DRegAddrOperand>(ib, obj, "ld");
+              default:
+                fatal("Bad ld register operand type %d\n", addrInfo.regKind);
+            }
+        }
+
+        if (addrInfo.kind == Brig::BRIG_KIND_OPERAND_ADDRESS)
+            return new LdInst<MemDT, DestDT, NoRegAddrOperand>(ib, obj, "ld");
+
+        fatal("Bad ld register operand kind %d\n", addrInfo.kind);
+    }
+
+    // First-stage ld decode: pick the destination register data type
+    // from the destination's register kind (single/double) and the
+    // instruction's BRIG type, then defer to decodeLd2 for the address.
+    template<typename MemDT>
+    GPUStaticInst*
+    decodeLd(const Brig::BrigInstBase *ib, const BrigObject *obj)
+    {
+        unsigned op_offs = obj->getOperandPtr(ib->operands,0);
+        BrigRegOperandInfo dest = findRegDataType(op_offs, obj);
+
+        assert(dest.kind == Brig::BRIG_KIND_OPERAND_REGISTER ||
+               dest.kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST);
+        switch(dest.regKind) {
+          case Brig::BRIG_REGISTER_KIND_SINGLE:
+            switch (ib->type) {
+              // 32-bit registers: sub-word types widen to 32 bits;
+              // f16/f32 destinations are treated as u32 bit patterns
+              case Brig::BRIG_TYPE_B8:
+              case Brig::BRIG_TYPE_B16:
+              case Brig::BRIG_TYPE_B32:
+                return decodeLd2<MemDT, B32>(ib, obj);
+              case Brig::BRIG_TYPE_U8:
+              case Brig::BRIG_TYPE_U16:
+              case Brig::BRIG_TYPE_U32:
+                return decodeLd2<MemDT, U32>(ib, obj);
+              case Brig::BRIG_TYPE_S8:
+              case Brig::BRIG_TYPE_S16:
+              case Brig::BRIG_TYPE_S32:
+                return decodeLd2<MemDT, S32>(ib, obj);
+              case Brig::BRIG_TYPE_F16:
+              case Brig::BRIG_TYPE_F32:
+                return decodeLd2<MemDT, U32>(ib, obj);
+              default:
+                fatal("Bad ld register operand type %d, %d\n",
+                      dest.regKind, ib->type);
+            };
+          case Brig::BRIG_REGISTER_KIND_DOUBLE:
+            switch (ib->type) {
+              // 64-bit registers; f64 destinations are treated as u64
+              // bit patterns
+              case Brig::BRIG_TYPE_B64:
+                return decodeLd2<MemDT, B64>(ib, obj);
+              case Brig::BRIG_TYPE_U64:
+                return decodeLd2<MemDT, U64>(ib, obj);
+              case Brig::BRIG_TYPE_S64:
+                return decodeLd2<MemDT, S64>(ib, obj);
+              case Brig::BRIG_TYPE_F64:
+                return decodeLd2<MemDT, U64>(ib, obj);
+              default:
+                fatal("Bad ld register operand type %d, %d\n",
+                      dest.regKind, ib->type);
+            };
+          default:
+            fatal("Bad ld register operand type %d, %d\n", dest.regKind,
+                  ib->type);
+        }
+    }
+
+    // Base class for HSAIL store (st) instructions, and for atomics that
+    // are lowered to stores.  Holds the source value operand, the address
+    // operand, and the memory traits (segment, scope, order, equivalence
+    // class) decoded from the Brig encoding.
+    template<typename MemDataType, typename SrcOperandType,
+             typename AddrOperandType>
+    class StInstBase : public HsailGPUStaticInst
+    {
+      public:
+        typename SrcOperandType::SrcOperand src;  // value being stored
+        AddrOperandType addr;                     // destination address
+
+        Brig::BrigSegment segment;         // memory segment written
+        Brig::BrigMemoryScope memoryScope; // synchronization scope
+        Brig::BrigMemoryOrder memoryOrder; // ordering requirement
+        unsigned int equivClass;           // compiler-provided aliasing class
+
+        // Decode a plain BRIG_OPCODE_ST: operand 0 is the stored value
+        // (immediate bytes or a register), operand 1 is the address.
+        void
+        initSt(const Brig::BrigInstBase *ib, const BrigObject *obj,
+               const char *_opcode)
+        {
+            using namespace Brig;
+
+            const BrigInstMem *ldst = (const BrigInstMem*)ib;
+
+            segment = (BrigSegment)ldst->segment;
+            // plain stores carry no ordering/scope in BrigInstMem
+            memoryOrder = BRIG_MEMORY_ORDER_NONE;
+            memoryScope = BRIG_MEMORY_SCOPE_NONE;
+            equivClass = ldst->equivClass;
+
+            // classify the operation by target segment (used for stats
+            // and pipeline selection)
+            switch (segment) {
+              case BRIG_SEGMENT_GLOBAL:
+                o_type = Enums::OT_GLOBAL_WRITE;
+                break;
+
+              case BRIG_SEGMENT_GROUP:
+                o_type = Enums::OT_SHARED_WRITE;
+                break;
+
+              case BRIG_SEGMENT_PRIVATE:
+                o_type = Enums::OT_PRIVATE_WRITE;
+                break;
+
+              case BRIG_SEGMENT_READONLY:
+                o_type = Enums::OT_READONLY_WRITE;
+                break;
+
+              case BRIG_SEGMENT_SPILL:
+                o_type = Enums::OT_SPILL_WRITE;
+                break;
+
+              case BRIG_SEGMENT_FLAT:
+                o_type = Enums::OT_FLAT_WRITE;
+                break;
+
+              case BRIG_SEGMENT_ARG:
+                o_type = Enums::OT_ARG;
+                break;
+
+              default:
+                panic("St: segment %d not supported\n", segment);
+            }
+
+            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+            const BrigOperand *baseOp = obj->getOperand(op_offs);
+
+            // the scalar src is only initialized for immediate or single
+            // register operands; operand lists (vector stores) are decoded
+            // by the derived StInst
+            if ((baseOp->kind == BRIG_KIND_OPERAND_CONSTANT_BYTES) ||
+                (baseOp->kind == BRIG_KIND_OPERAND_REGISTER)) {
+                src.init(op_offs, obj);
+            }
+
+            op_offs = obj->getOperandPtr(ib->operands, 1);
+            addr.init(op_offs, obj);
+        }
+
+        // Decode an atomic used as a store: unlike plain st, the address
+        // is operand 0 and the data source is operand 1, and the ordering
+        // and scope come from the BrigInstAtomic fields.
+        void
+        initAtomicSt(const Brig::BrigInstBase *ib, const BrigObject *obj,
+                     const char *_opcode)
+        {
+            using namespace Brig;
+
+            const BrigInstAtomic *at = (const BrigInstAtomic*)ib;
+
+            segment = (BrigSegment)at->segment;
+            memoryScope = (BrigMemoryScope)at->memoryScope;
+            memoryOrder = (BrigMemoryOrder)at->memoryOrder;
+            equivClass = 0;
+
+            // same segment classification as initSt
+            switch (segment) {
+              case BRIG_SEGMENT_GLOBAL:
+                o_type = Enums::OT_GLOBAL_WRITE;
+                break;
+
+              case BRIG_SEGMENT_GROUP:
+                o_type = Enums::OT_SHARED_WRITE;
+                break;
+
+              case BRIG_SEGMENT_PRIVATE:
+                o_type = Enums::OT_PRIVATE_WRITE;
+                break;
+
+              case BRIG_SEGMENT_READONLY:
+                o_type = Enums::OT_READONLY_WRITE;
+                break;
+
+              case BRIG_SEGMENT_SPILL:
+                o_type = Enums::OT_SPILL_WRITE;
+                break;
+
+              case BRIG_SEGMENT_FLAT:
+                o_type = Enums::OT_FLAT_WRITE;
+                break;
+
+              case BRIG_SEGMENT_ARG:
+                o_type = Enums::OT_ARG;
+                break;
+
+              default:
+                panic("St: segment %d not supported\n", segment);
+            }
+
+            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+            addr.init(op_offs, obj);
+
+            op_offs = obj->getOperandPtr(ib->operands, 1);
+            src.init(op_offs, obj);
+        }
+
+        // Dispatch to the right decoder depending on whether this is a
+        // plain st or an atomic lowered to a store.
+        StInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
+                   const char *_opcode)
+           : HsailGPUStaticInst(obj, _opcode)
+        {
+            using namespace Brig;
+
+            if (ib->opcode == BRIG_OPCODE_ST) {
+                initSt(ib, obj, _opcode);
+            } else {
+                initAtomicSt(ib, obj, _opcode);
+            }
+        }
+
+        // Operand indexing convention for the accessors below:
+        // index 0 is the source value, index 1 is the address (which only
+        // counts as an operand when it lives in a register).
+        int numDstRegOperands() { return 0; }
+        int numSrcRegOperands()
+        {
+            return src.isVectorRegister() + this->addr.isVectorRegister();
+        }
+        int getNumOperands()
+        {
+            if (this->addr.isVectorRegister() || this->addr.isScalarRegister())
+                return 2;
+            else
+                return 1;
+        }
+        bool isVectorRegister(int operandIndex)
+        {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            return !operandIndex ? src.isVectorRegister() :
+                   this->addr.isVectorRegister();
+        }
+        bool isCondRegister(int operandIndex)
+        {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            return !operandIndex ? src.isCondRegister() :
+                   this->addr.isCondRegister();
+        }
+        bool isScalarRegister(int operandIndex)
+        {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            return !operandIndex ? src.isScalarRegister() :
+                   this->addr.isScalarRegister();
+        }
+        // both the source and the address are read, never written
+        bool isSrcOperand(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return true;
+        }
+        bool isDstOperand(int operandIndex) { return false; }
+        int getOperandSize(int operandIndex)
+        {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            return !operandIndex ? src.opSize() : this->addr.opSize();
+        }
+        int getRegisterIndex(int operandIndex)
+        {
+            assert(operandIndex >= 0 && operandIndex < getNumOperands());
+            return !operandIndex ? src.regIndex() : this->addr.regIndex();
+        }
+    };
+
+
+    // Concrete store instruction.  Extends StInstBase with support for
+    // vector sources (st_v2/st_v4, up to four registers) and implements
+    // the timing-model execution hooks, including release-fence
+    // injection when the shader models separate acquire/release.
+    template<typename MemDataType, typename SrcDataType,
+             typename AddrOperandType>
+    class StInst :
+        public StInstBase<MemDataType, typename SrcDataType::OperandType,
+                          AddrOperandType>,
+        public MemInst
+    {
+      public:
+        // per-element sources of a vector store; only the first
+        // num_src_operands entries are valid
+        typename SrcDataType::OperandType::SrcOperand src_vect[4];
+        uint16_t num_src_operands;
+        void generateDisassembly();
+
+        StInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+               const char *_opcode, int srcIdx)
+            : StInstBase<MemDataType, typename SrcDataType::OperandType,
+                         AddrOperandType>(ib, obj, _opcode),
+              MemInst(SrcDataType::memType)
+        {
+            init_addr(&this->addr);
+
+            // NOTE(review): rinfo is computed but not read afterwards in
+            // this constructor — confirm whether it is still needed
+            BrigRegOperandInfo rinfo;
+            unsigned op_offs = obj->getOperandPtr(ib->operands,srcIdx);
+            const Brig::BrigOperand *baseOp = obj->getOperand(op_offs);
+
+            if (baseOp->kind == Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES) {
+                const Brig::BrigOperandConstantBytes *op =
+                    (Brig::BrigOperandConstantBytes*)baseOp;
+
+                rinfo = BrigRegOperandInfo((Brig::BrigKind16_t)op->base.kind,
+                                           Brig::BRIG_TYPE_NONE);
+            } else {
+                rinfo = findRegDataType(op_offs, obj);
+            }
+
+            // an operand list encodes a vector store; the byte length of
+            // the element list divided by 4 gives the register count
+            if (baseOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
+                const Brig::BrigOperandOperandList *brigRegVecOp =
+                    (const Brig::BrigOperandOperandList*)baseOp;
+
+                num_src_operands =
+                    *((unsigned*)obj->getData(brigRegVecOp->elements)) / 4;
+
+                assert(num_src_operands <= 4);
+            } else {
+                num_src_operands = 1;
+            }
+
+            if (num_src_operands > 1) {
+                assert(baseOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST);
+
+                for (int i = 0; i < num_src_operands; ++i) {
+                    src_vect[i].init_from_vect(op_offs, obj, i);
+                }
+            }
+        }
+
+        void
+        initiateAcc(GPUDynInstPtr gpuDynInst) override
+        {
+            // before performing a store, check if this store has
+            // release semantics, and if so issue a release first
+            if (!isLocalMem()) {
+                if (gpuDynInst->computeUnit()->shader->separate_acquire_release
+                    && gpuDynInst->memoryOrder ==
+                    Enums::MEMORY_ORDER_SC_RELEASE) {
+
+                    gpuDynInst->statusBitVector = VectorMask(1);
+                    // the actual store runs as a continuation after the
+                    // release fence completes
+                    gpuDynInst->execContinuation = &GPUStaticInst::execSt;
+                    gpuDynInst->useContinuation = true;
+                    // create request
+                    Request *req = new Request(0, 0, 0, 0,
+                                  gpuDynInst->computeUnit()->masterId(),
+                                  0, gpuDynInst->wfDynId, -1);
+                    req->setFlags(Request::RELEASE);
+                    gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req);
+
+                    return;
+                }
+            }
+
+            // if there is no release semantic, perform stores immediately
+            execSt(gpuDynInst);
+        }
+
+        // the group segment maps to the compute unit's LDS
+        bool
+        isLocalMem() const override
+        {
+            return this->segment == Brig::BRIG_SEGMENT_GROUP;
+        }
+
+      private:
+        // execSt may be called through a continuation
+        // if the store had release semantics. see comment for
+        // execSt in gpu_static_inst.hh
+        void
+        execSt(GPUDynInstPtr gpuDynInst) override
+        {
+            typedef typename MemDataType::CType c0;
+
+            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;
+
+            // vector stores issue num_src_operands requests per active
+            // lane; record the per-lane outstanding counts
+            if (num_src_operands > 1) {
+                for (int i = 0; i < VSZ; ++i)
+                    if (gpuDynInst->exec_mask[i])
+                        gpuDynInst->statusVector.push_back(num_src_operands);
+                    else
+                        gpuDynInst->statusVector.push_back(0);
+            }
+
+            for (int k = 0; k < num_src_operands; ++k) {
+                // d walks the staged store data, one VSZ-wide slice per
+                // source register
+                c0 *d = &((c0*)gpuDynInst->d_data)[k * VSZ];
+
+                for (int i = 0; i < VSZ; ++i) {
+                    if (gpuDynInst->exec_mask[i]) {
+                        Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0);
+
+                        if (isLocalMem()) {
+                            //store to shared memory
+                            gpuDynInst->wavefront()->ldsChunk->write<c0>(vaddr,
+                                                                         *d);
+                        } else {
+                            Request *req =
+                              new Request(0, vaddr, sizeof(c0), 0,
+                                          gpuDynInst->computeUnit()->masterId(),
+                                          0, gpuDynInst->wfDynId, i);
+
+                            gpuDynInst->setRequestFlags(req);
+                            PacketPtr pkt = new Packet(req, MemCmd::WriteReq);
+                            pkt->dataStatic<c0>(d);
+
+                            // translation is performed in sendRequest()
+                            // the request will be finished when the store completes
+                            gpuDynInst->useContinuation = false;
+                            gpuDynInst->computeUnit()->sendRequest(gpuDynInst,
+                                                                   i, pkt);
+
+                        }
+                    }
+                    ++d;
+                }
+            }
+
+            gpuDynInst->updateStats();
+        }
+
+      public:
+        // Operand indexing convention: [0, num_src_operands) are the
+        // source registers, index num_src_operands is the address (when
+        // register-based).
+        bool isVectorRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex == num_src_operands)
+                return this->addr.isVectorRegister();
+            if (num_src_operands > 1)
+                return src_vect[operandIndex].isVectorRegister();
+            else if (num_src_operands == 1)
+                return StInstBase<MemDataType,
+                       typename SrcDataType::OperandType,
+                       AddrOperandType>::src.isVectorRegister();
+            return false;
+        }
+        bool isCondRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex == num_src_operands)
+                return this->addr.isCondRegister();
+            if (num_src_operands > 1)
+                return src_vect[operandIndex].isCondRegister();
+            else if (num_src_operands == 1)
+                return StInstBase<MemDataType,
+                       typename SrcDataType::OperandType,
+                       AddrOperandType>::src.isCondRegister();
+            return false;
+        }
+        bool isScalarRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex == num_src_operands)
+                return this->addr.isScalarRegister();
+            if (num_src_operands > 1)
+                return src_vect[operandIndex].isScalarRegister();
+            else if (num_src_operands == 1)
+                return StInstBase<MemDataType,
+                       typename SrcDataType::OperandType,
+                       AddrOperandType>::src.isScalarRegister();
+            return false;
+        }
+        // stores read every operand and write none
+        bool isSrcOperand(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            return true;
+        }
+        bool isDstOperand(int operandIndex) { return false; }
+        int getOperandSize(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex == num_src_operands)
+                return this->addr.opSize();
+            if (num_src_operands > 1)
+                return src_vect[operandIndex].opSize();
+            else if (num_src_operands == 1)
+                return StInstBase<MemDataType,
+                       typename SrcDataType::OperandType,
+                       AddrOperandType>::src.opSize();
+            return 0;
+        }
+        int getRegisterIndex(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex == num_src_operands)
+                return this->addr.regIndex();
+            if (num_src_operands > 1)
+                return src_vect[operandIndex].regIndex();
+            else if (num_src_operands == 1)
+                return StInstBase<MemDataType,
+                       typename SrcDataType::OperandType,
+                       AddrOperandType>::src.regIndex();
+            return -1;
+        }
+        int getNumOperands()
+        {
+            // the address only counts as an operand when register-based
+            if (this->addr.isVectorRegister() || this->addr.isScalarRegister())
+                return num_src_operands + 1;
+            else
+                return num_src_operands;
+        }
+        void execute(GPUDynInstPtr gpuDynInst);
+    };
+
+    // Decode a st (or atomic-as-store) instruction, choosing the address
+    // operand flavor (none / 32-bit register / 64-bit register) from the
+    // Brig operand kind of the address.
+    template<typename DataType, typename SrcDataType>
+    GPUStaticInst*
+    decodeSt(const Brig::BrigInstBase *ib, const BrigObject *obj)
+    {
+        // atomics carry the address first and the data second; plain
+        // st is the other way around
+        const bool is_atom = ib->opcode == Brig::BRIG_OPCODE_ATOMIC ||
+            ib->opcode == Brig::BRIG_OPCODE_ATOMICNORET;
+        const int srcIdx = is_atom ? 1 : 0;
+        const int destIdx = is_atom ? 0 : 1;
+
+        unsigned op_offs = obj->getOperandPtr(ib->operands, destIdx);
+        BrigRegOperandInfo dest_info = findRegDataType(op_offs, obj);
+
+        if (dest_info.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
+            return new StInst<DataType, SrcDataType,
+                              NoRegAddrOperand>(ib, obj, "st", srcIdx);
+        }
+
+        if (dest_info.kind != Brig::BRIG_KIND_OPERAND_REGISTER) {
+            fatal("Bad st register operand kind %d\n", dest_info.kind);
+        }
+
+        // V2/V4 not allowed
+        switch (dest_info.regKind) {
+          case Brig::BRIG_REGISTER_KIND_SINGLE:
+            return new StInst<DataType, SrcDataType,
+                              SRegAddrOperand>(ib, obj, "st", srcIdx);
+          case Brig::BRIG_REGISTER_KIND_DOUBLE:
+            return new StInst<DataType, SrcDataType,
+                              DRegAddrOperand>(ib, obj, "st", srcIdx);
+          default:
+            fatal("Bad st register operand type %d\n", dest_info.type);
+        }
+    }
+
+ Enums::MemOpType brigAtomicToMemOpType(Brig::BrigOpcode brigOpCode,
+ Brig::BrigAtomicOperation brigOp);
+
+    // Base class for HSAIL atomic instructions: decodes the segment,
+    // ordering, scope, the atomic operation, and the operand list.  The
+    // operand layout depends on HasDst: with a destination the Brig order
+    // is [dest, addr, src...]; without one it is [addr, src...].
+    //
+    // Fix: removed an unreachable "return -1;" at the end of
+    // getRegisterIndex() — every branch of the preceding if/else chain
+    // already returns.
+    template<typename OperandType, typename AddrOperandType, int NumSrcOperands,
+        bool HasDst>
+    class AtomicInstBase : public HsailGPUStaticInst
+    {
+      public:
+        typename OperandType::DestOperand dest;   // receives the old value
+        typename OperandType::SrcOperand src[NumSrcOperands]; // data operands
+        AddrOperandType addr;                     // memory location
+
+        Brig::BrigSegment segment;
+        Brig::BrigMemoryOrder memoryOrder;
+        Brig::BrigAtomicOperation atomicOperation;
+        Brig::BrigMemoryScope memoryScope;
+        Brig::BrigOpcode opcode;
+        Enums::MemOpType opType;   // gem5 mem-op enum for atomicOperation
+
+        AtomicInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
+                       const char *_opcode)
+           : HsailGPUStaticInst(obj, _opcode)
+        {
+            using namespace Brig;
+
+            const BrigInstAtomic *at = (const BrigInstAtomic*)ib;
+
+            segment = (BrigSegment)at->segment;
+            memoryScope = (BrigMemoryScope)at->memoryScope;
+            memoryOrder = (BrigMemoryOrder)at->memoryOrder;
+            atomicOperation = (BrigAtomicOperation)at->atomicOperation;
+            opcode = (BrigOpcode)ib->opcode;
+            opType = brigAtomicToMemOpType(opcode, atomicOperation);
+
+            // atomics are only supported in these segments
+            switch (segment) {
+              case BRIG_SEGMENT_GLOBAL:
+                o_type = Enums::OT_GLOBAL_ATOMIC;
+                break;
+
+              case BRIG_SEGMENT_GROUP:
+                o_type = Enums::OT_SHARED_ATOMIC;
+                break;
+
+              case BRIG_SEGMENT_FLAT:
+                o_type = Enums::OT_FLAT_ATOMIC;
+                break;
+
+              default:
+                panic("Atomic: segment %d not supported\n", segment);
+            }
+
+            if (HasDst) {
+                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+                dest.init(op_offs, obj);
+
+                op_offs = obj->getOperandPtr(ib->operands, 1);
+                addr.init(op_offs, obj);
+
+                for (int i = 0; i < NumSrcOperands; ++i) {
+                    op_offs = obj->getOperandPtr(ib->operands, i + 2);
+                    src[i].init(op_offs, obj);
+                }
+            } else {
+                // no destination: the address comes first
+                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+                addr.init(op_offs, obj);
+
+                for (int i = 0; i < NumSrcOperands; ++i) {
+                    op_offs = obj->getOperandPtr(ib->operands, i + 1);
+                    src[i].init(op_offs, obj);
+                }
+            }
+        }
+
+        // count of vector-register sources, including the address when it
+        // lives in a vector register
+        int numSrcRegOperands()
+        {
+            int operands = 0;
+            for (int i = 0; i < NumSrcOperands; i++) {
+                if (src[i].isVectorRegister() == true) {
+                    operands++;
+                }
+            }
+            if (addr.isVectorRegister())
+                operands++;
+            return operands;
+        }
+        int numDstRegOperands() { return dest.isVectorRegister(); }
+        // Operand indexing convention for the accessors below:
+        // [0, NumSrcOperands) are sources, NumSrcOperands is the address,
+        // NumSrcOperands + 1 is the destination.
+        int getNumOperands()
+        {
+            if (addr.isVectorRegister())
+                return(NumSrcOperands + 2);
+            return(NumSrcOperands + 1);
+        }
+        bool isVectorRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex < NumSrcOperands)
+                return src[operandIndex].isVectorRegister();
+            else if (operandIndex == NumSrcOperands)
+                return(addr.isVectorRegister());
+            else
+                return dest.isVectorRegister();
+        }
+        bool isCondRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex < NumSrcOperands)
+                return src[operandIndex].isCondRegister();
+            else if (operandIndex == NumSrcOperands)
+                return(addr.isCondRegister());
+            else
+                return dest.isCondRegister();
+        }
+        bool isScalarRegister(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex < NumSrcOperands)
+                return src[operandIndex].isScalarRegister();
+            else if (operandIndex == NumSrcOperands)
+                return(addr.isScalarRegister());
+            else
+                return dest.isScalarRegister();
+        }
+        bool isSrcOperand(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex < NumSrcOperands)
+                return true;
+            else if (operandIndex == NumSrcOperands)
+                // NOTE(review): a scalar-register address reports false
+                // here even though it is read — confirm this is intended
+                return(addr.isVectorRegister());
+            else
+                return false;
+        }
+        bool isDstOperand(int operandIndex)
+        {
+            // only the trailing dest operand (when present) is written
+            if (operandIndex <= NumSrcOperands)
+                return false;
+            else
+                return true;
+        }
+        int getOperandSize(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex < NumSrcOperands)
+                return(src[operandIndex].opSize());
+            else if (operandIndex == NumSrcOperands)
+                return(addr.opSize());
+            else
+                return(dest.opSize());
+        }
+        int getRegisterIndex(int operandIndex)
+        {
+            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+            if (operandIndex < NumSrcOperands)
+                return(src[operandIndex].regIndex());
+            else if (operandIndex == NumSrcOperands)
+                return(addr.regIndex());
+            else
+                return(dest.regIndex());
+        }
+    };
+
+    // Concrete atomic instruction.  Implements the timing-model RMW:
+    // LDS atomics are computed functionally in place, global atomics are
+    // sent as SwapReq packets with an atomic-op functor.  Also injects
+    // acquire/release fences when the shader models them separately.
+    template<typename MemDataType, typename AddrOperandType, int NumSrcOperands,
+        bool HasDst>
+    class AtomicInst :
+        public AtomicInstBase<typename MemDataType::OperandType,
+                              AddrOperandType, NumSrcOperands, HasDst>,
+        public MemInst
+    {
+      public:
+        void generateDisassembly();
+
+        AtomicInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+                   const char *_opcode)
+            : AtomicInstBase<typename MemDataType::OperandType, AddrOperandType,
+                             NumSrcOperands, HasDst>
+                (ib, obj, _opcode),
+              MemInst(MemDataType::memType)
+        {
+            init_addr(&this->addr);
+        }
+
+        void
+        initiateAcc(GPUDynInstPtr gpuDynInst) override
+        {
+            // before doing the RMW, check if this atomic has
+            // release semantics, and if so issue a release first
+            if (!isLocalMem()) {
+                if (gpuDynInst->computeUnit()->shader->separate_acquire_release
+                    && (gpuDynInst->memoryOrder ==
+                    Enums::MEMORY_ORDER_SC_RELEASE || gpuDynInst->memoryOrder ==
+                    Enums::MEMORY_ORDER_SC_ACQUIRE_RELEASE)) {
+
+                    gpuDynInst->statusBitVector = VectorMask(1);
+
+                    // the RMW itself runs as a continuation once the
+                    // release fence completes
+                    gpuDynInst->execContinuation = &GPUStaticInst::execAtomic;
+                    gpuDynInst->useContinuation = true;
+
+                    // create request
+                    Request *req = new Request(0, 0, 0, 0,
+                                  gpuDynInst->computeUnit()->masterId(),
+                                  0, gpuDynInst->wfDynId, -1);
+                    req->setFlags(Request::RELEASE);
+                    gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req);
+
+                    return;
+                }
+            }
+
+            // if there is no release semantic, execute the RMW immediately
+            execAtomic(gpuDynInst);
+
+        }
+
+        void execute(GPUDynInstPtr gpuDynInst);
+
+        // the group segment maps to the compute unit's LDS
+        bool
+        isLocalMem() const override
+        {
+            return this->segment == Brig::BRIG_SEGMENT_GROUP;
+        }
+
+      private:
+        // execAtomic may be called through a continuation
+        // if the RMW had release semantics. see comment for
+        // execContinuation in gpu_dyn_inst.hh
+        void
+        execAtomic(GPUDynInstPtr gpuDynInst) override
+        {
+            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;
+
+            typedef typename MemDataType::CType c0;
+
+            // d: per-lane buffer receiving the old value (d_data)
+            // e: per-lane operand value (a_data)
+            // f: per-lane CAS swap value (x_data)
+            c0 *d = &((c0*) gpuDynInst->d_data)[0];
+            c0 *e = &((c0*) gpuDynInst->a_data)[0];
+            c0 *f = &((c0*) gpuDynInst->x_data)[0];
+
+            for (int i = 0; i < VSZ; ++i) {
+                if (gpuDynInst->exec_mask[i]) {
+                    Addr vaddr = gpuDynInst->addr[i];
+
+                    if (isLocalMem()) {
+                        // LDS atomics are computed functionally: read the
+                        // old value, then write back the updated value
+                        Wavefront *wavefront = gpuDynInst->wavefront();
+                        *d = wavefront->ldsChunk->read<c0>(vaddr);
+
+                        switch (this->opType) {
+                          case Enums::MO_AADD:
+                          case Enums::MO_ANRADD:
+                            wavefront->ldsChunk->write<c0>(vaddr,
+                            wavefront->ldsChunk->read<c0>(vaddr) + (*e));
+                            break;
+                          case Enums::MO_ASUB:
+                          case Enums::MO_ANRSUB:
+                            wavefront->ldsChunk->write<c0>(vaddr,
+                            wavefront->ldsChunk->read<c0>(vaddr) - (*e));
+                            break;
+                          case Enums::MO_AMAX:
+                          case Enums::MO_ANRMAX:
+                            wavefront->ldsChunk->write<c0>(vaddr,
+                            std::max(wavefront->ldsChunk->read<c0>(vaddr),
+                            (*e)));
+                            break;
+                          case Enums::MO_AMIN:
+                          case Enums::MO_ANRMIN:
+                            wavefront->ldsChunk->write<c0>(vaddr,
+                            std::min(wavefront->ldsChunk->read<c0>(vaddr),
+                            (*e)));
+                            break;
+                          case Enums::MO_AAND:
+                          case Enums::MO_ANRAND:
+                            wavefront->ldsChunk->write<c0>(vaddr,
+                            wavefront->ldsChunk->read<c0>(vaddr) & (*e));
+                            break;
+                          case Enums::MO_AOR:
+                          case Enums::MO_ANROR:
+                            wavefront->ldsChunk->write<c0>(vaddr,
+                            wavefront->ldsChunk->read<c0>(vaddr) | (*e));
+                            break;
+                          case Enums::MO_AXOR:
+                          case Enums::MO_ANRXOR:
+                            wavefront->ldsChunk->write<c0>(vaddr,
+                            wavefront->ldsChunk->read<c0>(vaddr) ^ (*e));
+                            break;
+                          case Enums::MO_AINC:
+                          case Enums::MO_ANRINC:
+                            wavefront->ldsChunk->write<c0>(vaddr,
+                            wavefront->ldsChunk->read<c0>(vaddr) + 1);
+                            break;
+                          case Enums::MO_ADEC:
+                          case Enums::MO_ANRDEC:
+                            wavefront->ldsChunk->write<c0>(vaddr,
+                            wavefront->ldsChunk->read<c0>(vaddr) - 1);
+                            break;
+                          case Enums::MO_AEXCH:
+                          case Enums::MO_ANREXCH:
+                            wavefront->ldsChunk->write<c0>(vaddr, (*e));
+                            break;
+                          case Enums::MO_ACAS:
+                          case Enums::MO_ANRCAS:
+                            wavefront->ldsChunk->write<c0>(vaddr,
+                            (wavefront->ldsChunk->read<c0>(vaddr) == (*e)) ?
+                            (*f) : wavefront->ldsChunk->read<c0>(vaddr));
+                            break;
+                          default:
+                            fatal("Unrecognized or invalid HSAIL atomic op "
+                                  "type.\n");
+                            break;
+                        }
+                    } else {
+                        // global atomics go through the memory system as a
+                        // swap request carrying the atomic-op functor
+                        Request *req =
+                            new Request(0, vaddr, sizeof(c0), 0,
+                                        gpuDynInst->computeUnit()->masterId(),
+                                        0, gpuDynInst->wfDynId, i,
+                                        gpuDynInst->makeAtomicOpFunctor<c0>(e,
+                                        f, this->opType));
+
+                        gpuDynInst->setRequestFlags(req);
+                        PacketPtr pkt = new Packet(req, MemCmd::SwapReq);
+                        pkt->dataStatic(d);
+
+                        if (gpuDynInst->computeUnit()->shader->
+                            separate_acquire_release &&
+                            (gpuDynInst->memoryOrder ==
+                             Enums::MEMORY_ORDER_SC_ACQUIRE)) {
+                            // if this atomic has acquire semantics,
+                            // schedule the continuation to perform an
+                            // acquire after the RMW completes
+                            gpuDynInst->execContinuation =
+                                &GPUStaticInst::execAtomicAcq;
+
+                            gpuDynInst->useContinuation = true;
+                        } else {
+                            // the request will be finished when the RMW completes
+                            gpuDynInst->useContinuation = false;
+                        }
+                        // translation is performed in sendRequest()
+                        gpuDynInst->computeUnit()->sendRequest(gpuDynInst, i,
+                                                               pkt);
+                    }
+                }
+
+                ++d;
+                ++e;
+                ++f;
+            }
+
+            gpuDynInst->updateStats();
+        }
+
+        // execAtomicACq will always be called through a continuation.
+        // see comment for execContinuation in gpu_dyn_inst.hh
+        void
+        execAtomicAcq(GPUDynInstPtr gpuDynInst) override
+        {
+            // after performing the RMW, check to see if this instruction
+            // has acquire semantics, and if so, issue an acquire
+            if (!isLocalMem()) {
+                if (gpuDynInst->computeUnit()->shader->separate_acquire_release
+                    && gpuDynInst->memoryOrder ==
+                    Enums::MEMORY_ORDER_SC_ACQUIRE) {
+                    gpuDynInst->statusBitVector = VectorMask(1);
+
+                    // the request will be finished when
+                    // the acquire completes
+                    gpuDynInst->useContinuation = false;
+                    // create request
+                    Request *req = new Request(0, 0, 0, 0,
+                                  gpuDynInst->computeUnit()->masterId(),
+                                  0, gpuDynInst->wfDynId, -1);
+                    req->setFlags(Request::ACQUIRE);
+                    gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req);
+                }
+            }
+        }
+    };
+
+    // Build the GPUStaticInst for an atomic operation: BRIG_ATOMIC_LD and
+    // BRIG_ATOMIC_ST are lowered to ordinary loads/stores; everything else
+    // becomes an AtomicInst, with or without a destination depending on
+    // the opcode.
+    template<typename DataType, typename AddrOperandType, int NumSrcOperands>
+    GPUStaticInst*
+    constructAtomic(const Brig::BrigInstBase *ib, const BrigObject *obj)
+    {
+        const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib;
+
+        if (at->atomicOperation == Brig::BRIG_ATOMIC_LD) {
+            return decodeLd<DataType>(ib, obj);
+        } else if (at->atomicOperation == Brig::BRIG_ATOMIC_ST) {
+            // NOTE(review): the first template argument (MemDataType) is
+            // S8 for every width while the source type varies — confirm
+            // this is the intended memory type for atomic stores
+            switch (ib->type) {
+              case Brig::BRIG_TYPE_B8:
+                return decodeSt<S8,S8>(ib, obj);
+              case Brig::BRIG_TYPE_B16:
+                return decodeSt<S8,S16>(ib, obj);
+              case Brig::BRIG_TYPE_B32:
+                return decodeSt<S8,S32>(ib, obj);
+              case Brig::BRIG_TYPE_B64:
+                return decodeSt<S8,S64>(ib, obj);
+              default: fatal("AtomicSt: Operand type mismatch %d\n", ib->type);
+            }
+        } else {
+            if ((Brig::BrigOpcode)ib->opcode == Brig::BRIG_OPCODE_ATOMICNORET)
+                return new AtomicInst<DataType, AddrOperandType,
+                    NumSrcOperands, false>(ib, obj, "atomicnoret");
+            else
+                return new AtomicInst<DataType, AddrOperandType,
+                    NumSrcOperands, true>(ib, obj, "atomic");
+        }
+    }
+
+    // Pick the address-operand flavor for an atomic and dispatch to
+    // constructAtomic with the matching AddrOperandType.
+    template<typename DataType, int NumSrcOperands>
+    GPUStaticInst*
+    decodeAtomicHelper(const Brig::BrigInstBase *ib, const BrigObject *obj)
+    {
+        // atomicNoRet has no destination, so its address is operand 0;
+        // a plain atomic carries the destination first, address second
+        const bool no_ret =
+            (Brig::BrigOpcode)ib->opcode == Brig::BRIG_OPCODE_ATOMICNORET;
+        unsigned addr_offs = obj->getOperandPtr(ib->operands, no_ret ? 0 : 1);
+
+        BrigRegOperandInfo addr_info = findRegDataType(addr_offs, obj);
+
+        if (addr_info.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
+            return constructAtomic<DataType, NoRegAddrOperand,
+                                   NumSrcOperands>(ib, obj);
+        }
+
+        if (addr_info.kind != Brig::BRIG_KIND_OPERAND_REGISTER) {
+            fatal("Bad atomic register operand kind %d\n", addr_info.kind);
+        }
+
+        // V2/V4 register vectors are not legal address operands
+        switch (addr_info.regKind) {
+          case Brig::BRIG_REGISTER_KIND_SINGLE:
+            return constructAtomic<DataType, SRegAddrOperand,
+                                   NumSrcOperands>(ib, obj);
+          case Brig::BRIG_REGISTER_KIND_DOUBLE:
+            return constructAtomic<DataType, DRegAddrOperand,
+                                   NumSrcOperands>(ib, obj);
+          default:
+            fatal("Bad atomic register operand type %d\n", addr_info.type);
+        }
+    }
+
+
+    // Decode a returning atomic.  CAS takes two data sources (compare
+    // value and swap value); every other atomic operation takes one.
+    template<typename DataType>
+    GPUStaticInst*
+    decodeAtomic(const Brig::BrigInstBase *ib, const BrigObject *obj)
+    {
+        const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib;
+
+        return at->atomicOperation == Brig::BRIG_ATOMIC_CAS ?
+            decodeAtomicHelper<DataType, 2>(ib, obj) :
+            decodeAtomicHelper<DataType, 1>(ib, obj);
+    }
+
+    // Decode a non-returning atomic.  As with decodeAtomic, only CAS
+    // carries two data sources; all other operations carry one.
+    template<typename DataType>
+    GPUStaticInst*
+    decodeAtomicNoRet(const Brig::BrigInstBase *ib, const BrigObject *obj)
+    {
+        const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib;
+
+        if (at->atomicOperation == Brig::BRIG_ATOMIC_CAS)
+            return decodeAtomicHelper<DataType, 2>(ib, obj);
+
+        return decodeAtomicHelper<DataType, 1>(ib, obj);
+    }
+} // namespace HsailISA
+
+#endif // __ARCH_HSAIL_INSTS_MEM_HH__
diff --git a/src/arch/hsail/insts/mem_impl.hh b/src/arch/hsail/insts/mem_impl.hh
new file mode 100644
index 000000000..94f0cd6aa
--- /dev/null
+++ b/src/arch/hsail/insts/mem_impl.hh
@@ -0,0 +1,660 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#include "arch/hsail/generic_types.hh"
+#include "gpu-compute/hsail_code.hh"
+
+// defined in code.cc, but not worth sucking in all of code.h for this
+// at this point
+extern const char *segmentNames[];
+
+namespace HsailISA
+{
+    // Produce the textual form of an lda: "<opcode>_<type> <dest>,<addr>".
+    template<typename DestDataType, typename AddrRegOperandType>
+    void
+    LdaInst<DestDataType, AddrRegOperandType>::generateDisassembly()
+    {
+        std::string dest_str = this->dest.disassemble();
+        std::string addr_str = this->addr.disassemble();
+
+        this->disassembly = csprintf("%s_%s %s,%s", this->opcode,
+                                     DestDataType::label, dest_str, addr_str);
+    }
+
+    // lda computes an effective address: each active lane's address is
+    // written to the destination register; no memory access is made.
+    template<typename DestDataType, typename AddrRegOperandType>
+    void
+    LdaInst<DestDataType, AddrRegOperandType>::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *w = gpuDynInst->wavefront();
+
+        typedef typename DestDataType::CType CType M5_VAR_USED;
+        const VectorMask &mask = w->get_pred();
+        uint64_t eff_addr[VSZ];
+        this->addr.calcVector(w, eff_addr);
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (!mask[lane])
+                continue;
+
+            this->dest.set(w, lane, eff_addr[lane]);
+        }
+    }
+
+    // Produce the textual form of a ld.  Scalar loads print a single
+    // destination; vector loads (ld_v2/ld_v4) print the destination
+    // register list in parentheses.
+    template<typename MemDataType, typename DestDataType,
+             typename AddrRegOperandType>
+    void
+    LdInst<MemDataType, DestDataType, AddrRegOperandType>::generateDisassembly()
+    {
+        switch (num_dest_operands) {
+          case 1:
+            this->disassembly = csprintf("%s_%s_%s %s,%s", this->opcode,
+                                         segmentNames[this->segment],
+                                         MemDataType::label,
+                                         this->dest.disassemble(),
+                                         this->addr.disassemble());
+            break;
+          case 2:
+            this->disassembly = csprintf("%s_%s_%s (%s,%s), %s", this->opcode,
+                                         segmentNames[this->segment],
+                                         MemDataType::label,
+                                         this->dest_vect[0].disassemble(),
+                                         this->dest_vect[1].disassemble(),
+                                         this->addr.disassemble());
+            break;
+          case 4:
+            this->disassembly = csprintf("%s_%s_%s (%s,%s,%s,%s), %s",
+                                         this->opcode,
+                                         segmentNames[this->segment],
+                                         MemDataType::label,
+                                         this->dest_vect[0].disassemble(),
+                                         this->dest_vect[1].disassemble(),
+                                         this->dest_vect[2].disassemble(),
+                                         this->dest_vect[3].disassemble(),
+                                         this->addr.disassemble());
+            break;
+          default:
+            // only 1, 2, and 4 destination registers are legal
+            fatal("Bad ld register dest operand, num vector operands: %d \n",
+                  num_dest_operands);
+            break;
+        }
+    }
+
+    // Map a work-item's private-segment offset to a flat address.
+    //
+    // The private spaces of the work-items in a wavefront are interleaved
+    // at an 8-byte granularity: slot k of every lane is laid out across
+    // the wave before slot k+1.  This gives partial coalescing without
+    // requiring same-width accesses the way the spill-space scheme does
+    // (the same private address may be touched by different-sized
+    // loads/stores).  Assumes no private access is wider than 8 bytes;
+    // the stride would have to grow otherwise.
+    //
+    // NOTE: the compiler does not yet emit enough symbol information to
+    // size the object being accessed, so per-item spaces are simply lined
+    // up back to back rather than resolved through the symbol table.
+    static Addr
+    calcPrivAddr(Addr addr, Wavefront *w, int lane, GPUStaticInst *i)
+    {
+        const Addr slot = addr / 8;          // 8-byte slot within the item
+        const Addr byte_in_slot = addr % 8;  // offset inside that slot
+
+        const Addr flat_addr = w->privBase + slot * 8 * VSZ + lane * 8 +
+            byte_in_slot;
+
+        // the computed address must stay inside this wave's private space
+        assert(flat_addr < w->privBase + (w->privSizePerItem * VSZ));
+
+        return flat_addr;
+    }
+
+ template<typename MemDataType, typename DestDataType,
+ typename AddrRegOperandType>
+ void
+ LdInst<MemDataType, DestDataType,
+ AddrRegOperandType>::execute(GPUDynInstPtr gpuDynInst)
+ {
+ Wavefront *w = gpuDynInst->wavefront();
+
+ typedef typename MemDataType::CType MemCType;
+ const VectorMask &mask = w->get_pred();
+
+ // Kernarg references are handled uniquely for now (no Memory Request
+ // is used), so special-case them up front. Someday we should
+ // make this more realistic, at which we should get rid of this
+ // block and fold this case into the switch below.
+ if (this->segment == Brig::BRIG_SEGMENT_KERNARG) {
+ MemCType val;
+
+ // I assume no vector ld for kernargs
+ assert(num_dest_operands == 1);
+
+ // assuming for the moment that we'll never do register
+ // offsets into kernarg space... just to make life simpler
+ uint64_t address = this->addr.calcUniform();
+
+ val = *(MemCType*)&w->kernelArgs[address];
+
+ DPRINTF(HSAIL, "ld_kernarg [%d] -> %d\n", address, val);
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ this->dest.set(w, lane, val);
+ }
+ }
+
+ return;
+ } else if (this->segment == Brig::BRIG_SEGMENT_ARG) {
+ uint64_t address = this->addr.calcUniform();
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ MemCType val = w->readCallArgMem<MemCType>(lane, address);
+
+ DPRINTF(HSAIL, "ld_arg [%d] -> %llu\n", address,
+ (unsigned long long)val);
+
+ this->dest.set(w, lane, val);
+ }
+ }
+
+ return;
+ }
+
+ GPUDynInstPtr m = gpuDynInst;
+
+ this->addr.calcVector(w, m->addr);
+
+ m->m_op = Enums::MO_LD;
+ m->m_type = MemDataType::memType;
+ m->v_type = DestDataType::vgprType;
+
+ m->exec_mask = w->execMask();
+ m->statusBitVector = 0;
+ m->equiv = this->equivClass;
+ m->memoryOrder = getGenericMemoryOrder(this->memoryOrder);
+
+ m->scope = getGenericMemoryScope(this->memoryScope);
+
+ if (num_dest_operands == 1) {
+ m->dst_reg = this->dest.regIndex();
+ m->n_reg = 1;
+ } else {
+ m->n_reg = num_dest_operands;
+ for (int i = 0; i < num_dest_operands; ++i) {
+ m->dst_reg_vec[i] = this->dest_vect[i].regIndex();
+ }
+ }
+
+ m->simdId = w->simdId;
+ m->wfSlotId = w->wfSlotId;
+ m->wfDynId = w->wfDynId;
+ m->kern_id = w->kern_id;
+ m->cu_id = w->computeUnit->cu_id;
+ m->latency.init(&w->computeUnit->shader->tick_cnt);
+
+ switch (this->segment) {
+ case Brig::BRIG_SEGMENT_GLOBAL:
+ m->s_type = SEG_GLOBAL;
+ m->pipeId = GLBMEM_PIPE;
+ m->latency.set(w->computeUnit->shader->ticks(1));
+
+ // this is a complete hack to get around a compiler bug
+ // (the compiler currently generates global access for private
+ // addresses (starting from 0). We need to add the private offset)
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (m->addr[lane] < w->privSizePerItem) {
+ if (mask[lane]) {
+ // what is the size of the object we are accessing?
+ // find base for for this wavefront
+
+ // calcPrivAddr will fail if accesses are unaligned
+ assert(!((sizeof(MemCType) - 1) & m->addr[lane]));
+
+ Addr privAddr = calcPrivAddr(m->addr[lane], w, lane,
+ this);
+
+ m->addr[lane] = privAddr;
+ }
+ }
+ }
+
+ w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+ w->outstanding_reqs_rd_gm++;
+ w->rd_gm_reqs_in_pipe--;
+ break;
+
+ case Brig::BRIG_SEGMENT_SPILL:
+ assert(num_dest_operands == 1);
+ m->s_type = SEG_SPILL;
+ m->pipeId = GLBMEM_PIPE;
+ m->latency.set(w->computeUnit->shader->ticks(1));
+ {
+ for (int lane = 0; lane < VSZ; ++lane) {
+ // note: this calculation will NOT WORK if the compiler
+ // ever generates loads/stores to the same address with
+ // different widths (e.g., a ld_u32 addr and a ld_u16 addr)
+ if (mask[lane]) {
+ assert(m->addr[lane] < w->spillSizePerItem);
+
+ m->addr[lane] = m->addr[lane] * w->spillWidth +
+ lane * sizeof(MemCType) + w->spillBase;
+
+ w->last_addr[lane] = m->addr[lane];
+ }
+ }
+ }
+
+ w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+ w->outstanding_reqs_rd_gm++;
+ w->rd_gm_reqs_in_pipe--;
+ break;
+
+ case Brig::BRIG_SEGMENT_GROUP:
+ m->s_type = SEG_SHARED;
+ m->pipeId = LDSMEM_PIPE;
+ m->latency.set(w->computeUnit->shader->ticks(24));
+ w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m);
+ w->outstanding_reqs_rd_lm++;
+ w->rd_lm_reqs_in_pipe--;
+ break;
+
+ case Brig::BRIG_SEGMENT_READONLY:
+ m->s_type = SEG_READONLY;
+ m->pipeId = GLBMEM_PIPE;
+ m->latency.set(w->computeUnit->shader->ticks(1));
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ assert(m->addr[lane] + sizeof(MemCType) <= w->roSize);
+ m->addr[lane] += w->roBase;
+ }
+ }
+
+ w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+ w->outstanding_reqs_rd_gm++;
+ w->rd_gm_reqs_in_pipe--;
+ break;
+
+ case Brig::BRIG_SEGMENT_PRIVATE:
+ m->s_type = SEG_PRIVATE;
+ m->pipeId = GLBMEM_PIPE;
+ m->latency.set(w->computeUnit->shader->ticks(1));
+ {
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ assert(m->addr[lane] < w->privSizePerItem);
+
+ m->addr[lane] = m->addr[lane] +
+ lane * sizeof(MemCType) + w->privBase;
+ }
+ }
+ }
+ w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+ w->outstanding_reqs_rd_gm++;
+ w->rd_gm_reqs_in_pipe--;
+ break;
+
+ default:
+ fatal("Load to unsupported segment %d %llxe\n", this->segment,
+ m->addr[0]);
+ }
+
+ w->outstanding_reqs++;
+ w->mem_reqs_in_pipe--;
+ }
+
+    template<typename OperationType, typename SrcDataType,
+             typename AddrRegOperandType>
+    void
+    StInst<OperationType, SrcDataType,
+           AddrRegOperandType>::execute(GPUDynInstPtr gpuDynInst)
+    {
+        // Issue a vector store: gather each active lane's source value(s)
+        // into the dynamic instruction's data buffer, fill in the memory
+        // request metadata, then translate per-lane addresses and push the
+        // request to the global or local memory pipeline based on the
+        // HSAIL segment.
+        Wavefront *w = gpuDynInst->wavefront();
+
+        typedef typename OperationType::CType CType;
+
+        const VectorMask &mask = w->get_pred();
+
+        // arg references are handled uniquely for now (no Memory Request
+        // is used), so special-case them up front. Someday we should
+        // make this more realistic, at which we should get rid of this
+        // block and fold this case into the switch below.
+        if (this->segment == Brig::BRIG_SEGMENT_ARG) {
+            uint64_t address = this->addr.calcUniform();
+
+            for (int lane = 0; lane < VSZ; ++lane) {
+                if (mask[lane]) {
+                    CType data = this->src.template get<CType>(w, lane);
+                    DPRINTF(HSAIL, "st_arg [%d] <- %d\n", address, data);
+                    w->writeCallArgMem<CType>(lane, address, data);
+                }
+            }
+
+            return;
+        }
+
+        GPUDynInstPtr m = gpuDynInst;
+
+        m->exec_mask = w->execMask();
+
+        this->addr.calcVector(w, m->addr);
+
+        // Copy the store data into the request buffer.  Vector stores
+        // (st_v2/st_v4) lay out one VSZ-sized chunk per source operand.
+        if (num_src_operands == 1) {
+            for (int lane = 0; lane < VSZ; ++lane) {
+                if (mask[lane]) {
+                    ((CType*)m->d_data)[lane] =
+                        this->src.template get<CType>(w, lane);
+                }
+            }
+        } else {
+            for (int k= 0; k < num_src_operands; ++k) {
+                for (int lane = 0; lane < VSZ; ++lane) {
+                    if (mask[lane]) {
+                        ((CType*)m->d_data)[k * VSZ + lane] =
+                            this->src_vect[k].template get<CType>(w, lane);
+                    }
+                }
+            }
+        }
+
+        m->m_op = Enums::MO_ST;
+        m->m_type = OperationType::memType;
+        m->v_type = OperationType::vgprType;
+
+        m->statusBitVector = 0;
+        m->equiv = this->equivClass;
+
+        if (num_src_operands == 1) {
+            m->n_reg = 1;
+        } else {
+            m->n_reg = num_src_operands;
+        }
+
+        m->memoryOrder = getGenericMemoryOrder(this->memoryOrder);
+
+        m->scope = getGenericMemoryScope(this->memoryScope);
+
+        m->simdId = w->simdId;
+        m->wfSlotId = w->wfSlotId;
+        m->wfDynId = w->wfDynId;
+        m->kern_id = w->kern_id;
+        m->cu_id = w->computeUnit->cu_id;
+        m->latency.init(&w->computeUnit->shader->tick_cnt);
+
+        // Segment-specific address translation, latency, and pipeline
+        // selection; the request is enqueued and the wavefront's
+        // outstanding-store accounting is updated per segment.
+        switch (this->segment) {
+        case Brig::BRIG_SEGMENT_GLOBAL:
+            m->s_type = SEG_GLOBAL;
+            m->pipeId = GLBMEM_PIPE;
+            m->latency.set(w->computeUnit->shader->ticks(1));
+
+            // this is a complete hack to get around a compiler bug
+            // (the compiler currently generates global access for private
+            // addresses (starting from 0). We need to add the private offset)
+            for (int lane = 0; lane < VSZ; ++lane) {
+                if (mask[lane]) {
+                    if (m->addr[lane] < w->privSizePerItem) {
+
+                        // calcPrivAddr will fail if accesses are unaligned
+                        assert(!((sizeof(CType)-1) & m->addr[lane]));
+
+                        Addr privAddr = calcPrivAddr(m->addr[lane], w, lane,
+                                                     this);
+
+                        m->addr[lane] = privAddr;
+                    }
+                }
+            }
+
+            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+            w->outstanding_reqs_wr_gm++;
+            w->wr_gm_reqs_in_pipe--;
+            break;
+
+        case Brig::BRIG_SEGMENT_SPILL:
+            assert(num_src_operands == 1);
+            m->s_type = SEG_SPILL;
+            m->pipeId = GLBMEM_PIPE;
+            m->latency.set(w->computeUnit->shader->ticks(1));
+            {
+                // Spill slots are strided by spillWidth and offset per
+                // lane, relative to this wavefront's spill base.
+                for (int lane = 0; lane < VSZ; ++lane) {
+                    if (mask[lane]) {
+                        assert(m->addr[lane] < w->spillSizePerItem);
+
+                        m->addr[lane] = m->addr[lane] * w->spillWidth +
+                                        lane * sizeof(CType) + w->spillBase;
+                    }
+                }
+            }
+
+            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+            w->outstanding_reqs_wr_gm++;
+            w->wr_gm_reqs_in_pipe--;
+            break;
+
+        case Brig::BRIG_SEGMENT_GROUP:
+            m->s_type = SEG_SHARED;
+            m->pipeId = LDSMEM_PIPE;
+            m->latency.set(w->computeUnit->shader->ticks(24));
+            w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m);
+            w->outstanding_reqs_wr_lm++;
+            w->wr_lm_reqs_in_pipe--;
+            break;
+
+        case Brig::BRIG_SEGMENT_PRIVATE:
+            m->s_type = SEG_PRIVATE;
+            m->pipeId = GLBMEM_PIPE;
+            m->latency.set(w->computeUnit->shader->ticks(1));
+            {
+                // Private addresses are offset per lane into this
+                // wavefront's private segment.
+                for (int lane = 0; lane < VSZ; ++lane) {
+                    if (mask[lane]) {
+                        assert(m->addr[lane] < w->privSizePerItem);
+                        m->addr[lane] = m->addr[lane] + lane *
+                            sizeof(CType)+w->privBase;
+                    }
+                }
+            }
+
+            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+            w->outstanding_reqs_wr_gm++;
+            w->wr_gm_reqs_in_pipe--;
+            break;
+
+        default:
+            fatal("Store to unsupported segment %d\n", this->segment);
+        }
+
+        // One more memory request in flight for this wavefront.
+        w->outstanding_reqs++;
+        w->mem_reqs_in_pipe--;
+    }
+
+    template<typename OperationType, typename SrcDataType,
+             typename AddrRegOperandType>
+    void
+    StInst<OperationType, SrcDataType,
+           AddrRegOperandType>::generateDisassembly()
+    {
+        // Build "<opcode>_<segment>_<type> <src(s)>,<addr>"; vector
+        // stores (st_v2/st_v4) print their sources as a parenthesized
+        // list.
+        switch (num_src_operands) {
+          case 1:
+            this->disassembly = csprintf("%s_%s_%s %s,%s", this->opcode,
+                                         segmentNames[this->segment],
+                                         OperationType::label,
+                                         this->src.disassemble(),
+                                         this->addr.disassemble());
+            break;
+          case 2:
+            this->disassembly = csprintf("%s_%s_%s (%s,%s), %s", this->opcode,
+                                         segmentNames[this->segment],
+                                         OperationType::label,
+                                         this->src_vect[0].disassemble(),
+                                         this->src_vect[1].disassemble(),
+                                         this->addr.disassemble());
+            break;
+          case 4:
+            this->disassembly = csprintf("%s_%s_%s (%s,%s,%s,%s), %s",
+                                         this->opcode,
+                                         segmentNames[this->segment],
+                                         OperationType::label,
+                                         this->src_vect[0].disassemble(),
+                                         this->src_vect[1].disassemble(),
+                                         this->src_vect[2].disassemble(),
+                                         this->src_vect[3].disassemble(),
+                                         this->addr.disassemble());
+            break;
+          // bug fix: this is the store disassembler, so the error text
+          // must say "st", not "ld" (copy-paste from LdInst).
+          default: fatal("Bad st register src operand, num vector operands: "
+                         "%d \n", num_src_operands);
+            break;
+        }
+    }
+
+    template<typename DataType, typename AddrRegOperandType, int NumSrcOperands,
+        bool HasDst>
+    void
+    AtomicInst<DataType, AddrRegOperandType, NumSrcOperands,
+        HasDst>::execute(GPUDynInstPtr gpuDynInst)
+    {
+        // Issue an atomic memory operation: per-lane addresses come from
+        // the address operand; src[0] supplies the atomic operand
+        // (a_data) and src[1], when present, the CAS comparand (x_data).
+        typedef typename DataType::CType CType;
+
+        Wavefront *w = gpuDynInst->wavefront();
+
+        GPUDynInstPtr m = gpuDynInst;
+
+        this->addr.calcVector(w, m->addr);
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            ((CType *)m->a_data)[lane] =
+                this->src[0].template get<CType>(w, lane);
+        }
+
+        // load second source operand for CAS
+        if (NumSrcOperands > 1) {
+            for (int lane = 0; lane < VSZ; ++lane) {
+                ((CType*)m->x_data)[lane] =
+                    this->src[1].template get<CType>(w, lane);
+            }
+        }
+
+        assert(NumSrcOperands <= 2);
+
+        m->m_op = this->opType;
+        m->m_type = DataType::memType;
+        m->v_type = DataType::vgprType;
+
+        m->exec_mask = w->execMask();
+        m->statusBitVector = 0;
+        m->equiv = 0;  // atomics don't have an equivalence class operand
+        m->n_reg = 1;
+        m->memoryOrder = getGenericMemoryOrder(this->memoryOrder);
+
+        m->scope = getGenericMemoryScope(this->memoryScope);
+
+        if (HasDst) {
+            m->dst_reg = this->dest.regIndex();
+        }
+
+        m->simdId = w->simdId;
+        m->wfSlotId = w->wfSlotId;
+        m->wfDynId = w->wfDynId;
+        m->kern_id = w->kern_id;
+        m->cu_id = w->computeUnit->cu_id;
+        m->latency.init(&w->computeUnit->shader->tick_cnt);
+
+        // Atomics are accounted as both a read and a write on the
+        // chosen pipeline (both rd_* and wr_* counters are updated).
+        switch (this->segment) {
+        case Brig::BRIG_SEGMENT_GLOBAL:
+            m->s_type = SEG_GLOBAL;
+            m->latency.set(w->computeUnit->shader->ticks(64));
+            m->pipeId = GLBMEM_PIPE;
+
+            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+            w->outstanding_reqs_wr_gm++;
+            w->wr_gm_reqs_in_pipe--;
+            w->outstanding_reqs_rd_gm++;
+            w->rd_gm_reqs_in_pipe--;
+            break;
+
+        case Brig::BRIG_SEGMENT_GROUP:
+            m->s_type = SEG_SHARED;
+            m->pipeId = LDSMEM_PIPE;
+            m->latency.set(w->computeUnit->shader->ticks(24));
+            w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m);
+            w->outstanding_reqs_wr_lm++;
+            w->wr_lm_reqs_in_pipe--;
+            w->outstanding_reqs_rd_lm++;
+            w->rd_lm_reqs_in_pipe--;
+            break;
+
+        default:
+            fatal("Atomic op to unsupported segment %d\n",
+                  this->segment);
+        }
+
+        w->outstanding_reqs++;
+        w->mem_reqs_in_pipe--;
+    }
+
+ const char* atomicOpToString(Brig::BrigAtomicOperation atomicOp);
+
+    template<typename DataType, typename AddrRegOperandType, int NumSrcOperands,
+        bool HasDst>
+    void
+    AtomicInst<DataType, AddrRegOperandType, NumSrcOperands,
+        HasDst>::generateDisassembly()
+    {
+        // Mnemonic is "<opcode>_<atomicOp>_<segment>_<type>" followed by
+        // the destination (when present), the address operand, and each
+        // source operand appended comma-separated.
+        std::string mnemonic =
+            csprintf("%s_%s_%s_%s", this->opcode,
+                     atomicOpToString(this->atomicOperation),
+                     segmentNames[this->segment],
+                     DataType::label);
+
+        if (HasDst) {
+            this->disassembly = csprintf("%s %s,%s", mnemonic,
+                                         this->dest.disassemble(),
+                                         this->addr.disassemble());
+        } else {
+            this->disassembly = csprintf("%s %s", mnemonic,
+                                         this->addr.disassemble());
+        }
+
+        for (int opIdx = 0; opIdx < NumSrcOperands; ++opIdx) {
+            this->disassembly += ",";
+            this->disassembly += this->src[opIdx].disassemble();
+        }
+    }
+} // namespace HsailISA
diff --git a/src/arch/hsail/insts/pseudo_inst.cc b/src/arch/hsail/insts/pseudo_inst.cc
new file mode 100644
index 000000000..9506a80ab
--- /dev/null
+++ b/src/arch/hsail/insts/pseudo_inst.cc
@@ -0,0 +1,787 @@
+/*
+ * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Marc Orr
+ */
+
+#include <csignal>
+
+#include "arch/hsail/insts/decl.hh"
+#include "arch/hsail/insts/mem.hh"
+
+namespace HsailISA
+{
+ // Pseudo (or magic) instructions are overloaded on the hsail call
+ // instruction, because of its flexible parameter signature.
+
+ // To add a new magic instruction:
+ // 1. Add an entry to the enum.
+ // 2. Implement it in the switch statement below (Call::exec).
+ // 3. Add a utility function to hsa/hsail-gpu-compute/util/magicinst.h,
+ // so its easy to call from an OpenCL kernel.
+
+ // This enum should be identical to the enum in
+ // hsa/hsail-gpu-compute/util/magicinst.h
+    enum
+    {
+        // wavefront/lane debug printing
+        MAGIC_PRINT_WF_32 = 0,
+        MAGIC_PRINT_WF_64,
+        MAGIC_PRINT_LANE,
+        MAGIC_PRINT_LANE_64,
+        MAGIC_PRINT_WF_FLOAT,
+        // raise a debugger-catchable signal from simulated code
+        MAGIC_SIM_BREAK,
+        // cross-lane arithmetic and lane-mask construction
+        MAGIC_PREF_SUM,
+        MAGIC_REDUCTION,
+        MAGIC_MASKLANE_LOWER,
+        MAGIC_MASKLANE_UPPER,
+        // wavefront barrier join/wait
+        MAGIC_JOIN_WF_BAR,
+        MAGIC_WAIT_WF_BAR,
+        MAGIC_PANIC,
+        // memory operations issued directly from a pseudo instruction
+        MAGIC_ATOMIC_NR_ADD_GLOBAL_U32_REG,
+        MAGIC_ATOMIC_NR_ADD_GROUP_U32_REG,
+        MAGIC_LOAD_GLOBAL_U32_REG,
+        MAGIC_XACT_CAS_LD,
+        // most-significant active thread queries
+        MAGIC_MOST_SIG_THD,
+        MAGIC_MOST_SIG_BROADCAST,
+        // printing gated on a specific dynamic wavefront id
+        MAGIC_PRINT_WFID_32,
+        MAGIC_PRINT_WFID_64
+    };
+
+    void
+    Call::execPseudoInst(Wavefront *w, GPUDynInstPtr gpuDynInst)
+    {
+        // Decode and dispatch a magic (pseudo) instruction.  Slot 0 of
+        // src1 carries the magic opcode; every active lane must agree on
+        // it, otherwise this is a fatal error.
+        const VectorMask &mask = w->get_pred();
+
+        int op = 0;
+        bool got_op = false;
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (mask[lane]) {
+                int src_val0 = src1.get<int>(w, lane, 0);
+                if (got_op) {
+                    if (src_val0 != op) {
+                        fatal("Multiple magic instructions per PC not "
+                              "supported\n");
+                    }
+                } else {
+                    op = src_val0;
+                    got_op = true;
+                }
+            }
+        }
+
+        switch(op) {
+          case MAGIC_PRINT_WF_32:
+            MagicPrintWF32(w);
+            break;
+          case MAGIC_PRINT_WF_64:
+            MagicPrintWF64(w);
+            break;
+          case MAGIC_PRINT_LANE:
+            MagicPrintLane(w);
+            break;
+          case MAGIC_PRINT_LANE_64:
+            MagicPrintLane64(w);
+            break;
+          case MAGIC_PRINT_WF_FLOAT:
+            MagicPrintWFFloat(w);
+            break;
+          case MAGIC_SIM_BREAK:
+            MagicSimBreak(w);
+            break;
+          case MAGIC_PREF_SUM:
+            MagicPrefixSum(w);
+            break;
+          case MAGIC_REDUCTION:
+            MagicReduction(w);
+            break;
+          case MAGIC_MASKLANE_LOWER:
+            MagicMaskLower(w);
+            break;
+          case MAGIC_MASKLANE_UPPER:
+            MagicMaskUpper(w);
+            break;
+          case MAGIC_JOIN_WF_BAR:
+            MagicJoinWFBar(w);
+            break;
+          case MAGIC_WAIT_WF_BAR:
+            MagicWaitWFBar(w);
+            break;
+          case MAGIC_PANIC:
+            MagicPanic(w);
+            break;
+
+          // atomic instructions
+          case MAGIC_ATOMIC_NR_ADD_GLOBAL_U32_REG:
+            MagicAtomicNRAddGlobalU32Reg(w, gpuDynInst);
+            break;
+
+          case MAGIC_ATOMIC_NR_ADD_GROUP_U32_REG:
+            MagicAtomicNRAddGroupU32Reg(w, gpuDynInst);
+            break;
+
+          case MAGIC_LOAD_GLOBAL_U32_REG:
+            MagicLoadGlobalU32Reg(w, gpuDynInst);
+            break;
+
+          case MAGIC_XACT_CAS_LD:
+            MagicXactCasLd(w);
+            break;
+
+          case MAGIC_MOST_SIG_THD:
+            MagicMostSigThread(w);
+            break;
+
+          case MAGIC_MOST_SIG_BROADCAST:
+            MagicMostSigBroadcast(w);
+            break;
+
+          case MAGIC_PRINT_WFID_32:
+            MagicPrintWF32ID(w);
+            break;
+
+          case MAGIC_PRINT_WFID_64:
+            MagicPrintWFID64(w);
+            break;
+
+          default: fatal("unrecognized magic instruction: %d\n", op);
+        }
+    }
+
+    void
+    Call::MagicPrintLane(Wavefront *w)
+    {
+        // Debug print of one 32-bit value per active lane; src1[1] is
+        // the value, src1[2] selects hex (non-zero) vs decimal output.
+        // Compiled out unless tracing is enabled.
+    #if TRACING_ON
+        const VectorMask &mask = w->get_pred();
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (mask[lane]) {
+                int src_val1 = src1.get<int>(w, lane, 1);
+                int src_val2 = src1.get<int>(w, lane, 2);
+                if (src_val2) {
+                    DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: 0x%x\n",
+                             disassemble(), w->computeUnit->cu_id, w->simdId,
+                             w->wfSlotId, lane, src_val1);
+                } else {
+                    DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: %d\n",
+                             disassemble(), w->computeUnit->cu_id, w->simdId,
+                             w->wfSlotId, lane, src_val1);
+                }
+            }
+        }
+    #endif
+    }
+
+    void
+    Call::MagicPrintLane64(Wavefront *w)
+    {
+        // 64-bit variant of MagicPrintLane: src1[1] is read as int64_t,
+        // src1[2] selects hex vs decimal output.
+        // NOTE(review): the format strings use %x/%d for a 64-bit value;
+        // gem5's cprintf is type-aware so the full value should still
+        // print, but confirm the intended field width (cf. %016x in
+        // MagicPrintWF64).
+    #if TRACING_ON
+        const VectorMask &mask = w->get_pred();
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (mask[lane]) {
+                int64_t src_val1 = src1.get<int64_t>(w, lane, 1);
+                int src_val2 = src1.get<int>(w, lane, 2);
+                if (src_val2) {
+                    DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: 0x%x\n",
+                             disassemble(), w->computeUnit->cu_id, w->simdId,
+                             w->wfSlotId, lane, src_val1);
+                } else {
+                    DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: %d\n",
+                             disassemble(), w->computeUnit->cu_id, w->simdId,
+                             w->wfSlotId, lane, src_val1);
+                }
+            }
+        }
+    #endif
+    }
+
+    void
+    Call::MagicPrintWF32(Wavefront *w)
+    {
+        // Debug print of the whole wavefront's 32-bit values, 8 lanes
+        // per row; src1[1] is the value, src1[2] selects hex vs decimal.
+        // Inactive lanes print as "xxxxxxxx".
+    #if TRACING_ON
+        const VectorMask &mask = w->get_pred();
+        std::string res_str;
+        res_str = csprintf("krl_prt (%s)\n", disassemble());
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            // start a new row (tagged with the dynamic WF id) every 8 lanes
+            if (!(lane & 7)) {
+                res_str += csprintf("DB%03d: ", (int)w->wfDynId);
+            }
+
+            if (mask[lane]) {
+                int src_val1 = src1.get<int>(w, lane, 1);
+                int src_val2 = src1.get<int>(w, lane, 2);
+
+                if (src_val2) {
+                    res_str += csprintf("%08x", src_val1);
+                } else {
+                    res_str += csprintf("%08d", src_val1);
+                }
+            } else {
+                res_str += csprintf("xxxxxxxx");
+            }
+
+            if ((lane & 7) == 7) {
+                res_str += csprintf("\n");
+            } else {
+                res_str += csprintf(" ");
+            }
+        }
+
+        res_str += "\n\n";
+        DPRINTFN(res_str.c_str());
+    #endif
+    }
+
+    void
+    Call::MagicPrintWF32ID(Wavefront *w)
+    {
+        // Like MagicPrintWF32, but the output is emitted only when this
+        // wavefront's dynamic id matches src1[3] (taken from the last
+        // active lane).
+    #if TRACING_ON
+        const VectorMask &mask = w->get_pred();
+        std::string res_str;
+        int src_val3 = -1;
+        res_str = csprintf("krl_prt (%s)\n", disassemble());
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (!(lane & 7)) {
+                res_str += csprintf("DB%03d: ", (int)w->wfDynId);
+            }
+
+            if (mask[lane]) {
+                int src_val1 = src1.get<int>(w, lane, 1);
+                int src_val2 = src1.get<int>(w, lane, 2);
+                src_val3 = src1.get<int>(w, lane, 3);
+
+                if (src_val2) {
+                    res_str += csprintf("%08x", src_val1);
+                } else {
+                    res_str += csprintf("%08d", src_val1);
+                }
+            } else {
+                res_str += csprintf("xxxxxxxx");
+            }
+
+            if ((lane & 7) == 7) {
+                res_str += csprintf("\n");
+            } else {
+                res_str += csprintf(" ");
+            }
+        }
+
+        res_str += "\n\n";
+        // only print when this wavefront is the one being filtered for
+        if (w->wfDynId == src_val3) {
+            DPRINTFN(res_str.c_str());
+        }
+    #endif
+    }
+
+    void
+    Call::MagicPrintWF64(Wavefront *w)
+    {
+        // 64-bit variant of MagicPrintWF32: 4 lanes per row, 16-digit
+        // fields; src1[2] selects hex vs decimal.  Inactive lanes print
+        // as "xxxxxxxxxxxxxxxx".
+    #if TRACING_ON
+        const VectorMask &mask = w->get_pred();
+        std::string res_str;
+        res_str = csprintf("krl_prt (%s)\n", disassemble());
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            // start a new row (tagged with the dynamic WF id) every 4 lanes
+            if (!(lane & 3)) {
+                res_str += csprintf("DB%03d: ", (int)w->wfDynId);
+            }
+
+            if (mask[lane]) {
+                int64_t src_val1 = src1.get<int64_t>(w, lane, 1);
+                int src_val2 = src1.get<int>(w, lane, 2);
+
+                if (src_val2) {
+                    res_str += csprintf("%016x", src_val1);
+                } else {
+                    res_str += csprintf("%016d", src_val1);
+                }
+            } else {
+                res_str += csprintf("xxxxxxxxxxxxxxxx");
+            }
+
+            if ((lane & 3) == 3) {
+                res_str += csprintf("\n");
+            } else {
+                res_str += csprintf(" ");
+            }
+        }
+
+        res_str += "\n\n";
+        DPRINTFN(res_str.c_str());
+    #endif
+    }
+
+    void
+    Call::MagicPrintWFID64(Wavefront *w)
+    {
+        // Like MagicPrintWF64, but the output is emitted only when this
+        // wavefront's dynamic id matches src1[3] (taken from the last
+        // active lane).
+    #if TRACING_ON
+        const VectorMask &mask = w->get_pred();
+        std::string res_str;
+        int src_val3 = -1;
+        res_str = csprintf("krl_prt (%s)\n", disassemble());
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (!(lane & 3)) {
+                res_str += csprintf("DB%03d: ", (int)w->wfDynId);
+            }
+
+            if (mask[lane]) {
+                int64_t src_val1 = src1.get<int64_t>(w, lane, 1);
+                int src_val2 = src1.get<int>(w, lane, 2);
+                src_val3 = src1.get<int>(w, lane, 3);
+
+                if (src_val2) {
+                    res_str += csprintf("%016x", src_val1);
+                } else {
+                    res_str += csprintf("%016d", src_val1);
+                }
+            } else {
+                res_str += csprintf("xxxxxxxxxxxxxxxx");
+            }
+
+            if ((lane & 3) == 3) {
+                res_str += csprintf("\n");
+            } else {
+                res_str += csprintf(" ");
+            }
+        }
+
+        res_str += "\n\n";
+        // only print when this wavefront is the one being filtered for
+        if (w->wfDynId == src_val3) {
+            DPRINTFN(res_str.c_str());
+        }
+    #endif
+    }
+
+    void
+    Call::MagicPrintWFFloat(Wavefront *w)
+    {
+        // Debug print of one float per lane (src1[1]), 8 lanes per row;
+        // inactive lanes print as "xxxxxxxx".
+    #if TRACING_ON
+        const VectorMask &mask = w->get_pred();
+        std::string res_str;
+        res_str = csprintf("krl_prt (%s)\n", disassemble());
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (!(lane & 7)) {
+                res_str += csprintf("DB%03d: ", (int)w->wfDynId);
+            }
+
+            if (mask[lane]) {
+                float src_val1 = src1.get<float>(w, lane, 1);
+                res_str += csprintf("%08f", src_val1);
+            } else {
+                res_str += csprintf("xxxxxxxx");
+            }
+
+            if ((lane & 7) == 7) {
+                res_str += csprintf("\n");
+            } else {
+                res_str += csprintf(" ");
+            }
+        }
+
+        res_str += "\n\n";
+        DPRINTFN(res_str.c_str());
+    #endif
+    }
+
+    // raises a signal that GDB will catch
+    // when done with the break, type "signal 0" in gdb to continue
+    void
+    Call::MagicSimBreak(Wavefront *w)
+    {
+        // Dump identifying state for this wavefront (ids, CU, exec mask)
+        // and then stop the simulator under a debugger via SIGTRAP.
+        std::string res_str;
+        // print out state for this wavefront and then break
+        res_str = csprintf("Breakpoint encountered for wavefront %i\n",
+                           w->wfSlotId);
+
+        res_str += csprintf("  Kern ID: %i\n", w->kern_id);
+        res_str += csprintf("  Phase ID: %i\n", w->simdId);
+        res_str += csprintf("  Executing on CU #%i\n", w->computeUnit->cu_id);
+        res_str += csprintf("  Exec mask: ");
+
+        // exec mask is printed most-significant lane first
+        for (int i = VSZ - 1; i >= 0; --i) {
+            if (w->execMask(i))
+                res_str += "1";
+            else
+                res_str += "0";
+
+            if ((i & 7) == 7)
+                res_str += " ";
+        }
+
+        res_str += csprintf("(0x%016llx)\n", w->execMask().to_ullong());
+
+        res_str += "\nHelpful debugging hints:\n";
+        res_str += "   Check out w->s_reg / w->d_reg for register state\n";
+
+        res_str += "\n\n";
+        DPRINTFN(res_str.c_str());
+        fflush(stdout);
+
+        raise(SIGTRAP);
+    }
+
+    void
+    Call::MagicPrefixSum(Wavefront *w)
+    {
+        // Exclusive prefix sum across active lanes: each active lane
+        // receives the sum of the src1[1] values of all active lanes
+        // before it (the first active lane receives 0).
+        const VectorMask &active = w->get_pred();
+
+        int running_sum = 0;
+        for (int ln = 0; ln < VSZ; ++ln) {
+            if (!active[ln])
+                continue;
+
+            dest.set<int>(w, ln, running_sum);
+            running_sum += src1.get<int>(w, ln, 1);
+        }
+    }
+
+    void
+    Call::MagicReduction(Wavefront *w)
+    {
+        // Wavefront-wide sum reduction: add src1[1] from every active
+        // lane (up to VSZ inputs) and broadcast the total back to each
+        // active lane.
+        const VectorMask &active = w->get_pred();
+
+        int total = 0;
+        for (int ln = 0; ln < VSZ; ++ln) {
+            if (active[ln])
+                total += src1.get<int>(w, ln, 1);
+        }
+
+        for (int ln = 0; ln < VSZ; ++ln) {
+            if (active[ln])
+                dest.set<int>(w, ln, total);
+        }
+    }
+
+    void
+    Call::MagicMaskLower(Wavefront *w)
+    {
+        // Build a bitmask over the lower half of the wavefront: bit ln
+        // is set when lane ln (ln < VSZ/2) is active and supplied a
+        // non-zero src1[1].  The mask is written to every active lane.
+        const VectorMask &active = w->get_pred();
+
+        int lane_bits = 0;
+        for (int ln = 0; ln < VSZ / 2; ++ln) {
+            if (active[ln] && src1.get<int>(w, ln, 1)) {
+                lane_bits |= (uint32_t)1 << ln;
+            }
+        }
+
+        for (int ln = 0; ln < VSZ; ++ln) {
+            if (active[ln])
+                dest.set<int>(w, ln, lane_bits);
+        }
+    }
+
+    void
+    Call::MagicMaskUpper(Wavefront *w)
+    {
+        // Build a bitmask over the upper half of the wavefront: bit
+        // (ln - VSZ/2) is set when lane ln (ln >= VSZ/2) is active and
+        // supplied a non-zero src1[1].  The mask is written to every
+        // active lane.
+        const VectorMask &active = w->get_pred();
+
+        int lane_bits = 0;
+        for (int ln = VSZ / 2; ln < VSZ; ++ln) {
+            if (active[ln] && src1.get<int>(w, ln, 1)) {
+                lane_bits |= (uint32_t)1 << (ln - (VSZ / 2));
+            }
+        }
+
+        for (int ln = 0; ln < VSZ; ++ln) {
+            if (active[ln])
+                dest.set<int>(w, ln, lane_bits);
+        }
+    }
+
+    void
+    Call::MagicJoinWFBar(Wavefront *w)
+    {
+        // Join the wavefront barrier: bump each active lane's barrier
+        // count, then raise the wavefront-wide maximum if any lane
+        // exceeded it.
+        const VectorMask &mask = w->get_pred();
+        int max_cnt = 0;
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (mask[lane]) {
+                w->bar_cnt[lane]++;
+
+                if (w->bar_cnt[lane] > max_cnt) {
+                    max_cnt = w->bar_cnt[lane];
+                }
+            }
+        }
+
+        if (max_cnt > w->max_bar_cnt) {
+            w->max_bar_cnt = max_cnt;
+        }
+    }
+
+    void
+    Call::MagicWaitWFBar(Wavefront *w)
+    {
+        // Wait on the wavefront barrier: decrement each active lane's
+        // barrier count, recompute the maximum outstanding count, and
+        // lower max_bar_cnt if it dropped.  Then discard everything past
+        // the next instruction in the instruction buffer and drop any
+        // pending fetch, forcing a refetch after the barrier.
+        const VectorMask &mask = w->get_pred();
+        int max_cnt = 0;
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (mask[lane]) {
+                w->bar_cnt[lane]--;
+            }
+
+            // NOTE(review): unlike MagicJoinWFBar, this max scan runs for
+            // every lane, not just active ones — confirm that inactive
+            // lanes' counts are intentionally included here.
+            if (w->bar_cnt[lane] > max_cnt) {
+                max_cnt = w->bar_cnt[lane];
+            }
+        }
+
+        if (max_cnt < w->max_bar_cnt) {
+            w->max_bar_cnt = max_cnt;
+        }
+
+        w->instructionBuffer.erase(w->instructionBuffer.begin() + 1,
+                                   w->instructionBuffer.end());
+        if (w->pendingFetch)
+            w->dropFetch = true;
+    }
+
+    void
+    Call::MagicPanic(Wavefront *w)
+    {
+        // Abort the simulation when any active lane hits an OpenCL-side
+        // assertion; src1[1] carries the assertion id.  Panics on the
+        // first active lane found.
+        const VectorMask &mask = w->get_pred();
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (mask[lane]) {
+                int src_val1 = src1.get<int>(w, lane, 1);
+                // bug fix: 'lane' is an int, so use %d (was "%s", which
+                // mismatched the argument type)
+                panic("OpenCL Code failed assertion #%d. Triggered by lane %d",
+                      src_val1, lane);
+            }
+        }
+    }
+
+    void
+    Call::calcAddr(Wavefront *w, GPUDynInstPtr m)
+    {
+        // The 64-bit address is split across two 32-bit pseudo-inst
+        // source slots: src1[1] holds the high word and src1[2] the low
+        // word.  Convert each word through uint32_t before widening: a
+        // direct (Addr) cast of a negative int sign-extends, and the
+        // extended low word would corrupt the high half of the composed
+        // address when OR'd in.
+        for (int lane = 0; lane < VSZ; ++lane) {
+            uint32_t addr_hi = (uint32_t)src1.get<int>(w, lane, 1);
+            uint32_t addr_lo = (uint32_t)src1.get<int>(w, lane, 2);
+
+            m->addr[lane] = ((Addr)addr_hi << 32) | (Addr)addr_lo;
+        }
+    }
+
+    void
+    Call::MagicAtomicNRAddGlobalU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst)
+    {
+        // Issue a no-return atomic add to global memory.  The target
+        // address comes from src1[1]/src1[2] (via calcAddr); the addend
+        // for each lane comes from src1[3].
+        GPUDynInstPtr m = gpuDynInst;
+
+        calcAddr(w, m);
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            ((int*)m->a_data)[lane] = src1.get<int>(w, lane, 3);
+        }
+
+        m->m_op = brigAtomicToMemOpType(Brig::BRIG_OPCODE_ATOMICNORET,
+                                        Brig::BRIG_ATOMIC_ADD);
+        m->m_type = U32::memType;
+        m->v_type = U32::vgprType;
+
+        m->exec_mask = w->execMask();
+        m->statusBitVector = 0;
+        m->equiv = 0;  // atomics don't have an equivalence class operand
+        m->n_reg = 1;
+        m->memoryOrder = Enums::MEMORY_ORDER_NONE;
+        m->scope = Enums::MEMORY_SCOPE_NONE;
+
+        m->simdId = w->simdId;
+        m->wfSlotId = w->wfSlotId;
+        m->wfDynId = w->wfDynId;
+        m->latency.init(&w->computeUnit->shader->tick_cnt);
+
+        // atomics count as both a read and a write on the global pipe
+        m->s_type = SEG_GLOBAL;
+        m->pipeId = GLBMEM_PIPE;
+        m->latency.set(w->computeUnit->shader->ticks(64));
+        w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+        w->outstanding_reqs_wr_gm++;
+        w->wr_gm_reqs_in_pipe--;
+        w->outstanding_reqs_rd_gm++;
+        w->rd_gm_reqs_in_pipe--;
+        w->outstanding_reqs++;
+        w->mem_reqs_in_pipe--;
+    }
+
+    void
+    Call::MagicAtomicNRAddGroupU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst)
+    {
+        // Issue a no-return atomic add for the group segment variant.
+        // NOTE(review): the addend is read from src1[1], which calcAddr
+        // also consumes as the address high word — the global variant
+        // reads the addend from src1[3]; confirm slot 1 is intended
+        // here.  Also note s_type is SEG_GLOBAL despite the GROUP name.
+        GPUDynInstPtr m = gpuDynInst;
+        calcAddr(w, m);
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            ((int*)m->a_data)[lane] = src1.get<int>(w, lane, 1);
+        }
+
+        m->m_op = brigAtomicToMemOpType(Brig::BRIG_OPCODE_ATOMICNORET,
+                                        Brig::BRIG_ATOMIC_ADD);
+        m->m_type = U32::memType;
+        m->v_type = U32::vgprType;
+
+        m->exec_mask = w->execMask();
+        m->statusBitVector = 0;
+        m->equiv = 0;  // atomics don't have an equivalence class operand
+        m->n_reg = 1;
+        m->memoryOrder = Enums::MEMORY_ORDER_NONE;
+        m->scope = Enums::MEMORY_SCOPE_NONE;
+
+        m->simdId = w->simdId;
+        m->wfSlotId = w->wfSlotId;
+        m->wfDynId = w->wfDynId;
+        m->latency.init(&w->computeUnit->shader->tick_cnt);
+
+        // atomics count as both a read and a write on the global pipe
+        m->s_type = SEG_GLOBAL;
+        m->pipeId = GLBMEM_PIPE;
+        m->latency.set(w->computeUnit->shader->ticks(64));
+        w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+        w->outstanding_reqs_wr_gm++;
+        w->wr_gm_reqs_in_pipe--;
+        w->outstanding_reqs_rd_gm++;
+        w->rd_gm_reqs_in_pipe--;
+        w->outstanding_reqs++;
+        w->mem_reqs_in_pipe--;
+    }
+
+    void
+    Call::MagicLoadGlobalU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst)
+    {
+        // Issue a 32-bit global load from the address encoded in
+        // src1[1]/src1[2].  NOTE: the destination register is never set
+        // (see the FIXME below), so the loaded value is not written back
+        // here — confirm how callers consume the result.
+        GPUDynInstPtr m = gpuDynInst;
+        // calculate the address
+        calcAddr(w, m);
+
+        m->m_op = Enums::MO_LD;
+        m->m_type = U32::memType;  //MemDataType::memType;
+        m->v_type = U32::vgprType; //DestDataType::vgprType;
+
+        m->exec_mask = w->execMask();
+        m->statusBitVector = 0;
+        m->equiv = 0;
+        m->n_reg = 1;
+        m->memoryOrder = Enums::MEMORY_ORDER_NONE;
+        m->scope = Enums::MEMORY_SCOPE_NONE;
+
+        // FIXME
+        //m->dst_reg = this->dest.regIndex();
+
+        m->simdId = w->simdId;
+        m->wfSlotId = w->wfSlotId;
+        m->wfDynId = w->wfDynId;
+        m->latency.init(&w->computeUnit->shader->tick_cnt);
+
+        m->s_type = SEG_GLOBAL;
+        m->pipeId = GLBMEM_PIPE;
+        m->latency.set(w->computeUnit->shader->ticks(1));
+        w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+        w->outstanding_reqs_rd_gm++;
+        w->rd_gm_reqs_in_pipe--;
+        w->outstanding_reqs++;
+        w->mem_reqs_in_pipe--;
+    }
+
+    void
+    Call::MagicXactCasLd(Wavefront *w)
+    {
+        // Register this wavefront (simdId, wfSlotId) in the compute
+        // unit's transactional-CAS load map, keyed by src1[1] of the
+        // first active lane.  Creates (and clears) the queue entry on
+        // first use of a key.
+        const VectorMask &mask = w->get_pred();
+        int src_val1 = 0;
+
+        for (int lane = 0; lane < VSZ; ++lane) {
+            if (mask[lane]) {
+                src_val1 = src1.get<int>(w, lane, 1);
+                break;
+            }
+        }
+
+        if (!w->computeUnit->xactCasLoadMap.count(src_val1)) {
+            w->computeUnit->xactCasLoadMap[src_val1] = ComputeUnit::waveQueue();
+            w->computeUnit->xactCasLoadMap[src_val1].waveIDQueue.clear();
+        }
+
+        w->computeUnit->xactCasLoadMap[src_val1].waveIDQueue
+            .push_back(ComputeUnit::waveIdentifier(w->simdId, w->wfSlotId));
+    }
+
+    void
+    Call::MagicMostSigThread(Wavefront *w)
+    {
+        // Write 1 to the highest-numbered active lane and 0 to every
+        // other active lane.
+        const VectorMask &active = w->get_pred();
+
+        bool is_most_significant = true;
+        for (int ln = VSZ - 1; ln >= 0; --ln) {
+            if (!active[ln])
+                continue;
+
+            dest.set<int>(w, ln, is_most_significant ? 1 : 0);
+            is_most_significant = false;
+        }
+    }
+
+    void
+    Call::MagicMostSigBroadcast(Wavefront *w)
+    {
+        // Broadcast src1[1] from the highest-numbered active lane to
+        // every active lane.
+        const VectorMask &active = w->get_pred();
+
+        int broadcast_val = 0;
+        bool have_val = false;
+
+        for (int ln = VSZ - 1; ln >= 0; --ln) {
+            if (!active[ln])
+                continue;
+
+            if (!have_val) {
+                broadcast_val = src1.get<int>(w, ln, 1);
+                have_val = true;
+            }
+
+            dest.set<int>(w, ln, broadcast_val);
+        }
+    }
+
+} // namespace HsailISA
diff --git a/src/arch/hsail/operand.cc b/src/arch/hsail/operand.cc
new file mode 100644
index 000000000..d0e6c5541
--- /dev/null
+++ b/src/arch/hsail/operand.cc
@@ -0,0 +1,449 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#include "arch/hsail/operand.hh"
+
+using namespace Brig;
+
+// Initialize this register operand from the BRIG operand at opOffset.
+// Returns false when the BRIG entry is not a register operand; on
+// success records the register number and bumps maxRegIdx so the
+// register file can later be sized to cover all indices seen.
+bool
+BaseRegOperand::init(unsigned opOffset, const BrigObject *obj,
+                     unsigned &maxRegIdx, char _regFileChar)
+{
+    regFileChar = _regFileChar;
+    const BrigOperand *brigOp = obj->getOperand(opOffset);
+
+    if (brigOp->kind != BRIG_KIND_OPERAND_REGISTER)
+        return false;
+
+    const BrigOperandRegister *brigRegOp = (const BrigOperandRegister*)brigOp;
+
+    regIdx = brigRegOp->regNum;
+
+    DPRINTF(GPUReg, "Operand: regNum: %d, kind: %d\n", regIdx,
+            brigRegOp->regKind);
+
+    // track the largest register index seen across all operands
+    maxRegIdx = std::max(maxRegIdx, regIdx);
+
+    return true;
+}
+
+// Initialize a call argument list operand from a BRIG code list. Each
+// 4-byte element of the list is an offset to a variable directive,
+// which is resolved to a StorageElement in the ARG segment of the
+// current code's storage map and collected into callArgs.
+void
+ListOperand::init(unsigned opOffset, const BrigObject *obj)
+{
+    const BrigOperand *brigOp = (const BrigOperand*)obj->getOperand(opOffset);
+
+    switch (brigOp->kind) {
+      case BRIG_KIND_OPERAND_CODE_LIST:
+        {
+            const BrigOperandCodeList *opList =
+                (const BrigOperandCodeList*)brigOp;
+
+            const Brig::BrigData *oprnd_data =
+                obj->getBrigBaseData(opList->elements);
+
+            // Note: for calls Dest list of operands could be size of 0.
+            elementCount = oprnd_data->byteCount / 4;
+
+            DPRINTF(GPUReg, "Operand Code List: # elements: %d\n",
+                    elementCount);
+
+            for (int i = 0; i < elementCount; ++i) {
+                // element i lives 4 * (i + 1) bytes past the BrigData
+                // header (the first word is the byte count)
+                unsigned *data_offset =
+                    (unsigned*)obj->getData(opList->elements + 4 * (i + 1));
+
+                const BrigDirectiveVariable *p =
+                    (const BrigDirectiveVariable*)obj->
+                    getCodeSectionEntry(*data_offset);
+
+                StorageElement *se = obj->currentCode->storageMap->
+                    findSymbol(BRIG_SEGMENT_ARG, p);
+
+                assert(se);
+                callArgs.push_back(se);
+            }
+        }
+        break;
+      default:
+        fatal("ListOperand: bad operand kind %d\n", brigOp->kind);
+    }
+}
+
+std::string
+ListOperand::disassemble()
+{
+ std::string res_str("");
+
+ for (auto it : callArgs) {
+ res_str += csprintf("%s ", it->name.c_str());
+ }
+
+ return res_str;
+}
+
+// Initialize a function reference operand from a BRIG code-ref. The
+// referenced code-section entry is treated as an executable directive
+// and its name string becomes func_name. fatal()s on any other kind.
+void
+FunctionRefOperand::init(unsigned opOffset, const BrigObject *obj)
+{
+    const BrigOperand *baseOp = obj->getOperand(opOffset);
+
+    if (baseOp->kind != BRIG_KIND_OPERAND_CODE_REF) {
+        fatal("FunctionRefOperand: bad operand kind %d\n", baseOp->kind);
+    }
+
+    const BrigOperandCodeRef *brigOp = (const BrigOperandCodeRef*)baseOp;
+
+    const BrigDirectiveExecutable *p =
+        (const BrigDirectiveExecutable*)obj->getCodeSectionEntry(brigOp->ref);
+
+    func_name = obj->getString(p->name);
+}
+
+std::string
+FunctionRefOperand::disassemble()
+{
+ DPRINTF(GPUReg, "Operand Func-ref name: %s\n", func_name);
+
+ return csprintf("%s", func_name);
+}
+
+// Initialize this register operand from element 'at' of a BRIG operand
+// list (used for vector operands). Returns false when the operand at
+// opOffset is not a list, or when element 'at' is not a register; on
+// success records the register number and bumps maxRegIdx.
+bool
+BaseRegOperand::init_from_vect(unsigned opOffset, const BrigObject *obj,
+                               int at, unsigned &maxRegIdx, char _regFileChar)
+{
+    regFileChar = _regFileChar;
+    const BrigOperand *brigOp = obj->getOperand(opOffset);
+
+    if (brigOp->kind != BRIG_KIND_OPERAND_OPERAND_LIST)
+        return false;
+
+
+    const Brig::BrigOperandOperandList *brigRegVecOp =
+        (const Brig::BrigOperandOperandList*)brigOp;
+
+    // element 'at' is 4 * (at + 1) bytes past the BrigData header
+    unsigned *data_offset =
+        (unsigned*)obj->getData(brigRegVecOp->elements + 4 * (at + 1));
+
+    const BrigOperand *p =
+        (const BrigOperand*)obj->getOperand(*data_offset);
+    if (p->kind != BRIG_KIND_OPERAND_REGISTER) {
+        return false;
+    }
+
+    const BrigOperandRegister *brigRegOp =(const BrigOperandRegister*)p;
+
+    regIdx = brigRegOp->regNum;
+
+    DPRINTF(GPUReg, "Operand: regNum: %d, kind: %d \n", regIdx,
+            brigRegOp->regKind);
+
+    maxRegIdx = std::max(maxRegIdx, regIdx);
+
+    return true;
+}
+
+// Initialize this register operand from a register-name string of the
+// form "$<fileChar><num>", e.g. "$s3". fatal()s when the string does
+// not have the expected prefix; on success records the register index
+// and bumps maxRegIdx so the register file can be sized.
+void
+BaseRegOperand::initWithStrOffset(unsigned strOffset, const BrigObject *obj,
+                                  unsigned &maxRegIdx, char _regFileChar)
+{
+    const char *name = obj->getString(strOffset);
+
+    // Validate the "$<fileChar>" prefix BEFORE dereferencing name + 2;
+    // the original parsed first, which read past the end of a
+    // malformed (shorter than two characters) register name.
+    if (name[0] != '$' || name[1] != _regFileChar) {
+        fatal("register operand parse error on \"%s\"\n", name);
+    }
+
+    // remember which register file this operand targets, consistent
+    // with the other init paths (init / init_from_vect)
+    regFileChar = _regFileChar;
+
+    char *endptr;
+    regIdx = strtoul(name + 2, &endptr, 10);
+
+    maxRegIdx = std::max(maxRegIdx, regIdx);
+}
+
+unsigned SRegOperand::maxRegIdx;
+unsigned DRegOperand::maxRegIdx;
+unsigned CRegOperand::maxRegIdx;
+
+std::string
+SRegOperand::disassemble()
+{
+ return csprintf("$s%d", regIdx);
+}
+
+std::string
+DRegOperand::disassemble()
+{
+ return csprintf("$d%d", regIdx);
+}
+
+std::string
+CRegOperand::disassemble()
+{
+ return csprintf("$c%d", regIdx);
+}
+
+// Inspect the BRIG operand at opOffset and return a descriptor of the
+// register kind (for register / register-list / register-indexed
+// address operands) or data type (for symbol-only address operands)
+// it implies. fatal()s on any operand kind it does not understand.
+BrigRegOperandInfo
+findRegDataType(unsigned opOffset, const BrigObject *obj)
+{
+    const BrigOperand *baseOp = obj->getOperand(opOffset);
+
+    switch (baseOp->kind) {
+      case BRIG_KIND_OPERAND_REGISTER:
+        {
+            const BrigOperandRegister *op = (BrigOperandRegister*)baseOp;
+
+            return BrigRegOperandInfo((BrigKind16_t)baseOp->kind,
+                                      (BrigRegisterKind)op->regKind);
+        }
+        break;
+
+      case BRIG_KIND_OPERAND_OPERAND_LIST:
+        {
+            const BrigOperandOperandList *op =
+                (BrigOperandOperandList*)baseOp;
+            const BrigData *data_p = (BrigData*)obj->getData(op->elements);
+
+
+            // scan every element; the register kind reported is that of
+            // the LAST register or constant seen in the list
+            int num_operands = 0;
+            BrigRegisterKind reg_kind = (BrigRegisterKind)0;
+            for (int offset = 0; offset < data_p->byteCount; offset += 4) {
+                const BrigOperand *op_p = (const BrigOperand *)
+                    obj->getOperand(((int *)data_p->bytes)[offset/4]);
+
+                if (op_p->kind == BRIG_KIND_OPERAND_REGISTER) {
+                    const BrigOperandRegister *brigRegOp =
+                        (const BrigOperandRegister*)op_p;
+                    reg_kind = (BrigRegisterKind)brigRegOp->regKind;
+                } else if (op_p->kind == BRIG_KIND_OPERAND_CONSTANT_BYTES) {
+                    // constants are classified by payload size:
+                    // 4 bytes -> single, 8 bytes -> double
+                    uint16_t num_bytes =
+                        ((Brig::BrigOperandConstantBytes*)op_p)->base.byteCount
+                        - sizeof(BrigBase);
+                    if (num_bytes == sizeof(uint32_t)) {
+                        reg_kind = BRIG_REGISTER_KIND_SINGLE;
+                    } else if (num_bytes == sizeof(uint64_t)) {
+                        reg_kind = BRIG_REGISTER_KIND_DOUBLE;
+                    } else {
+                        fatal("OperandList: bad operand size %d\n", num_bytes);
+                    }
+                } else {
+                    fatal("OperandList: bad operand kind %d\n", op_p->kind);
+                }
+
+                num_operands++;
+            }
+            assert(baseOp->kind == BRIG_KIND_OPERAND_OPERAND_LIST);
+
+            return BrigRegOperandInfo((BrigKind16_t)baseOp->kind, reg_kind);
+        }
+        break;
+
+      case BRIG_KIND_OPERAND_ADDRESS:
+        {
+            const BrigOperandAddress *op = (BrigOperandAddress*)baseOp;
+
+            if (!op->reg) {
+                // symbol-only address: report the symbol's data type
+                BrigType type = BRIG_TYPE_NONE;
+
+                if (op->symbol) {
+                    const BrigDirective *dir = (BrigDirective*)
+                        obj->getCodeSectionEntry(op->symbol);
+
+                    assert(dir->kind == BRIG_KIND_DIRECTIVE_VARIABLE);
+
+                    const BrigDirectiveVariable *sym =
+                       (const BrigDirectiveVariable*)dir;
+
+                    type = (BrigType)sym->type;
+                }
+                return BrigRegOperandInfo(BRIG_KIND_OPERAND_ADDRESS,
+                                          (BrigType)type);
+            } else {
+                // register-indexed address: report the index register's kind
+                const BrigOperandAddress *b = (const BrigOperandAddress*)baseOp;
+                const BrigOperand *reg = obj->getOperand(b->reg);
+                const BrigOperandRegister *rop = (BrigOperandRegister*)reg;
+
+                return BrigRegOperandInfo(BRIG_KIND_OPERAND_REGISTER,
+                                          (BrigRegisterKind)rop->regKind);
+            }
+        }
+        break;
+
+      default:
+        fatal("AddrOperand: bad operand kind %d\n", baseOp->kind);
+        break;
+    }
+}
+
+// Resolve the symbol referenced by an address operand into a
+// StorageElement from the current code's storage map, and record the
+// symbol's name. Non-arg segments are looked up by name; the arg
+// segment is looked up by directive pointer (see the long note below
+// for why the name alone is ambiguous there).
+void
+AddrOperandBase::parseAddr(const BrigOperandAddress *op, const BrigObject *obj)
+{
+    assert(op->base.kind == BRIG_KIND_OPERAND_ADDRESS);
+
+    const BrigDirective *d =
+        (BrigDirective*)obj->getCodeSectionEntry(op->symbol);
+
+    assert(d->kind == BRIG_KIND_DIRECTIVE_VARIABLE);
+    const BrigDirectiveVariable *sym = (BrigDirectiveVariable*)d;
+    name = obj->getString(sym->name);
+
+    if (sym->segment != BRIG_SEGMENT_ARG) {
+        storageElement =
+            obj->currentCode->storageMap->findSymbol(sym->segment, name);
+        assert(storageElement);
+        offset = 0;
+    } else {
+        // sym->name does not work for BRIG_SEGMENT_ARG for the following case:
+        //
+        //     void foo(int a);
+        //     void bar(double a);
+        //
+        //     foo(...) --> arg_u32 %param_p0;
+        //                  st_arg_u32 $s0, [%param_p0];
+        //                  call &foo (%param_p0);
+        //     bar(...) --> arg_f64 %param_p0;
+        //                  st_arg_u64 $d0, [%param_p0];
+        //                  call &foo (%param_p0);
+        //
+        //  Both functions use the same variable name (param_p0)!!!
+        //
+        //  Maybe this is a bug in the compiler (I don't know).
+        //
+        // Solution:
+        // Use directive pointer (BrigDirectiveVariable) to differentiate 2
+        // versions of param_p0.
+        //
+        // Note this solution is kind of stupid, because we are pulling stuff
+        // out of the brig binary via the directive pointer and putting it into
+        // the symbol table, but now we are indexing the symbol table by the
+        // brig directive pointer! It makes the symbol table sort of pointless.
+        // But I don't want to mess with the rest of the infrastructure, so
+        // let's go with this for now.
+        //
+        // When we update the compiler again, we should see if this problem goes
+        // away. If so, we can fold some of this functionality into the code for
+        // kernel arguments. If not, maybe we can index the symbol name on a
+        // hash of the variable AND function name
+        storageElement = obj->currentCode->
+            storageMap->findSymbol((Brig::BrigSegment)sym->segment, sym);
+
+        assert(storageElement);
+    }
+}
+
+uint64_t
+AddrOperandBase::calcUniformBase()
+{
+ // start with offset, will be 0 if not specified
+ uint64_t address = offset;
+
+ // add in symbol value if specified
+ if (storageElement) {
+ address += storageElement->offset;
+ }
+
+ return address;
+}
+
+std::string
+AddrOperandBase::disassemble(std::string reg_disassembly)
+{
+ std::string disasm;
+
+ if (offset || reg_disassembly != "") {
+ disasm += "[";
+
+ if (reg_disassembly != "") {
+ disasm += reg_disassembly;
+
+ if (offset > 0) {
+ disasm += "+";
+ }
+ }
+
+ if (offset) {
+ disasm += csprintf("%d", offset);
+ }
+
+ disasm += "]";
+ } else if (name) {
+ disasm += csprintf("[%s]", name);
+ }
+
+ return disasm;
+}
+
+// Initialize a register-less address operand ([symbol+offset]).
+// Resolves the symbol via parseAddr() and then assembles the 64-bit
+// constant offset from its hi/lo 32-bit halves (overwriting the 0
+// parseAddr() may have stored).
+void
+NoRegAddrOperand::init(unsigned opOffset, const BrigObject *obj)
+{
+    const BrigOperand *baseOp = obj->getOperand(opOffset);
+
+    if (baseOp->kind == BRIG_KIND_OPERAND_ADDRESS) {
+        BrigOperandAddress *addrOp = (BrigOperandAddress*)baseOp;
+        parseAddr(addrOp, obj);
+        offset = (uint64_t(addrOp->offset.hi) << 32) |
+            uint64_t(addrOp->offset.lo);
+    } else {
+        fatal("NoRegAddrOperand: bad operand kind %d\n", baseOp->kind);
+    }
+
+}
+
+std::string
+NoRegAddrOperand::disassemble()
+{
+ return AddrOperandBase::disassemble(std::string(""));
+}
+
+// Resolve a code-ref operand into a Label. The referenced code-section
+// entry must be a label directive; refLabel() interns it in the current
+// code object so branch targets can be fixed up once defined.
+void
+LabelOperand::init(unsigned opOffset, const BrigObject *obj)
+{
+    const BrigOperandCodeRef *op =
+        (const BrigOperandCodeRef*)obj->getOperand(opOffset);
+
+    assert(op->base.kind == BRIG_KIND_OPERAND_CODE_REF);
+
+    const BrigDirective *dir =
+        (const BrigDirective*)obj->getCodeSectionEntry(op->ref);
+
+    assert(dir->kind == BRIG_KIND_DIRECTIVE_LABEL);
+    label = obj->currentCode->refLabel((BrigDirectiveLabel*)dir, obj);
+}
+
+uint32_t
+LabelOperand::getTarget(Wavefront *w, int lane)
+{
+ return label->get();
+}
+
+std::string
+LabelOperand::disassemble()
+{
+ return label->name;
+}
diff --git a/src/arch/hsail/operand.hh b/src/arch/hsail/operand.hh
new file mode 100644
index 000000000..e3d275b10
--- /dev/null
+++ b/src/arch/hsail/operand.hh
@@ -0,0 +1,768 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#ifndef __ARCH_HSAIL_OPERAND_HH__
+#define __ARCH_HSAIL_OPERAND_HH__
+
+/**
+ * @file operand.hh
+ *
+ * Defines classes encapsulating HSAIL instruction operands.
+ */
+
+#include <string>
+
+#include "arch/hsail/Brig.h"
+#include "base/trace.hh"
+#include "base/types.hh"
+#include "debug/GPUReg.hh"
+#include "enums/RegisterType.hh"
+#include "gpu-compute/brig_object.hh"
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/hsail_code.hh"
+#include "gpu-compute/shader.hh"
+#include "gpu-compute/vector_register_file.hh"
+#include "gpu-compute/wavefront.hh"
+
+class Label;
+class StorageElement;
+
+// Abstract base for all HSAIL instruction operands. Records whether
+// the operand lives in a vector/scalar/condition register (RT_NONE for
+// immediates and addresses) and the operand's size in bytes. Note that
+// regIndex() is non-virtual and shadowed by register subclasses.
+class BaseOperand
+{
+  public:
+    Enums::RegisterType registerType;
+    uint32_t regOperandSize;
+    BaseOperand() { registerType = Enums::RT_NONE; regOperandSize = 0; }
+    bool isVectorRegister() { return registerType == Enums::RT_VECTOR; }
+    bool isScalarRegister() { return registerType == Enums::RT_SCALAR; }
+    bool isCondRegister() { return registerType == Enums::RT_CONDITION; }
+    unsigned int regIndex() { return 0; }
+    uint32_t opSize() { return regOperandSize; }
+    virtual ~BaseOperand() { }
+};
+
+// Lightweight descriptor of a decoded BRIG operand: the operand kind
+// plus either its register kind (for register operands) or its data
+// type (for address operands). All members are explicitly initialized
+// in every constructor — the original left 'type' or 'regKind'
+// uninitialized depending on which constructor ran, so copies of the
+// struct carried indeterminate bytes.
+class BrigRegOperandInfo
+{
+  public:
+    Brig::BrigKind16_t kind;
+    Brig::BrigType type;
+    Brig::BrigRegisterKind regKind;
+
+    BrigRegOperandInfo(Brig::BrigKind16_t _kind,
+                       Brig::BrigRegisterKind _regKind)
+        : kind(_kind), type(Brig::BRIG_TYPE_NONE), regKind(_regKind)
+    {
+    }
+
+    BrigRegOperandInfo(Brig::BrigKind16_t _kind, Brig::BrigType _type)
+        : kind(_kind), type(_type), regKind(Brig::BrigRegisterKind(0))
+    {
+    }
+
+    BrigRegOperandInfo() : kind(Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES),
+                           type(Brig::BRIG_TYPE_NONE),
+                           regKind(Brig::BrigRegisterKind(0))
+    {
+    }
+};
+
+BrigRegOperandInfo findRegDataType(unsigned opOffset, const BrigObject *obj);
+
+class BaseRegOperand : public BaseOperand
+{
+ public:
+ unsigned regIdx;
+ char regFileChar;
+
+ bool init(unsigned opOffset, const BrigObject *obj,
+ unsigned &maxRegIdx, char _regFileChar);
+
+ bool init_from_vect(unsigned opOffset, const BrigObject *obj, int at,
+ unsigned &maxRegIdx, char _regFileChar);
+
+ void initWithStrOffset(unsigned strOffset, const BrigObject *obj,
+ unsigned &maxRegIdx, char _regFileChar);
+ unsigned int regIndex() { return regIdx; }
+};
+
+class SRegOperand : public BaseRegOperand
+{
+ public:
+ static unsigned maxRegIdx;
+
+ bool
+ init(unsigned opOffset, const BrigObject *obj)
+ {
+ regOperandSize = sizeof(uint32_t);
+ registerType = Enums::RT_VECTOR;
+
+ return BaseRegOperand::init(opOffset, obj, maxRegIdx, 's');
+ }
+
+ bool
+ init_from_vect(unsigned opOffset, const BrigObject *obj, int at)
+ {
+ regOperandSize = sizeof(uint32_t);
+ registerType = Enums::RT_VECTOR;
+
+ return BaseRegOperand::init_from_vect(opOffset, obj, at, maxRegIdx,
+ 's');
+ }
+
+ void
+ initWithStrOffset(unsigned strOffset, const BrigObject *obj)
+ {
+ regOperandSize = sizeof(uint32_t);
+ registerType = Enums::RT_VECTOR;
+
+ return BaseRegOperand::initWithStrOffset(strOffset, obj, maxRegIdx,
+ 's');
+ }
+
+ template<typename OperandType>
+ OperandType
+ get(Wavefront *w, int lane)
+ {
+ assert(sizeof(OperandType) <= sizeof(uint32_t));
+ assert(regIdx < w->maxSpVgprs);
+ // if OperandType is smaller than 32-bit, we truncate the value
+ OperandType ret;
+ uint32_t vgprIdx;
+
+ switch (sizeof(OperandType)) {
+ case 1: // 1 byte operand
+ vgprIdx = w->remap(regIdx, 1, 1);
+ ret = (w->computeUnit->vrf[w->simdId]->
+ read<uint32_t>(vgprIdx, lane)) & 0xff;
+ break;
+ case 2: // 2 byte operand
+ vgprIdx = w->remap(regIdx, 2, 1);
+ ret = (w->computeUnit->vrf[w->simdId]->
+ read<uint32_t>(vgprIdx, lane)) & 0xffff;
+ break;
+ case 4: // 4 byte operand
+ vgprIdx = w->remap(regIdx,sizeof(OperandType), 1);
+ ret = w->computeUnit->vrf[w->simdId]->
+ read<OperandType>(vgprIdx, lane);
+ break;
+ default:
+ panic("Bad OperandType\n");
+ break;
+ }
+
+ return (OperandType)ret;
+ }
+
+ // special get method for compatibility with LabelOperand
+ uint32_t
+ getTarget(Wavefront *w, int lane)
+ {
+ return get<uint32_t>(w, lane);
+ }
+
+ template<typename OperandType>
+ void set(Wavefront *w, int lane, OperandType &val);
+ std::string disassemble();
+};
+
+template<typename OperandType>
+void
+SRegOperand::set(Wavefront *w, int lane, OperandType &val)
+{
+ DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: $s%d <- %d\n",
+ w->computeUnit->cu_id, w->simdId, w->wfSlotId, lane, regIdx, val);
+
+ assert(sizeof(OperandType) == sizeof(uint32_t));
+ assert(regIdx < w->maxSpVgprs);
+ uint32_t vgprIdx = w->remap(regIdx, sizeof(OperandType), 1);
+ w->computeUnit->vrf[w->simdId]->write<OperandType>(vgprIdx,val,lane);
+}
+
+// Specialization for writing a 64-bit value into a 32-bit $s register:
+// the value is truncated to its low 32 bits by the write<uint32_t>()
+// call. NOTE(review): confirm callers rely on this truncation being
+// silent rather than expecting the general set() size assertion.
+template<>
+inline void
+SRegOperand::set(Wavefront *w, int lane, uint64_t &val)
+{
+    DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: $s%d <- %d\n",
+            w->computeUnit->cu_id, w->simdId, w->wfSlotId, lane, regIdx, val);
+
+    assert(regIdx < w->maxSpVgprs);
+    uint32_t vgprIdx = w->remap(regIdx, sizeof(uint32_t), 1);
+    w->computeUnit->vrf[w->simdId]->write<uint32_t>(vgprIdx, val, lane);
+}
+
+class DRegOperand : public BaseRegOperand
+{
+ public:
+ static unsigned maxRegIdx;
+
+ bool
+ init(unsigned opOffset, const BrigObject *obj)
+ {
+ regOperandSize = sizeof(uint64_t);
+ registerType = Enums::RT_VECTOR;
+
+ return BaseRegOperand::init(opOffset, obj, maxRegIdx, 'd');
+ }
+
+ bool
+ init_from_vect(unsigned opOffset, const BrigObject *obj, int at)
+ {
+ regOperandSize = sizeof(uint64_t);
+ registerType = Enums::RT_VECTOR;
+
+ return BaseRegOperand::init_from_vect(opOffset, obj, at, maxRegIdx,
+ 'd');
+ }
+
+ void
+ initWithStrOffset(unsigned strOffset, const BrigObject *obj)
+ {
+ regOperandSize = sizeof(uint64_t);
+ registerType = Enums::RT_VECTOR;
+
+ return BaseRegOperand::initWithStrOffset(strOffset, obj, maxRegIdx,
+ 'd');
+ }
+
+ template<typename OperandType>
+ OperandType
+ get(Wavefront *w, int lane)
+ {
+ assert(sizeof(OperandType) <= sizeof(uint64_t));
+ // TODO: this check is valid only for HSAIL
+ assert(regIdx < w->maxDpVgprs);
+ uint32_t vgprIdx = w->remap(regIdx, sizeof(OperandType), 1);
+
+ return w->computeUnit->vrf[w->simdId]->read<OperandType>(vgprIdx,lane);
+ }
+
+ template<typename OperandType>
+ void
+ set(Wavefront *w, int lane, OperandType &val)
+ {
+ DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: $d%d <- %d\n",
+ w->computeUnit->cu_id, w->simdId, w->wfSlotId, lane, regIdx,
+ val);
+
+ assert(sizeof(OperandType) <= sizeof(uint64_t));
+ // TODO: this check is valid only for HSAIL
+ assert(regIdx < w->maxDpVgprs);
+ uint32_t vgprIdx = w->remap(regIdx, sizeof(OperandType), 1);
+ w->computeUnit->vrf[w->simdId]->write<OperandType>(vgprIdx,val,lane);
+ }
+
+ std::string disassemble();
+};
+
+class CRegOperand : public BaseRegOperand
+{
+ public:
+ static unsigned maxRegIdx;
+
+ bool
+ init(unsigned opOffset, const BrigObject *obj)
+ {
+ regOperandSize = sizeof(uint8_t);
+ registerType = Enums::RT_CONDITION;
+
+ return BaseRegOperand::init(opOffset, obj, maxRegIdx, 'c');
+ }
+
+ bool
+ init_from_vect(unsigned opOffset, const BrigObject *obj, int at)
+ {
+ regOperandSize = sizeof(uint8_t);
+ registerType = Enums::RT_CONDITION;
+
+ return BaseRegOperand::init_from_vect(opOffset, obj, at, maxRegIdx,
+ 'c');
+ }
+
+ void
+ initWithStrOffset(unsigned strOffset, const BrigObject *obj)
+ {
+ regOperandSize = sizeof(uint8_t);
+ registerType = Enums::RT_CONDITION;
+
+ return BaseRegOperand::initWithStrOffset(strOffset, obj, maxRegIdx,
+ 'c');
+ }
+
+ template<typename OperandType>
+ OperandType
+ get(Wavefront *w, int lane)
+ {
+ assert(regIdx < w->condRegState->numRegs());
+
+ return w->condRegState->read<OperandType>((int)regIdx, lane);
+ }
+
+ template<typename OperandType>
+ void
+ set(Wavefront *w, int lane, OperandType &val)
+ {
+ DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: $c%d <- %d\n",
+ w->computeUnit->cu_id, w->simdId, w->wfSlotId, lane, regIdx,
+ val);
+
+ assert(regIdx < w->condRegState->numRegs());
+ w->condRegState->write<OperandType>(regIdx,lane,val);
+ }
+
+ std::string disassemble();
+};
+
+template<typename T>
+class ImmOperand : public BaseOperand
+{
+ public:
+ T bits;
+
+ bool init(unsigned opOffset, const BrigObject *obj);
+ bool init_from_vect(unsigned opOffset, const BrigObject *obj, int at);
+ std::string disassemble();
+
+ template<typename OperandType>
+ OperandType
+ get()
+ {
+ assert(sizeof(OperandType) <= sizeof(T));
+
+ return *(OperandType*)&bits;
+ }
+
+ // This version of get() takes a WF* and a lane id for
+ // compatibility with the register-based get() methods.
+ template<typename OperandType>
+ OperandType
+ get(Wavefront *w, int lane)
+ {
+ return get<OperandType>();
+ }
+};
+
+// Initialize this immediate operand from the BRIG operand at opOffset.
+// Returns true for constant-bytes operands (payload copied into
+// 'bits') and for WAVESIZE (which materializes as the wavefront width
+// VSZ); returns false for any other operand kind so callers can try a
+// different decoding.
+template<typename T>
+bool
+ImmOperand<T>::init(unsigned opOffset, const BrigObject *obj)
+{
+    const Brig::BrigOperand *brigOp = obj->getOperand(opOffset);
+
+    switch (brigOp->kind) {
+      // this is immediate operand
+      case Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES:
+        {
+            DPRINTF(GPUReg, "sizeof(T): %lu, byteCount: %d\n", sizeof(T),
+                    brigOp->byteCount);
+
+            auto cbptr = (Brig::BrigOperandConstantBytes*)brigOp;
+
+            // the payload starts 4 bytes in, past the BrigData byteCount
+            bits = *((T*)(obj->getData(cbptr->bytes + 4)));
+
+            return true;
+        }
+        break;
+
+      case Brig::BRIG_KIND_OPERAND_WAVESIZE:
+        bits = VSZ;
+        return true;
+
+      default:
+        return false;
+    }
+}
+
+template <typename T>
+bool
+ImmOperand<T>::init_from_vect(unsigned opOffset, const BrigObject *obj, int at)
+{
+ const Brig::BrigOperand *brigOp = obj->getOperand(opOffset);
+
+ if (brigOp->kind != Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
+ return false;
+ }
+
+
+ const Brig::BrigOperandOperandList *brigVecOp =
+ (const Brig::BrigOperandOperandList *)brigOp;
+
+ unsigned *data_offset =
+ (unsigned *)obj->getData(brigVecOp->elements + 4 * (at + 1));
+
+ const Brig::BrigOperand *p =
+ (const Brig::BrigOperand *)obj->getOperand(*data_offset);
+
+ if (p->kind != Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES) {
+ return false;
+ }
+
+ return init(*data_offset, obj);
+}
+template<typename T>
+std::string
+ImmOperand<T>::disassemble()
+{
+ return csprintf("0x%08x", bits);
+}
+
+template<typename RegOperand, typename T>
+class RegOrImmOperand : public BaseOperand
+{
+ private:
+ bool is_imm;
+
+ public:
+ void setImm(const bool value) { is_imm = value; }
+
+ ImmOperand<T> imm_op;
+ RegOperand reg_op;
+
+ RegOrImmOperand() { is_imm = false; }
+ void init(unsigned opOffset, const BrigObject *obj);
+ void init_from_vect(unsigned opOffset, const BrigObject *obj, int at);
+ std::string disassemble();
+
+ template<typename OperandType>
+ OperandType
+ get(Wavefront *w, int lane)
+ {
+ return is_imm ? imm_op.template get<OperandType>() :
+ reg_op.template get<OperandType>(w, lane);
+ }
+
+ uint32_t
+ opSize()
+ {
+ if (!is_imm) {
+ return reg_op.opSize();
+ }
+
+ return 0;
+ }
+
+ bool
+ isVectorRegister()
+ {
+ if (!is_imm) {
+ return reg_op.registerType == Enums::RT_VECTOR;
+ }
+ return false;
+ }
+
+ bool
+ isCondRegister()
+ {
+ if (!is_imm) {
+ return reg_op.registerType == Enums::RT_CONDITION;
+ }
+
+ return false;
+ }
+
+ bool
+ isScalarRegister()
+ {
+ if (!is_imm) {
+ return reg_op.registerType == Enums::RT_SCALAR;
+ }
+
+ return false;
+ }
+
+ unsigned int
+ regIndex()
+ {
+ if (!is_imm) {
+ return reg_op.regIndex();
+ }
+ return 0;
+ }
+};
+
+// Decode the BRIG operand at opOffset as either a register or an
+// immediate: the register form is tried first, then the immediate
+// form; is_imm records which succeeded. fatal()s when it is neither.
+template<typename RegOperand, typename T>
+void
+RegOrImmOperand<RegOperand, T>::init(unsigned opOffset, const BrigObject *obj)
+{
+    is_imm = false;
+
+    if (reg_op.init(opOffset, obj)) {
+        return;
+    }
+
+    if (imm_op.init(opOffset, obj)) {
+        is_imm = true;
+        return;
+    }
+
+    fatal("RegOrImmOperand::init(): bad operand kind %d\n",
+          obj->getOperand(opOffset)->kind);
+}
+
+// Decode element 'at' of a BRIG operand list as either a register or
+// an immediate: the register form is tried first, then the immediate
+// form; is_imm records which succeeded. fatal()s when it is neither.
+template<typename RegOperand, typename T>
+void
+RegOrImmOperand<RegOperand, T>::init_from_vect(unsigned opOffset,
+                                               const BrigObject *obj, int at)
+{
+    if (reg_op.init_from_vect(opOffset, obj, at)) {
+        is_imm = false;
+
+        return;
+    }
+
+    if (imm_op.init_from_vect(opOffset, obj, at)) {
+        is_imm = true;
+
+        return;
+    }
+
+    // name the actual failing entry point (was "init()"), so parse
+    // failures on vector elements are not misattributed
+    fatal("RegOrImmOperand::init_from_vect(): bad operand kind %d\n",
+          obj->getOperand(opOffset)->kind);
+}
+
+template<typename RegOperand, typename T>
+std::string
+RegOrImmOperand<RegOperand, T>::disassemble()
+{
+ return is_imm ? imm_op.disassemble() : reg_op.disassemble();
+}
+
+typedef RegOrImmOperand<SRegOperand, uint32_t> SRegOrImmOperand;
+typedef RegOrImmOperand<DRegOperand, uint64_t> DRegOrImmOperand;
+typedef RegOrImmOperand<CRegOperand, bool> CRegOrImmOperand;
+
+// Common base for address operands ([reg+offset] or [symbol+offset]).
+// Holds the constant offset, the symbol name (if any), and the
+// resolved storage element; subclasses compute per-lane effective
+// addresses via calcVector()/calcLane().
+class AddrOperandBase : public BaseOperand
+{
+  protected:
+    // helper function for init()
+    void parseAddr(const Brig::BrigOperandAddress *op, const BrigObject *obj);
+
+    // helper function for disassemble()
+    std::string disassemble(std::string reg_disassembly);
+    uint64_t calcUniformBase();
+
+  public:
+    virtual void calcVector(Wavefront *w, uint64_t *addrVec) = 0;
+    virtual uint64_t calcLane(Wavefront *w, int lane=0) = 0;
+
+    uint64_t offset;
+    const char *name = nullptr;
+    // NOTE(review): storageElement has no in-class initializer; it is
+    // set by parseAddr() or subclass init() — confirm every path
+    // assigns it before calcUniformBase() reads it.
+    StorageElement *storageElement;
+};
+
+template<typename RegOperandType>
+class RegAddrOperand : public AddrOperandBase
+{
+ public:
+ RegOperandType reg;
+ void init(unsigned opOffset, const BrigObject *obj);
+ uint64_t calcUniform();
+ void calcVector(Wavefront *w, uint64_t *addrVec);
+ uint64_t calcLane(Wavefront *w, int lane=0);
+ uint32_t opSize() { return reg.opSize(); }
+ bool isVectorRegister() { return reg.registerType == Enums::RT_VECTOR; }
+ bool isCondRegister() { return reg.registerType == Enums::RT_CONDITION; }
+ bool isScalarRegister() { return reg.registerType == Enums::RT_SCALAR; }
+ unsigned int regIndex() { return reg.regIndex(); }
+ std::string disassemble();
+};
+
+// Initialize a register-based address operand ([reg+offset]) from the
+// BRIG address operand at opOffset. Assembles the 64-bit constant
+// offset from its hi/lo halves and sizes the embedded register by its
+// register-file character ('s' = 32-bit, 'd' = 64-bit).
+template<typename RegOperandType>
+void
+RegAddrOperand<RegOperandType>::init(unsigned opOffset, const BrigObject *obj)
+{
+    using namespace Brig;
+
+    const BrigOperand *baseOp = obj->getOperand(opOffset);
+
+    switch (baseOp->kind) {
+      case BRIG_KIND_OPERAND_ADDRESS:
+        {
+            const BrigOperandAddress *op = (BrigOperandAddress*)baseOp;
+            storageElement = nullptr;
+
+            offset = (uint64_t(op->offset.hi) << 32) | uint64_t(op->offset.lo);
+            reg.init(op->reg, obj);
+
+            if (reg.regFileChar == 's') {
+                reg.regOperandSize = sizeof(uint32_t);
+                registerType = Enums::RT_VECTOR;
+            }
+            else if (reg.regFileChar == 'd') {
+                reg.regOperandSize = sizeof(uint64_t);
+                registerType = Enums::RT_VECTOR;
+            }
+        }
+        break;
+
+      default:
+        fatal("RegAddrOperand: bad operand kind %d\n", baseOp->kind);
+        break;
+    }
+}
+
+template<typename RegOperandType>
+uint64_t
+RegAddrOperand<RegOperandType>::calcUniform()
+{
+ fatal("can't do calcUniform() on register-based address\n");
+
+ return 0;
+}
+
+template<typename RegOperandType>
+void
+RegAddrOperand<RegOperandType>::calcVector(Wavefront *w, uint64_t *addrVec)
+{
+ Addr address = calcUniformBase();
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (w->execMask(lane)) {
+ if (reg.regFileChar == 's') {
+ addrVec[lane] = address + reg.template get<uint32_t>(w, lane);
+ } else {
+ addrVec[lane] = address + reg.template get<Addr>(w, lane);
+ }
+ }
+ }
+}
+
+template<typename RegOperandType>
+uint64_t
+RegAddrOperand<RegOperandType>::calcLane(Wavefront *w, int lane)
+{
+ Addr address = calcUniformBase();
+
+ return address + reg.template get<Addr>(w, lane);
+}
+
+template<typename RegOperandType>
+std::string
+RegAddrOperand<RegOperandType>::disassemble()
+{
+ return AddrOperandBase::disassemble(reg.disassemble());
+}
+
+typedef RegAddrOperand<SRegOperand> SRegAddrOperand;
+typedef RegAddrOperand<DRegOperand> DRegAddrOperand;
+
+class NoRegAddrOperand : public AddrOperandBase
+{
+ public:
+ void init(unsigned opOffset, const BrigObject *obj);
+ uint64_t calcUniform();
+ void calcVector(Wavefront *w, uint64_t *addrVec);
+ uint64_t calcLane(Wavefront *w, int lane=0);
+ std::string disassemble();
+};
+
+inline uint64_t
+NoRegAddrOperand::calcUniform()
+{
+ return AddrOperandBase::calcUniformBase();
+}
+
+inline uint64_t
+NoRegAddrOperand::calcLane(Wavefront *w, int lane)
+{
+ return calcUniform();
+}
+
+inline void
+NoRegAddrOperand::calcVector(Wavefront *w, uint64_t *addrVec)
+{
+ uint64_t address = calcUniformBase();
+
+ for (int lane = 0; lane < VSZ; ++lane)
+ addrVec[lane] = address;
+}
+
+class LabelOperand : public BaseOperand
+{
+ public:
+ Label *label;
+
+ void init(unsigned opOffset, const BrigObject *obj);
+ std::string disassemble();
+
+ // special get method for compatibility with SRegOperand
+ uint32_t getTarget(Wavefront *w, int lane);
+
+};
+
+class ListOperand : public BaseOperand
+{
+ public:
+ int elementCount;
+ std::vector<StorageElement*> callArgs;
+
+ int
+ getSrcOperand(int idx)
+ {
+ DPRINTF(GPUReg, "getSrcOperand, idx: %d, sz_args: %d\n", idx,
+ callArgs.size());
+
+ return callArgs.at(idx)->offset;
+ }
+
+ void init(unsigned opOffset, const BrigObject *obj);
+
+ std::string disassemble();
+
+ template<typename OperandType>
+ OperandType
+ get(Wavefront *w, int lane, int arg_idx)
+ {
+ return w->readCallArgMem<OperandType>(lane, getSrcOperand(arg_idx));
+ }
+
+ template<typename OperandType>
+ void
+ set(Wavefront *w, int lane, OperandType val)
+ {
+ w->writeCallArgMem<OperandType>(lane, getSrcOperand(0), val);
+ DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: arg[%d] <- %d\n",
+ w->computeUnit->cu_id, w->simdId, w->wfSlotId, lane,
+ getSrcOperand(0), val);
+ }
+};
+
+class FunctionRefOperand : public BaseOperand
+{
+ public:
+ const char *func_name;
+
+ void init(unsigned opOffset, const BrigObject *obj);
+ std::string disassemble();
+};
+
+#endif // __ARCH_HSAIL_OPERAND_HH__
diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py
new file mode 100644
index 000000000..bd95f6335
--- /dev/null
+++ b/src/gpu-compute/GPU.py
@@ -0,0 +1,310 @@
+#
+# Copyright (c) 2015 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Author: Steve Reinhardt
+#
+
+from ClockedObject import ClockedObject
+from Device import DmaDevice
+from m5.defines import buildEnv
+from m5.params import *
+from m5.proxy import *
+from m5.SimObject import SimObject
+from MemObject import MemObject
+from Process import EmulatedDriver
+from Bridge import Bridge
+from LdsState import LdsState
+
+# Prefetch scope selector: what history the prefetch stride is derived
+# from (per CU, per phase, per wavefront, or a fixed stride).
+class PrefetchType(Enum): vals = [
+    'PF_CU',
+    'PF_PHASE',
+    'PF_WF',
+    'PF_STRIDE',
+    'PF_END',
+    ]
+
+class VectorRegisterFile(SimObject):
+    # Per-SIMD vector register file: backing storage size and the
+    # minimum VGPR allocation granularity per wavefront.
+    type = 'VectorRegisterFile'
+    cxx_class = 'VectorRegisterFile'
+    cxx_header = 'gpu-compute/vector_register_file.hh'
+
+    simd_id = Param.Int(0, 'SIMD ID associated with this VRF')
+    num_regs_per_simd = Param.Int(2048, 'number of vector registers per SIMD')
+    min_alloc = Param.Int(4, 'min number of VGPRs allocated per WF')
+
+class Wavefront(SimObject):
+    # A single wavefront: identified by its SIMD unit and its slot
+    # within that SIMD.
+    type = 'Wavefront'
+    cxx_class = 'Wavefront'
+    cxx_header = 'gpu-compute/wavefront.hh'
+
+    # NOTE(review): parameter naming is inconsistent (camelCase simdId
+    # vs snake_case wf_slot_id); kept as-is since C++ binds to these
+    # exact names.
+    simdId = Param.Int('SIMD id (0-ComputeUnit.num_SIMDs)')
+    wf_slot_id = Param.Int('wavefront id (0-ComputeUnit.max_wfs)')
+
+class ComputeUnit(MemObject):
+ type = 'ComputeUnit'
+ cxx_class = 'ComputeUnit'
+ cxx_header = 'gpu-compute/compute_unit.hh'
+
+ wavefronts = VectorParam.Wavefront('Number of wavefronts')
+ wfSize = Param.Int(64, 'Wavefront size (in work items)')
+ num_SIMDs = Param.Int(4, 'number of SIMD units per CU')
+
+ spbypass_pipe_length = Param.Int(4, 'vector ALU Single Precision bypass '\
+ 'latency')
+
+ dpbypass_pipe_length = Param.Int(8, 'vector ALU Double Precision bypass '\
+ 'latency')
+
+ issue_period = Param.Int(4, 'number of cycles per issue period')
+ num_global_mem_pipes = Param.Int(1,'number of global memory pipes per CU')
+ num_shared_mem_pipes = Param.Int(1,'number of shared memory pipes per CU')
+ n_wf = Param.Int(1, 'Number of wavefront slots per SIMD')
+ mem_req_latency = Param.Int(9, "Latency for request from the cu to ruby. "\
+ "Represents the pipeline to reach the TCP and "\
+ "specified in GPU clock cycles")
+ mem_resp_latency = Param.Int(9, "Latency for responses from ruby to the "\
+ "cu. Represents the pipeline between the TCP "\
+ "and cu as well as TCP data array access. "\
+ "Specified in GPU clock cycles")
+ system = Param.System(Parent.any, "system object")
+ cu_id = Param.Int('CU id')
+ vrf_to_coalescer_bus_width = Param.Int(32, "VRF->Coalescer data bus width "\
+ "in bytes")
+ coalescer_to_vrf_bus_width = Param.Int(32, "Coalescer->VRF data bus width "\
+ "in bytes")
+
+ memory_port = VectorMasterPort("Port to the memory system")
+ translation_port = VectorMasterPort('Port to the TLB hierarchy')
+ sqc_port = MasterPort("Port to the SQC (I-cache")
+ sqc_tlb_port = MasterPort("Port to the TLB for the SQC (I-cache)")
+ perLaneTLB = Param.Bool(False, "enable per-lane TLB")
+ prefetch_depth = Param.Int(0, "Number of prefetches triggered at a time"\
+ "(0 turns off prefetching)")
+ prefetch_stride = Param.Int(1, "Fixed Prefetch Stride (1 means next-page)")
+ prefetch_prev_type = Param.PrefetchType('PF_PHASE', "Prefetch the stride "\
+ "from last mem req in lane of "\
+ "CU|Phase|Wavefront")
+ execPolicy = Param.String("OLDEST-FIRST", "WF execution selection policy");
+ xactCasMode = Param.Bool(False, "Behavior of xact_cas_load magic instr.");
+ debugSegFault = Param.Bool(False, "enable debugging GPU seg faults")
+ functionalTLB = Param.Bool(False, "Assume TLB causes no delay")
+
+ localMemBarrier = Param.Bool(False, "Assume Barriers do not wait on "\
+ "kernel end")
+
+ countPages = Param.Bool(False, "Generate per-CU file of all pages touched "\
+ "and how many times")
+ global_mem_queue_size = Param.Int(256, "Number of entries in the global "
+ "memory pipeline's queues")
+ local_mem_queue_size = Param.Int(256, "Number of entries in the local "
+ "memory pipeline's queues")
+ ldsBus = Bridge() # the bridge between the CU and its LDS
+ ldsPort = MasterPort("The port that goes to the LDS")
+ localDataStore = Param.LdsState("the LDS for this CU")
+
+ vector_register_file = VectorParam.VectorRegisterFile("Vector register "\
+ "file")
+
+class Shader(ClockedObject):
+ type = 'Shader'
+ cxx_class = 'Shader'
+ cxx_header = 'gpu-compute/shader.hh'
+
+ CUs = VectorParam.ComputeUnit('Number of compute units')
+ n_wf = Param.Int(1, 'Number of wavefront slots per SIMD')
+ impl_kern_boundary_sync = Param.Bool(True, """Insert acq/rel packets into
+ ruby at kernel boundaries""")
+ separate_acquire_release = Param.Bool(False,
+ """Do ld_acquire/st_release generate separate requests for the
+ acquire and release?""")
+ globalmem = Param.MemorySize('64kB', 'Memory size')
+ timing = Param.Bool(False, 'timing memory accesses')
+
+ cpu_pointer = Param.BaseCPU(NULL, "pointer to base CPU")
+ translation = Param.Bool(False, "address translation");
+
+class ClDriver(EmulatedDriver):
+    # Emulated OpenCL driver: supplies the GPU kernel code file(s) to
+    # the simulated process through the driver interface.
+    type = 'ClDriver'
+    cxx_header = 'gpu-compute/cl_driver.hh'
+    codefile = VectorParam.String('code file name(s)')
+
+class GpuDispatcher(DmaDevice):
+    # Memory-mapped DMA device through which the CPU launches kernels
+    # on the shader and is woken up on kernel completion.
+    type = 'GpuDispatcher'
+    cxx_header = 'gpu-compute/dispatcher.hh'
+    # put at 8GB line for now
+    pio_addr = Param.Addr(0x200000000, "Device Address")
+    pio_latency = Param.Latency('1ns', "Programmed IO latency")
+    shader_pointer = Param.Shader('pointer to shader')
+    translation_port = MasterPort('Port to the dispatcher TLB')
+    cpu = Param.BaseCPU("CPU to wake up on kernel completion")
+
+    cl_driver = Param.ClDriver('pointer to driver')
+
+# Operation categories for HSAIL instructions, grouped by memory
+# segment (global/shared/private/spill/readonly/flat) and access kind
+# (read/write/atomic/atomic-with-history/load-address).
+class OpType(Enum): vals = [
+    'OT_NULL',
+    'OT_ALU',
+    'OT_SPECIAL',
+    'OT_GLOBAL_READ',
+    'OT_GLOBAL_WRITE',
+    'OT_GLOBAL_ATOMIC',
+    'OT_GLOBAL_HIST',
+    'OT_GLOBAL_LDAS',
+    'OT_SHARED_READ',
+    'OT_SHARED_WRITE',
+    'OT_SHARED_ATOMIC',
+    'OT_SHARED_HIST',
+    'OT_SHARED_LDAS',
+    'OT_PRIVATE_READ',
+    'OT_PRIVATE_WRITE',
+    'OT_PRIVATE_ATOMIC',
+    'OT_PRIVATE_HIST',
+    'OT_PRIVATE_LDAS',
+    'OT_SPILL_READ',
+    'OT_SPILL_WRITE',
+    'OT_SPILL_ATOMIC',
+    'OT_SPILL_HIST',
+    'OT_SPILL_LDAS',
+    'OT_READONLY_READ',
+    'OT_READONLY_WRITE',
+    'OT_READONLY_ATOMIC',
+    'OT_READONLY_HIST',
+    'OT_READONLY_LDAS',
+    'OT_FLAT_READ',
+    'OT_FLAT_WRITE',
+    'OT_FLAT_ATOMIC',
+    'OT_FLAT_HIST',
+    'OT_FLAT_LDAS',
+    'OT_KERN_READ',
+    'OT_BRANCH',
+
+    # note: Only the OT_BOTH_MEMFENCE seems to be supported in the 1.0F version
+    # of the compiler.
+    'OT_SHARED_MEMFENCE',
+    'OT_GLOBAL_MEMFENCE',
+    'OT_BOTH_MEMFENCE',
+
+    'OT_BARRIER',
+    'OT_PRINT',
+    'OT_RET',
+    'OT_NOP',
+    'OT_ARG'
+    ]
+
+# Data types for memory accesses: unsigned/signed integers and floats
+# of 8-64 bits.
+class MemType(Enum): vals = [
+    'M_U8',
+    'M_U16',
+    'M_U32',
+    'M_U64',
+    'M_S8',
+    'M_S16',
+    'M_S32',
+    'M_S64',
+    'M_F16',
+    'M_F32',
+    'M_F64',
+    ]
+
+# Memory operation kinds: plain loads/stores, atomics that return the
+# old value (MO_A*), atomics with no return (MO_ANR*), and
+# history/"hist" atomic variants (MO_H*).
+class MemOpType(Enum): vals = [
+    'MO_LD',
+    'MO_ST',
+    'MO_LDAS',
+    'MO_LDA',
+    'MO_AAND',
+    'MO_AOR',
+    'MO_AXOR',
+    'MO_ACAS',
+    'MO_AEXCH',
+    'MO_AADD',
+    'MO_ASUB',
+    'MO_AINC',
+    'MO_ADEC',
+    'MO_AMAX',
+    'MO_AMIN',
+    'MO_ANRAND',
+    'MO_ANROR',
+    'MO_ANRXOR',
+    'MO_ANRCAS',
+    'MO_ANREXCH',
+    'MO_ANRADD',
+    'MO_ANRSUB',
+    'MO_ANRINC',
+    'MO_ANRDEC',
+    'MO_ANRMAX',
+    'MO_ANRMIN',
+    'MO_HAND',
+    'MO_HOR',
+    'MO_HXOR',
+    'MO_HCAS',
+    'MO_HEXCH',
+    'MO_HADD',
+    'MO_HSUB',
+    'MO_HINC',
+    'MO_HDEC',
+    'MO_HMAX',
+    'MO_HMIN',
+    'MO_UNDEF'
+    ]
+
+# HSAIL storage segments a symbol can live in.
+class StorageClassType(Enum): vals = [
+    'SC_SPILL',
+    'SC_GLOBAL',
+    'SC_SHARED',
+    'SC_PRIVATE',
+    'SC_READONLY',
+    'SC_KERNARG',
+    'SC_NONE',
+    ]
+
+# Register file an operand maps to (vector, scalar, condition, or
+# hardware/special register).
+class RegisterType(Enum): vals = [
+    'RT_VECTOR',
+    'RT_SCALAR',
+    'RT_CONDITION',
+    'RT_HARDWARE',
+    'RT_NONE',
+    ]
+
+# Memory-ordering semantics attached to a memory operation.
+class GenericMemoryOrder(Enum): vals = [
+    'MEMORY_ORDER_NONE',
+    'MEMORY_ORDER_RELAXED',
+    'MEMORY_ORDER_SC_ACQUIRE',
+    'MEMORY_ORDER_SC_RELEASE',
+    'MEMORY_ORDER_SC_ACQUIRE_RELEASE',
+    ]
+
+# Visibility scope of a memory operation, from a single work-item up
+# to the whole system.
+class GenericMemoryScope(Enum): vals = [
+    'MEMORY_SCOPE_NONE',
+    'MEMORY_SCOPE_WORKITEM',
+    'MEMORY_SCOPE_WAVEFRONT',
+    'MEMORY_SCOPE_WORKGROUP',
+    'MEMORY_SCOPE_DEVICE',
+    'MEMORY_SCOPE_SYSTEM',
+    ]
diff --git a/src/gpu-compute/LdsState.py b/src/gpu-compute/LdsState.py
new file mode 100644
index 000000000..6ea9f6427
--- /dev/null
+++ b/src/gpu-compute/LdsState.py
@@ -0,0 +1,51 @@
+#
+# Copyright (c) 2015 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Author: Joe Gross
+#
+
+from m5.defines import buildEnv
+from m5.params import *
+from m5.proxy import *
+
+from MemObject import MemObject
+
+class LdsState(MemObject):
+    # Local Data Store (on-chip scratchpad) shared by the wavefronts of
+    # a compute unit.
+    type = 'LdsState'
+    cxx_class = 'LdsState'
+    cxx_header = 'gpu-compute/lds_state.hh'
+    # NOTE(review): 'size' (bytes) and 'range' both encode 64kB and
+    # must be kept in agreement — confirm whether one could be derived
+    # from the other.
+    size = Param.Int(65536, 'the size of the LDS')
+    range = Param.AddrRange('64kB', "address space of the LDS")
+    bankConflictPenalty = Param.Int(1, 'penalty per LDS bank conflict when '\
+                                    'accessing data')
+    banks = Param.Int(32, 'Number of LDS banks')
+    cuPort = SlavePort("port that goes to the compute unit")
diff --git a/src/gpu-compute/SConscript b/src/gpu-compute/SConscript
new file mode 100644
index 000000000..2de96df24
--- /dev/null
+++ b/src/gpu-compute/SConscript
@@ -0,0 +1,99 @@
+# -*- mode:python -*-
+
+#
+# Copyright (c) 2015 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Author: Anthony Gutierrez
+#
+
+Import('*')
+
+# The GPU model is optional: skip this directory entirely when it is
+# not enabled in the build configuration.
+if not env['BUILD_GPU']:
+    Return()
+
+SimObject('GPU.py')
+SimObject('LdsState.py')
+SimObject('X86GPUTLB.py')
+
+# The BRIG loader and HSAIL code objects are only built for the hsail
+# GPU ISA.
+if env['TARGET_GPU_ISA'] == 'hsail':
+    Source('brig_object.cc')
+    Source('hsail_code.cc')
+
+Source('cl_driver.cc')
+Source('compute_unit.cc')
+Source('condition_register_state.cc')
+Source('dispatcher.cc')
+Source('exec_stage.cc')
+Source('fetch_stage.cc')
+Source('fetch_unit.cc')
+Source('global_memory_pipeline.cc')
+Source('gpu_dyn_inst.cc')
+Source('gpu_exec_context.cc')
+Source('gpu_static_inst.cc')
+Source('gpu_tlb.cc')
+Source('hsa_object.cc')
+Source('kernel_cfg.cc')
+Source('lds_state.cc')
+Source('local_memory_pipeline.cc')
+Source('of_scheduling_policy.cc')
+Source('pool_manager.cc')
+Source('rr_scheduling_policy.cc')
+Source('schedule_stage.cc')
+Source('scheduler.cc')
+Source('scoreboard_check_stage.cc')
+Source('shader.cc')
+Source('simple_pool_manager.cc')
+Source('tlb_coalescer.cc')
+Source('vector_register_file.cc')
+Source('vector_register_state.cc')
+Source('wavefront.cc')
+
+DebugFlag('BRIG')
+DebugFlag('GPUCoalescer')
+DebugFlag('GPUDisp')
+DebugFlag('GPUExec')
+DebugFlag('GPUFetch')
+DebugFlag('GPUHsailCFInfo')
+DebugFlag('GPUMem')
+DebugFlag('GPUPort')
+DebugFlag('GPUPrefetch')
+DebugFlag('GPUReg')
+DebugFlag('GPUSync')
+DebugFlag('GPUTLB')
+DebugFlag('HSALoader')
+DebugFlag('HSAIL')
+DebugFlag('HSAILObject')
+DebugFlag('Predictor')
+DebugFlag('WavefrontStack')
+
+# Umbrella flag enabling the most commonly used GPU debug output.
+CompoundFlag('GPUALL', ['GPUCoalescer', 'GPUDisp', 'GPUExec', 'GPUFetch',
+                        'GPUMem', 'GPUPort', 'GPUSync', 'GPUTLB', 'HSAIL'])
diff --git a/src/gpu-compute/X86GPUTLB.py b/src/gpu-compute/X86GPUTLB.py
new file mode 100644
index 000000000..51f8e514e
--- /dev/null
+++ b/src/gpu-compute/X86GPUTLB.py
@@ -0,0 +1,77 @@
+#
+# Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Author: Lisa Hsu
+#
+
+from m5.defines import buildEnv
+from m5.params import *
+from m5.proxy import *
+
+from m5.objects.MemObject import MemObject
+
+# The hardware page-table walker is only modeled in full-system mode.
+if buildEnv['FULL_SYSTEM']:
+    class X86PagetableWalker(MemObject):
+        type = 'X86PagetableWalker'
+        cxx_class = 'X86ISA::Walker'
+        port = SlavePort("Port for the hardware table walker")
+        system = Param.System(Parent.any, "system object")
+
+class X86GPUTLB(MemObject):
+    # TLB used by the GPU for x86 address translation; sits between the
+    # CUs/coalescer (slave side) and the next translation level (master
+    # side).
+    type = 'X86GPUTLB'
+    cxx_class = 'X86ISA::GpuTLB'
+    cxx_header = 'gpu-compute/gpu_tlb.hh'
+    size = Param.Int(64, "TLB size (number of entries)")
+    assoc = Param.Int(64, "TLB associativity")
+
+    # Only full-system mode has a real page-table walker to hook up.
+    if buildEnv['FULL_SYSTEM']:
+        walker = Param.X86PagetableWalker(X86PagetableWalker(),
+                                          "page table walker")
+
+    hitLatency = Param.Int(2, "Latency of a TLB hit")
+    missLatency1 = Param.Int(5, "Latency #1 of a TLB miss")
+    missLatency2 = Param.Int(100, "Latency #2 of a TLB miss")
+    maxOutstandingReqs = Param.Int(64, "# of maximum outstanding requests")
+    slave = VectorSlavePort("Port on side closer to CPU/CU")
+    master = VectorMasterPort("Port on side closer to memory")
+    allocationPolicy = Param.Bool(True, "Allocate on an access")
+    accessDistance = Param.Bool(False, "print accessDistance stats")
+
+class TLBCoalescer(MemObject):
+ type = 'TLBCoalescer'
+ cxx_class = 'TLBCoalescer'
+ cxx_header = 'gpu-compute/tlb_coalescer.hh'
+ probesPerCycle = Param.Int(2, "Number of TLB probes per cycle")
+ coalescingWindow = Param.Int(1, "Permit coalescing across that many ticks")
+ slave = VectorSlavePort("Port on side closer to CPU/CU")
+ master = VectorMasterPort("Port on side closer to memory")
+ disableCoalescing = Param.Bool(False,"Dispable Coalescing")
diff --git a/src/gpu-compute/brig_object.cc b/src/gpu-compute/brig_object.cc
new file mode 100644
index 000000000..7cc9b7cc4
--- /dev/null
+++ b/src/gpu-compute/brig_object.cc
@@ -0,0 +1,474 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt, Anthony Gutierrez
+ */
+
+#include "gpu-compute/brig_object.hh"
+
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdlib>
+
+#include "arch/hsail/Brig.h"
+#include "base/misc.hh"
+#include "base/trace.hh"
+#include "debug/BRIG.hh"
+#include "debug/HSAILObject.hh"
+#include "debug/HSALoader.hh"
+
+using namespace Brig;
+
+// Registry of loader probe functions; BrigObject::tryFile is the only
+// HSA object format currently supported.
+std::vector<std::function<HsaObject*(const std::string&, int, uint8_t*)>>
+    HsaObject::tryFileFuncs = { BrigObject::tryFile };
+
+extern int getBrigDataTypeBytes(BrigType16_t t);
+
+// Section names expected in the BRIG file, in SectionIndex order.
+const char *BrigObject::sectionNames[] =
+{
+    "hsa_data",
+    "hsa_code",
+    "hsa_operand",
+    ".shstrtab"
+};
+
+// Printable names for the BRIG memory segments.
+const char *segmentNames[] =
+{
+    "none",
+    "flat",
+    "global",
+    "readonly",
+    "kernarg",
+    "group",
+    "private",
+    "spill",
+    "args"
+};
+
+// Return a pointer 'offs' bytes into the given section. An offset equal
+// to the section size is allowed so callers can form one-past-the-end
+// "dummy" pointers.
+const uint8_t*
+BrigObject::getSectionOffset(enum SectionIndex sec, int offs) const
+{
+    // allow offs == size for dummy end pointers
+    assert(offs <= sectionInfo[sec].size);
+
+    return sectionInfo[sec].ptr + offs;
+}
+
+// Return the string stored at 'offs' in the data section. The +4 skips
+// the leading 4-byte length field of the BrigData entry — presumably
+// its byteCount header; confirm against the BRIG spec.
+const char*
+BrigObject::getString(int offs) const
+{
+    return (const char*)(getSectionOffset(DataSectionIndex, offs) + 4);
+}
+
+// Return the code-section entry at byte offset 'offs'.
+const BrigBase*
+BrigObject::getCodeSectionEntry(int offs) const
+{
+    return (const BrigBase*)getSectionOffset(CodeSectionIndex, offs);
+}
+
+// Interpret the bytes at 'offs' in the data section as a BrigData
+// entry. Fix vs. original: cast to a const pointer for consistency
+// with the other accessors (getOperand, getInst) — the section storage
+// is never modified through this path.
+const BrigData*
+BrigObject::getBrigBaseData(int offs) const
+{
+    return (const Brig::BrigData*)(getSectionOffset(DataSectionIndex, offs));
+}
+
+// Return a raw byte pointer into the data section at 'offs'.
+const uint8_t*
+BrigObject::getData(int offs) const
+{
+    return getSectionOffset(DataSectionIndex, offs);
+}
+
+// Return the operand-section entry at byte offset 'offs'.
+const BrigOperand*
+BrigObject::getOperand(int offs) const
+{
+    return (const BrigOperand*)getSectionOffset(OperandsSectionIndex, offs);
+}
+
+// Return the 'index'-th operand offset from the offset list stored at
+// 'offs' in the data section. The '+ 1' skips what is presumably the
+// 4-byte element-count header of the BrigData array — confirm against
+// the BRIG spec.
+unsigned
+BrigObject::getOperandPtr(int offs, int index) const
+{
+    unsigned *op_offs = (unsigned*)(getData(offs + 4 * (index + 1)));
+
+    return *op_offs;
+}
+
+// Return the instruction entry at byte offset 'offs' in the code
+// section (instructions share the code section with directives).
+const BrigInstBase*
+BrigObject::getInst(int offs) const
+{
+    return (const BrigInstBase*)getSectionOffset(CodeSectionIndex, offs);
+}
+
+// Look up a kernel by name, mirroring getFunction(). The original stub
+// unconditionally returned nullptr even though processDirectives()
+// populates the kernels vector; callers that treated nullptr as "not
+// found" are unaffected when the kernel genuinely does not exist.
+HsaCode*
+BrigObject::getKernel(const std::string &name) const
+{
+    for (int i = 0; i < kernels.size(); ++i) {
+        if (kernels[i]->name() == name) {
+            return kernels[i];
+        }
+    }
+
+    return nullptr;
+}
+
+// Linear scan of the loaded functions; returns the first function with
+// the given name, or nullptr when none matches.
+HsaCode*
+BrigObject::getFunction(const std::string &name) const
+{
+    for (const auto &func : functions) {
+        if (func->name() == name) {
+            return func;
+        }
+    }
+
+    return nullptr;
+}
+
+// Walk the code-section entries from dirPtr to endPtr, registering
+// kernels, functions, and module-scope variables in 'storageMap'.
+// Instruction and operand entries interleaved in the section are
+// skipped; unknown kinds only produce a warning.
+void
+BrigObject::processDirectives(const BrigBase *dirPtr, const BrigBase *endPtr,
+                              StorageMap *storageMap)
+{
+    while (dirPtr < endPtr) {
+        if (!dirPtr->byteCount) {
+            fatal("Bad directive size 0\n");
+        }
+
+        // calculate next pointer now so we can override it if needed
+        const BrigBase *nextDirPtr = brigNext(dirPtr);
+
+        DPRINTF(HSAILObject, "Code section entry kind: #%x, byte count: %d\n",
+                dirPtr->kind, dirPtr->byteCount);
+
+        switch (dirPtr->kind) {
+          case BRIG_KIND_DIRECTIVE_FUNCTION:
+            {
+                const BrigDirectiveExecutable *p M5_VAR_USED =
+                    reinterpret_cast<const BrigDirectiveExecutable*>(dirPtr);
+
+                DPRINTF(HSAILObject,"DIRECTIVE_FUNCTION: %s offset: "
+                        "%d next: %d\n", getString(p->name),
+                        p->firstCodeBlockEntry, p->nextModuleEntry);
+
+                if (p->firstCodeBlockEntry != p->nextModuleEntry) {
+                    panic("Function calls are not fully supported yet!!: %s\n",
+                          getString(p->name));
+
+                    // NOTE(review): everything below the panic() in this
+                    // branch is unreachable dead code, apparently retained
+                    // as a template for future function-call support.
+                    const char *name = getString(p->name);
+
+                    HsailCode *code_obj = nullptr;
+
+                    for (int i = 0; i < functions.size(); ++i) {
+                        if (functions[i]->name() == name) {
+                            code_obj = functions[i];
+                            break;
+                        }
+                    }
+
+                    if (!code_obj) {
+                        // create new local storage map for kernel-local
+                        // symbols
+                        code_obj = new HsailCode(name, p, this,
+                                                 new StorageMap(storageMap));
+                        functions.push_back(code_obj);
+                    } else {
+                        panic("Multiple definition of Function!!: %s\n",
+                              getString(p->name));
+                    }
+
+                }
+                // skip the whole function body (bodies are not processed
+                // at module scope)
+                nextDirPtr = getCodeSectionEntry(p->nextModuleEntry);
+            }
+            break;
+
+          case BRIG_KIND_DIRECTIVE_KERNEL:
+            {
+                const BrigDirectiveExecutable *p =
+                    reinterpret_cast<const BrigDirectiveExecutable*>(dirPtr);
+
+                DPRINTF(HSAILObject,"DIRECTIVE_KERNEL: %s offset: %d count: "
+                        "next: %d\n", getString(p->name),
+                        p->firstCodeBlockEntry, p->nextModuleEntry);
+
+                const char *name = getString(p->name);
+
+                // strip the leading '&' global-symbol sigil, if present
+                if (name[0] == '&')
+                    name++;
+
+                std::string str = name;
+                char *temp;
+                int len = str.length();
+
+                // NOTE(review): when the name does not end in a lowercase
+                // letter the final character is dropped — presumably
+                // stripping a trailing mangling character; confirm against
+                // the kernel-name mangling the toolchain emits.
+                if (str[len - 1] >= 'a' && str[len - 1] <= 'z') {
+                    temp = new char[str.size() + 1];
+                    std::copy(str.begin(), str.end() , temp);
+                    temp[str.size()] = '\0';
+                } else {
+                    temp = new char[str.size()];
+                    std::copy(str.begin(), str.end() - 1 , temp);
+                    temp[str.size() - 1 ] = '\0';
+                }
+
+                std::string kernel_name = temp;
+                delete[] temp;
+
+                HsailCode *code_obj = nullptr;
+
+                for (const auto &kernel : kernels) {
+                    if (kernel->name() == kernel_name) {
+                        code_obj = kernel;
+                        break;
+                    }
+                }
+
+                if (!code_obj) {
+                    // create new local storage map for kernel-local symbols
+                    code_obj = new HsailCode(kernel_name, p, this,
+                                             new StorageMap(storageMap));
+
+                    kernels.push_back(code_obj);
+                }
+
+                nextDirPtr = getCodeSectionEntry(p->nextModuleEntry);
+            }
+            break;
+
+          case BRIG_KIND_DIRECTIVE_VARIABLE:
+            {
+                const BrigDirectiveVariable *p =
+                    reinterpret_cast<const BrigDirectiveVariable*>(dirPtr);
+
+                uint64_t readonlySize_old =
+                    storageMap->getSize(BRIG_SEGMENT_READONLY);
+
+                StorageElement* se = storageMap->addSymbol(p, this);
+
+                DPRINTF(HSAILObject, "DIRECTIVE_VARIABLE, symbol %s\n",
+                        getString(p->name));
+
+                if (p->segment == BRIG_SEGMENT_READONLY) {
+                    // readonly memory has initialization data
+                    uint8_t* readonlyData_old = readonlyData;
+
+                    // grow the readonly buffer to the new total size
+                    readonlyData =
+                        new uint8_t[storageMap->getSize(BRIG_SEGMENT_READONLY)];
+
+                    if (p->init) {
+                        // image/sampler/signal types have no raw byte
+                        // image we can copy into the readonly buffer
+                        if ((p->type == BRIG_TYPE_ROIMG) ||
+                            (p->type == BRIG_TYPE_WOIMG) ||
+                            (p->type == BRIG_TYPE_SAMP) ||
+                            (p->type == BRIG_TYPE_SIG32) ||
+                            (p->type == BRIG_TYPE_SIG64)) {
+                            panic("Read only data type not supported: %s\n",
+                                  getString(p->name));
+                        }
+
+                        const BrigOperand *brigOp = getOperand(p->init);
+                        assert(brigOp->kind ==
+                               BRIG_KIND_OPERAND_CONSTANT_BYTES);
+
+                        const Brig::BrigData *operand_data M5_VAR_USED =
+                            getBrigBaseData(((BrigOperandConstantBytes*)
+                                            brigOp)->bytes);
+
+                        assert((operand_data->byteCount / 4) > 0);
+
+                        uint8_t *symbol_data =
+                            (uint8_t*)getData(((BrigOperandConstantBytes*)
+                                              brigOp)->bytes + 4);
+
+                        // copy the old data and add the new data
+                        if (readonlySize_old > 0) {
+                            memcpy(readonlyData, readonlyData_old,
+                                   readonlySize_old);
+                        }
+
+                        memcpy(readonlyData + se->offset, symbol_data,
+                               se->size);
+
+                        delete[] readonlyData_old;
+                    }
+                }
+            }
+            break;
+
+          case BRIG_KIND_DIRECTIVE_LABEL:
+            {
+                const BrigDirectiveLabel M5_VAR_USED *p =
+                    reinterpret_cast<const BrigDirectiveLabel*>(dirPtr);
+
+                panic("Label directives cannot be at the module level: %s\n",
+                      getString(p->name));
+
+            }
+            break;
+
+          case BRIG_KIND_DIRECTIVE_COMMENT:
+            {
+                const BrigDirectiveComment M5_VAR_USED *p =
+                    reinterpret_cast<const BrigDirectiveComment*>(dirPtr);
+
+                DPRINTF(HSAILObject, "DIRECTIVE_COMMENT: %s\n",
+                        getString(p->name));
+            }
+            break;
+
+          case BRIG_KIND_DIRECTIVE_LOC:
+            {
+                DPRINTF(HSAILObject, "BRIG_DIRECTIVE_LOC\n");
+            }
+            break;
+
+          case BRIG_KIND_DIRECTIVE_MODULE:
+            {
+                const BrigDirectiveModule M5_VAR_USED *p =
+                    reinterpret_cast<const BrigDirectiveModule*>(dirPtr);
+
+                DPRINTF(HSAILObject, "BRIG_DIRECTIVE_MODULE: %s\n",
+                        getString(p->name));
+            }
+            break;
+
+          case BRIG_KIND_DIRECTIVE_CONTROL:
+            {
+                DPRINTF(HSAILObject, "DIRECTIVE_CONTROL\n");
+            }
+            break;
+
+          case BRIG_KIND_DIRECTIVE_PRAGMA:
+            {
+                DPRINTF(HSAILObject, "DIRECTIVE_PRAGMA\n");
+            }
+            break;
+
+          case BRIG_KIND_DIRECTIVE_EXTENSION:
+            {
+                DPRINTF(HSAILObject, "DIRECTIVE_EXTENSION\n");
+            }
+            break;
+
+          case BRIG_KIND_DIRECTIVE_ARG_BLOCK_START:
+            {
+                DPRINTF(HSAILObject, "DIRECTIVE_ARG_BLOCK_START\n");
+            }
+            break;
+
+          case BRIG_KIND_DIRECTIVE_ARG_BLOCK_END:
+            {
+                DPRINTF(HSAILObject, "DIRECTIVE_ARG_BLOCK_END\n");
+            }
+            break;
+          default:
+            // instructions and operands share the code section with
+            // directives; they are skipped silently here
+            if (dirPtr->kind >= BRIG_KIND_INST_BEGIN &&
+                dirPtr->kind <= BRIG_KIND_INST_END)
+                break;
+
+            if (dirPtr->kind >= BRIG_KIND_OPERAND_BEGIN &&
+                dirPtr->kind <= BRIG_KIND_OPERAND_END)
+                break;
+
+            warn("Unknown Brig directive kind: %d\n", dirPtr->kind);
+            break;
+        }
+
+        dirPtr = nextDirPtr;
+    }
+}
+
+// Loader probe callback: return a new BrigObject if fileData begins
+// with the BRIG magic string, nullptr otherwise (so other loaders in
+// tryFileFuncs can be tried).
+HsaObject*
+BrigObject::tryFile(const std::string &fname, int len, uint8_t *fileData)
+{
+    const char *brig_ident = "HSA BRIG";
+
+    if (memcmp(brig_ident, fileData, MODULE_IDENTIFICATION_LENGTH))
+        return nullptr;
+
+    return new BrigObject(fname, len, fileData);
+}
+
+// Construct a BrigObject from a raw BRIG module image. Validates the
+// magic string, version, and section count; copies each section into a
+// heap buffer owned by this object; then processes the code-section
+// directives. Takes ownership of fileData and frees it before
+// returning.
+BrigObject::BrigObject(const std::string &fname, int len, uint8_t *fileData)
+    : HsaObject(fname), storageMap(new StorageMap())
+{
+    const char *brig_ident = "HSA BRIG";
+    BrigModuleHeader *mod_hdr = (BrigModuleHeader*)fileData;
+
+    fatal_if(memcmp(brig_ident, mod_hdr, MODULE_IDENTIFICATION_LENGTH),
+             "%s is not a BRIG file\n", fname);
+
+    if (mod_hdr->brigMajor != BRIG_VERSION_BRIG_MAJOR ||
+        mod_hdr->brigMinor != BRIG_VERSION_BRIG_MINOR) {
+        fatal("%s: BRIG version mismatch, %d.%d != %d.%d\n",
+              fname, mod_hdr->brigMajor, mod_hdr->brigMinor,
+              BRIG_VERSION_BRIG_MAJOR, BRIG_VERSION_BRIG_MINOR);
+    }
+
+    fatal_if(mod_hdr->sectionCount != NumSectionIndices, "%s: BRIG section "
+             "count (%d) != expected value (%d)\n", fname,
+             mod_hdr->sectionCount, NumSectionIndices);
+
+    for (int i = 0; i < NumSectionIndices; ++i) {
+        sectionInfo[i].ptr = nullptr;
+    }
+
+    // the section index table holds the file offset of each section header
+    uint64_t *sec_idx_table = (uint64_t*)(fileData + mod_hdr->sectionIndex);
+    for (int sec_idx = 0; sec_idx < mod_hdr->sectionCount; ++sec_idx) {
+        uint8_t *sec_hdr_byte_ptr = fileData + sec_idx_table[sec_idx];
+        BrigSectionHeader *sec_hdr = (BrigSectionHeader*)sec_hdr_byte_ptr;
+
+        // It doesn't look like cprintf supports string precision values,
+        // but if this breaks, the right answer is to fix that
+        DPRINTF(HSAILObject, "found section %.*s\n", sec_hdr->nameLength,
+                sec_hdr->name);
+
+        // copy the whole section (header included) so this object owns it
+        sectionInfo[sec_idx].ptr = new uint8_t[sec_hdr->byteCount];
+        memcpy(sectionInfo[sec_idx].ptr, sec_hdr_byte_ptr, sec_hdr->byteCount);
+        sectionInfo[sec_idx].size = sec_hdr->byteCount;
+    }
+
+    BrigSectionHeader *code_hdr =
+        (BrigSectionHeader*)sectionInfo[CodeSectionIndex].ptr;
+
+    DPRINTF(HSAILObject, "Code section hdr, count: %d, hdr count: %d, "
+            "name len: %d\n", code_hdr->byteCount, code_hdr->headerByteCount,
+            code_hdr->nameLength);
+
+    // start at offset 4 to skip initial null entry (see Brig spec)
+    processDirectives(getCodeSectionEntry(code_hdr->headerByteCount),
+                      getCodeSectionEntry(sectionInfo[CodeSectionIndex].size),
+                      storageMap);
+
+    delete[] fileData;
+
+    DPRINTF(HSALoader, "BRIG object %s loaded.\n", fname);
+}
+
+// Release the per-section buffers copied in the constructor.
+// Fix vs. original: removed the redundant null check before delete[]
+// (delete[] on a null pointer is a no-op) and braced the loop body.
+BrigObject::~BrigObject()
+{
+    for (int i = 0; i < NumSectionIndices; ++i) {
+        delete[] sectionInfo[i].ptr;
+    }
+}
diff --git a/src/gpu-compute/brig_object.hh b/src/gpu-compute/brig_object.hh
new file mode 100644
index 000000000..59a585914
--- /dev/null
+++ b/src/gpu-compute/brig_object.hh
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt, Anthony Gutierrez
+ */
+
+#ifndef __BRIG_OBJECT_HH__
+#define __BRIG_OBJECT_HH__
+
+#include <cassert>
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include "arch/hsail/Brig.h"
+#include "gpu-compute/hsa_object.hh"
+#include "gpu-compute/hsail_code.hh"
+
+class LabelMap;
+class StorageMap;
+
+/* @class BrigObject
+ * this class implements the BRIG loader object, and
+ * is used when the simulator directly executes HSAIL.
+ * this class is responsible for extracting all
+ * information about kernels contained in BRIG format
+ * and converts them to HsailCode objects that are
+ * usable by the simulator and emulated runtime.
+ */
+
+class BrigObject final : public HsaObject
+{
+  public:
+    // indices into sectionInfo[]; one slot per BRIG section we keep
+    enum SectionIndex
+    {
+        DataSectionIndex,
+        CodeSectionIndex,
+        OperandsSectionIndex,
+        NumSectionIndices
+    };
+
+    // printable names for the sections above (defined in the .cc file)
+    static const char *sectionNames[];
+
+    // one in-memory copy of a BRIG section plus its size in bytes
+    struct SectionInfo
+    {
+        uint8_t *ptr;
+        int size;
+    };
+
+    // factory probe: construct a BrigObject from a raw file image.
+    // NOTE(review): failure semantics (nullptr vs. fatal) are defined in
+    // the .cc -- confirm there before relying on the return value.
+    static HsaObject* tryFile(const std::string &fname, int len,
+                              uint8_t *fileData);
+
+    // heap copies of each BRIG section, indexed by SectionIndex
+    SectionInfo sectionInfo[NumSectionIndices];
+    const uint8_t *getSectionOffset(enum SectionIndex sec, int offs) const;
+
+    // all kernels/functions extracted from this BRIG module
+    std::vector<HsailCode*> kernels;
+    std::vector<HsailCode*> functions;
+    std::string kern_block_name;
+
+    // walk the code section between dirPtr and endPtr, building
+    // HsailCode objects and populating storageMap
+    void processDirectives(const Brig::BrigBase *dirPtr,
+                           const Brig::BrigBase *endPtr,
+                           StorageMap *storageMap);
+
+    BrigObject(const std::string &fname, int len, uint8_t *fileData);
+    ~BrigObject();
+
+    // eventually these will need to be per-kernel not per-object-file
+    StorageMap *storageMap;
+    LabelMap *labelMap;
+
+    // accessors that translate a byte offset within a given BRIG section
+    // into a typed pointer into the corresponding sectionInfo copy
+    const char* getString(int offs) const;
+    const Brig::BrigData* getBrigBaseData(int offs) const;
+    const uint8_t* getData(int offs) const;
+    const Brig::BrigBase* getCodeSectionEntry(int offs) const;
+    const Brig::BrigOperand* getOperand(int offs) const;
+    unsigned getOperandPtr(int offs, int index) const;
+    const Brig::BrigInstBase* getInst(int offs) const;
+
+    HsaCode* getKernel(const std::string &name) const override;
+    HsaCode* getFunction(const std::string &name) const override;
+
+    int numKernels() const override { return kernels.size(); }
+
+    // no bounds check: caller must pass 0 <= i < numKernels()
+    HsaCode* getKernel(int i) const override { return kernels[i]; }
+
+    // pointer to the current kernel/function we're processing, so elements
+    // under construction can reference it. kinda ugly, but easier
+    // than passing it all over for the few places it's needed.
+    mutable HsailCode *currentCode;
+};
+
+// Utility function to bump Brig item pointer to next element given
+// item size in bytes. Really just an add but with lots of casting.
+// Advance a pointer to a BRIG item to the item that follows it, using
+// the byteCount field every BRIG entry carries in its BrigBase header.
+// Just a byte-offset add expressed through the necessary casts.
+template<typename T>
+T*
+brigNext(T *ptr)
+{
+    int num_bytes = ((Brig::BrigBase*)ptr)->byteCount;
+    assert(num_bytes);
+
+    uint8_t *raw = (uint8_t*)ptr;
+    return (T*)(raw + num_bytes);
+}
+
+#endif // __BRIG_OBJECT_HH__
diff --git a/src/gpu-compute/cl_driver.cc b/src/gpu-compute/cl_driver.cc
new file mode 100644
index 000000000..3b3291c03
--- /dev/null
+++ b/src/gpu-compute/cl_driver.cc
@@ -0,0 +1,272 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#include "gpu-compute/cl_driver.hh"
+
+#include "base/intmath.hh"
+#include "cpu/thread_context.hh"
+#include "gpu-compute/dispatcher.hh"
+#include "gpu-compute/hsa_code.hh"
+#include "gpu-compute/hsa_kernel_info.hh"
+#include "gpu-compute/hsa_object.hh"
+#include "params/ClDriver.hh"
+#include "sim/process.hh"
+#include "sim/syscall_emul_buf.hh"
+
+// Load every code file named in the params, collect all kernels they
+// contain, and precompute each kernel's offsets into the shared string
+// table and code buffer handed to the emulated runtime via ioctl.
+ClDriver::ClDriver(ClDriverParams *p)
+    : EmulatedDriver(p), hsaCode(0)
+{
+    // NOTE(review): this stores pointers into p->codefile's elements;
+    // assumes the params object outlives the driver -- confirm.
+    for (const auto &codeFile : p->codefile)
+        codeFiles.push_back(&codeFile);
+
+    maxFuncArgsSize = 0;
+
+    for (int i = 0; i < codeFiles.size(); ++i) {
+        HsaObject *obj = HsaObject::createHsaObject(*codeFiles[i]);
+
+        for (int k = 0; k < obj->numKernels(); ++k) {
+            assert(obj->getKernel(k));
+            kernels.push_back(obj->getKernel(k));
+            kernels.back()->setReadonlyData((uint8_t*)obj->readonlyData);
+            // track the largest function-argument area any kernel needs
+            int kern_funcargs_size = kernels.back()->funcarg_size;
+            maxFuncArgsSize = maxFuncArgsSize < kern_funcargs_size ?
+                kern_funcargs_size : maxFuncArgsSize;
+        }
+    }
+
+    // running offsets: each kernel's name and code are laid out
+    // back-to-back in the tables returned by HSA_GET_STRINGS/HSA_GET_CODE
+    int name_offs = 0;
+    int code_offs = 0;
+
+    for (int i = 0; i < kernels.size(); ++i) {
+        kernelInfo.push_back(HsaKernelInfo());
+        HsaCode *k = kernels[i];
+
+        k->generateHsaKernelInfo(&kernelInfo[i]);
+
+        kernelInfo[i].name_offs = name_offs;
+        kernelInfo[i].code_offs = code_offs;
+
+        // +1 for the terminating '\0' of each name
+        name_offs += k->name().size() + 1;
+        code_offs += k->numInsts() * sizeof(GPUStaticInst*);
+    }
+}
+
+// Connect the driver to the GPU dispatcher and tell it how much space
+// function arguments may need (the max over all loaded kernels).
+void
+ClDriver::handshake(GpuDispatcher *_dispatcher)
+{
+    dispatcher = _dispatcher;
+    dispatcher->setFuncargsSize(maxFuncArgsSize);
+}
+
+// Emulated open() on the driver's device file: allocate a file
+// descriptor in the process and point its FD entry back at this driver
+// so later ioctl() calls are routed here. mode/flags are ignored.
+int
+ClDriver::open(LiveProcess *p, ThreadContext *tc, int mode, int flags)
+{
+    int fd = p->allocFD(-1, filename, 0, 0, false);
+    FDEntry *fde = p->getFDEntry(fd);
+    fde->driver = this;
+
+    return fd;
+}
+
+// Emulated ioctl interface between the user-level runtime and the
+// simulated GPU. Every request copies a result structure/buffer out to
+// the guest address passed as the third syscall argument. Returns 0 on
+// success; unknown requests are fatal.
+int
+ClDriver::ioctl(LiveProcess *process, ThreadContext *tc, unsigned req)
+{
+    // arg 2 of the guest ioctl() call is the user buffer address
+    int index = 2;
+    Addr buf_addr = process->getSyscallArg(tc, index);
+
+    switch (req) {
+      case HSA_GET_SIZES:
+        // report table sizes so the runtime can size its buffers before
+        // issuing the GET_KINFO/GET_STRINGS/GET_CODE requests below
+        {
+            TypedBufferArg<HsaDriverSizes> sizes(buf_addr);
+            sizes->num_kernels = kernels.size();
+            sizes->string_table_size = 0;
+            sizes->code_size = 0;
+            sizes->readonly_size = 0;
+
+            if (kernels.size() > 0) {
+                // all kernels will share the same read-only memory
+                sizes->readonly_size =
+                    kernels[0]->getSize(HsaCode::MemorySegment::READONLY);
+                // check our assumption
+                for (int i = 1; i<kernels.size(); ++i) {
+                    assert(sizes->readonly_size ==
+                    kernels[i]->getSize(HsaCode::MemorySegment::READONLY));
+                }
+            }
+
+            for (int i = 0; i < kernels.size(); ++i) {
+                HsaCode *k = kernels[i];
+                // add one for terminating '\0'
+                sizes->string_table_size += k->name().size() + 1;
+                sizes->code_size += k->numInsts() * sizeof(GPUStaticInst*);
+            }
+
+            sizes.copyOut(tc->getMemProxy());
+        }
+        break;
+
+      case HSA_GET_KINFO:
+        // copy out one HsaKernelInfo record per kernel (register counts,
+        // memory sizes, and the offsets computed in the constructor)
+        {
+            TypedBufferArg<HsaKernelInfo>
+                kinfo(buf_addr, sizeof(HsaKernelInfo) * kernels.size());
+
+            for (int i = 0; i < kernels.size(); ++i) {
+                HsaKernelInfo *ki = &kinfo[i];
+                ki->name_offs = kernelInfo[i].name_offs;
+                ki->code_offs = kernelInfo[i].code_offs;
+                ki->sRegCount = kernelInfo[i].sRegCount;
+                ki->dRegCount = kernelInfo[i].dRegCount;
+                ki->cRegCount = kernelInfo[i].cRegCount;
+                ki->static_lds_size = kernelInfo[i].static_lds_size;
+                ki->private_mem_size = kernelInfo[i].private_mem_size;
+                ki->spill_mem_size = kernelInfo[i].spill_mem_size;
+            }
+
+            kinfo.copyOut(tc->getMemProxy());
+        }
+        break;
+
+      case HSA_GET_STRINGS:
+        // copy out all kernel names, back-to-back, NUL-terminated; the
+        // layout matches the name_offs values reported by GET_KINFO
+        {
+            int string_table_size = 0;
+            for (int i = 0; i < kernels.size(); ++i) {
+                HsaCode *k = kernels[i];
+                string_table_size += k->name().size() + 1;
+            }
+
+            BufferArg buf(buf_addr, string_table_size);
+            char *bufp = (char*)buf.bufferPtr();
+
+            for (int i = 0; i < kernels.size(); ++i) {
+                HsaCode *k = kernels[i];
+                const char *n = k->name().c_str();
+
+                // idiomatic string copy
+                while ((*bufp++ = *n++));
+            }
+
+            assert(bufp - (char *)buf.bufferPtr() == string_table_size);
+
+            buf.copyOut(tc->getMemProxy());
+        }
+        break;
+
+      case HSA_GET_READONLY_DATA:
+        {
+            // we can pick any kernel --- they share the same
+            // readonly segment (this assumption is checked in GET_SIZES)
+            uint64_t size =
+                kernels.back()->getSize(HsaCode::MemorySegment::READONLY);
+            BufferArg data(buf_addr, size);
+            char *datap = (char *)data.bufferPtr();
+            memcpy(datap,
+                   kernels.back()->readonly_data,
+                   size);
+            data.copyOut(tc->getMemProxy());
+        }
+        break;
+
+      case HSA_GET_CODE:
+        // copy out all kernels' instructions back-to-back and remember
+        // the guest base address so codeOffToKernelName() can translate
+        // code pointers back into kernel names later
+        {
+            // set hsaCode pointer
+            hsaCode = buf_addr;
+            int code_size = 0;
+
+            for (int i = 0; i < kernels.size(); ++i) {
+                HsaCode *k = kernels[i];
+                code_size += k->numInsts() * sizeof(TheGpuISA::RawMachInst);
+            }
+
+            TypedBufferArg<TheGpuISA::RawMachInst> buf(buf_addr, code_size);
+            TheGpuISA::RawMachInst *bufp = buf;
+
+            int buf_idx = 0;
+
+            for (int i = 0; i < kernels.size(); ++i) {
+                HsaCode *k = kernels[i];
+
+                for (int j = 0; j < k->numInsts(); ++j) {
+                    bufp[buf_idx] = k->insts()->at(j);
+                    ++buf_idx;
+                }
+            }
+
+            buf.copyOut(tc->getMemProxy());
+        }
+        break;
+
+      case HSA_GET_CU_CNT:
+        // single uint32_t: number of compute units in the GPU
+        {
+            BufferArg buf(buf_addr, sizeof(uint32_t));
+            *((uint32_t*)buf.bufferPtr()) = dispatcher->getNumCUs();
+            buf.copyOut(tc->getMemProxy());
+        }
+        break;
+
+      case HSA_GET_VSZ:
+        // single uint32_t: the wavefront (vector) size
+        {
+            BufferArg buf(buf_addr, sizeof(uint32_t));
+            *((uint32_t*)buf.bufferPtr()) = VSZ;
+            buf.copyOut(tc->getMemProxy());
+        }
+        break;
+
+      default:
+        fatal("ClDriver: bad ioctl %d\n", req);
+    }
+
+    return 0;
+}
+
+// Translate a guest code address back into the name of the kernel whose
+// code begins at that address, using the per-kernel code offsets set up
+// in the constructor. Returns nullptr when no kernel starts there.
+// Requires that HSA_GET_CODE has already recorded the code base address.
+const char*
+ClDriver::codeOffToKernelName(uint64_t code_ptr)
+{
+    assert(hsaCode);
+    const uint32_t offset = code_ptr - hsaCode;
+
+    for (int idx = 0; idx < kernels.size(); ++idx) {
+        if (kernelInfo[idx].code_offs != offset)
+            continue;
+
+        return kernels[idx]->name().c_str();
+    }
+
+    return nullptr;
+}
+
+// Standard gem5 params factory: build the SimObject from its params.
+ClDriver*
+ClDriverParams::create()
+{
+    return new ClDriver(this);
+}
diff --git a/src/gpu-compute/cl_driver.hh b/src/gpu-compute/cl_driver.hh
new file mode 100644
index 000000000..03567bab5
--- /dev/null
+++ b/src/gpu-compute/cl_driver.hh
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#ifndef __CL_DRIVER_HH__
+#define __CL_DRIVER_HH__
+
+#include <vector>
+
+#include "gpu-compute/hsa_kernel_info.hh"
+#include "sim/emul_driver.hh"
+
+class GpuDispatcher;
+class HsaCode;
+class LiveProcess;
+class ThreadContext;
+
+struct ClDriverParams;
+
+// Emulated device driver that exposes the simulated GPU's kernels,
+// code, and configuration to the user-level runtime via ioctl calls.
+class ClDriver final : public EmulatedDriver
+{
+  public:
+    ClDriver(ClDriverParams *p);
+    // called once by the dispatcher to wire the two objects together
+    void handshake(GpuDispatcher *_dispatcher);
+    int open(LiveProcess *p, ThreadContext *tc, int mode, int flags);
+    int ioctl(LiveProcess *p, ThreadContext *tc, unsigned req);
+    // reverse-map a guest code address to a kernel name (or nullptr)
+    const char* codeOffToKernelName(uint64_t code_ptr);
+
+  private:
+    GpuDispatcher *dispatcher;
+
+    // names of the code files given in the params (pointers into them)
+    std::vector<const std::string*> codeFiles;
+
+    // All the kernels we know about
+    std::vector<HsaCode*> kernels;
+    std::vector<HsaCode*> functions;
+
+    // per-kernel metadata (register counts, table offsets, mem sizes)
+    std::vector<HsaKernelInfo> kernelInfo;
+
+    // maximum size necessary for function arguments
+    int maxFuncArgsSize;
+    // The host virtual address for the kernel code
+    uint64_t hsaCode;
+};
+
+#endif // __CL_DRIVER_HH__
diff --git a/src/gpu-compute/cl_event.hh b/src/gpu-compute/cl_event.hh
new file mode 100644
index 000000000..75297a2d2
--- /dev/null
+++ b/src/gpu-compute/cl_event.hh
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Marc Orr
+ */
+
+#ifndef __GPU_CL_EVENT_HH__
+#define __GPU_CL_EVENT_HH__
+
+struct HsaQueueEntry;
+
+// Minimal OpenCL-style event object: completion flag plus the task it
+// tracks and start/end timestamps for profiling.
+class _cl_event {
+  public:
+    _cl_event() : done(false), hsaTaskPtr(nullptr), start(0), end(0) { }
+
+    // set when the associated task completes; volatile because it is
+    // polled -- NOTE(review): volatile is not a thread-safety guarantee,
+    // confirm how this flag is written/read.
+    volatile bool done;
+    // the dispatched task this event refers to
+    HsaQueueEntry *hsaTaskPtr;
+    // profiling timestamps (units defined by the writer of these fields)
+    uint64_t start;
+    uint64_t end;
+};
+
+#endif // __GPU_CL_EVENT_HH__
diff --git a/src/gpu-compute/code_enums.hh b/src/gpu-compute/code_enums.hh
new file mode 100644
index 000000000..126cf6c50
--- /dev/null
+++ b/src/gpu-compute/code_enums.hh
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#ifndef __CODE_ENUMS_HH__
+#define __CODE_ENUMS_HH__
+
+// Predicates over the operation-type (OT) enum. The range-based checks
+// below rely on the generated Enums:: values for each memory segment
+// being laid out contiguously from *_READ through *_LDAS.
+#define IS_OT_GLOBAL(a) ((a)>=Enums::OT_GLOBAL_READ \
+    && (a)<=Enums::OT_GLOBAL_LDAS)
+#define IS_OT_SHARED(a) ((a)>=Enums::OT_SHARED_READ \
+    && (a)<=Enums::OT_SHARED_LDAS)
+#define IS_OT_PRIVATE(a) ((a)>=Enums::OT_PRIVATE_READ \
+    && (a)<=Enums::OT_PRIVATE_LDAS)
+#define IS_OT_SPILL(a) ((a)>=Enums::OT_SPILL_READ \
+    && (a)<=Enums::OT_SPILL_LDAS)
+#define IS_OT_READONLY(a) ((a)>=Enums::OT_READONLY_READ \
+    && (a)<=Enums::OT_READONLY_LDAS)
+#define IS_OT_FLAT(a) ((a)>=Enums::OT_FLAT_READ && (a)<=Enums::OT_FLAT_LDAS)
+
+// operation-kind predicates, across all segments
+#define IS_OT_LDAS(a) ((a)==Enums::OT_GLOBAL_LDAS||(a)==Enums::OT_SHARED_LDAS \
+    ||(a)==Enums::OT_PRIVATE_LDAS||(a)==Enums::OT_SPILL_LDAS \
+    ||(a)==Enums::OT_READONLY_LDAS||(a)==Enums::OT_FLAT_LDAS)
+
+#define IS_OT_READ(a) ((a)==Enums::OT_GLOBAL_READ||(a)==Enums::OT_SHARED_READ \
+    ||(a)==Enums::OT_PRIVATE_READ||(a)==Enums::OT_SPILL_READ \
+    ||(a)==Enums::OT_READONLY_READ||(a)==Enums::OT_FLAT_READ)
+
+// reads, split by which memory pipe services them:
+// GM = global memory, LM = local/shared, RM = readonly, PM = private
+#define IS_OT_READ_GM(a) \
+    ((a)==Enums::OT_GLOBAL_READ||(a)==Enums::OT_SPILL_READ \
+    ||(a)==Enums::OT_READONLY_READ)
+
+#define IS_OT_READ_LM(a) ((a)==Enums::OT_SHARED_READ)
+
+#define IS_OT_READ_RM(a) ((a)==Enums::OT_READONLY_READ)
+
+#define IS_OT_READ_PM(a) ((a)==Enums::OT_PRIVATE_READ)
+
+#define IS_OT_WRITE(a) \
+    ((a)==Enums::OT_GLOBAL_WRITE||(a)==Enums::OT_SHARED_WRITE \
+    ||(a)==Enums::OT_PRIVATE_WRITE||(a)==Enums::OT_SPILL_WRITE \
+    ||(a)==Enums::OT_READONLY_WRITE||(a)==Enums::OT_FLAT_WRITE)
+
+#define IS_OT_WRITE_GM(a) \
+    ((a)==Enums::OT_GLOBAL_WRITE||(a)==Enums::OT_SPILL_WRITE \
+    ||(a)==Enums::OT_READONLY_WRITE)
+
+#define IS_OT_WRITE_LM(a) ((a)==Enums::OT_SHARED_WRITE)
+
+#define IS_OT_WRITE_PM(a) ((a)==Enums::OT_PRIVATE_WRITE)
+
+#define IS_OT_ATOMIC(a) ((a)==Enums::OT_GLOBAL_ATOMIC \
+    ||(a)==Enums::OT_SHARED_ATOMIC \
+    ||(a)==Enums::OT_PRIVATE_ATOMIC \
+    ||(a)==Enums::OT_SPILL_ATOMIC \
+    ||(a)==Enums::OT_READONLY_ATOMIC \
+    ||(a)==Enums::OT_FLAT_ATOMIC)
+
+// note: the GM/LM atomic predicates also include memory fences, so
+// fences flow down the same pipes as atomics
+#define IS_OT_ATOMIC_GM(a) ((a)==Enums::OT_GLOBAL_ATOMIC \
+    ||(a)==Enums::OT_SPILL_ATOMIC \
+    ||(a)==Enums::OT_READONLY_ATOMIC \
+    ||(a)==Enums::OT_GLOBAL_MEMFENCE \
+    ||(a)==Enums::OT_BOTH_MEMFENCE)
+
+#define IS_OT_ATOMIC_LM(a) ((a)==Enums::OT_SHARED_ATOMIC \
+    ||(a)==Enums::OT_SHARED_MEMFENCE \
+    ||(a)==Enums::OT_BOTH_MEMFENCE)
+
+#define IS_OT_ATOMIC_PM(a) ((a)==Enums::OT_PRIVATE_ATOMIC)
+
+#define IS_OT_HIST(a) ((a)==Enums::OT_GLOBAL_HIST \
+    ||(a)==Enums::OT_SHARED_HIST \
+    ||(a)==Enums::OT_PRIVATE_HIST \
+    ||(a)==Enums::OT_SPILL_HIST \
+    ||(a)==Enums::OT_READONLY_HIST \
+    ||(a)==Enums::OT_FLAT_HIST)
+
+#define IS_OT_HIST_GM(a) ((a)==Enums::OT_GLOBAL_HIST \
+    ||(a)==Enums::OT_SPILL_HIST \
+    ||(a)==Enums::OT_READONLY_HIST)
+
+#define IS_OT_HIST_LM(a) ((a)==Enums::OT_SHARED_HIST)
+
+#define IS_OT_HIST_PM(a) ((a)==Enums::OT_PRIVATE_HIST)
+
+#endif // __CODE_ENUMS_HH__
diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc
new file mode 100644
index 000000000..d3622007a
--- /dev/null
+++ b/src/gpu-compute/compute_unit.cc
@@ -0,0 +1,1817 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos, Anthony Gutierrez
+ */
+
+#include "gpu-compute/compute_unit.hh"
+
+#include "base/output.hh"
+#include "debug/GPUDisp.hh"
+#include "debug/GPUExec.hh"
+#include "debug/GPUFetch.hh"
+#include "debug/GPUMem.hh"
+#include "debug/GPUPort.hh"
+#include "debug/GPUPrefetch.hh"
+#include "debug/GPUSync.hh"
+#include "debug/GPUTLB.hh"
+#include "gpu-compute/dispatcher.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/gpu_static_inst.hh"
+#include "gpu-compute/ndrange.hh"
+#include "gpu-compute/shader.hh"
+#include "gpu-compute/simple_pool_manager.hh"
+#include "gpu-compute/vector_register_file.hh"
+#include "gpu-compute/wavefront.hh"
+#include "mem/page_table.hh"
+#include "sim/process.hh"
+
+// Build a compute unit from its params: size the per-SIMD wavefront
+// slots and address-history tables, pick the WF execution policy, and
+// wire up ports, register files, and the LDS.
+ComputeUnit::ComputeUnit(const Params *p) : MemObject(p), fetchStage(p),
+    scoreboardCheckStage(p), scheduleStage(p), execStage(p),
+    globalMemoryPipe(p), localMemoryPipe(p), rrNextMemID(0), rrNextALUWp(0),
+    cu_id(p->cu_id), vrf(p->vector_register_file), numSIMDs(p->num_SIMDs),
+    spBypassPipeLength(p->spbypass_pipe_length),
+    dpBypassPipeLength(p->dpbypass_pipe_length),
+    issuePeriod(p->issue_period),
+    numGlbMemUnits(p->num_global_mem_pipes),
+    numLocMemUnits(p->num_shared_mem_pipes),
+    perLaneTLB(p->perLaneTLB), prefetchDepth(p->prefetch_depth),
+    prefetchStride(p->prefetch_stride), prefetchType(p->prefetch_prev_type),
+    xact_cas_mode(p->xactCasMode), debugSegFault(p->debugSegFault),
+    functionalTLB(p->functionalTLB), localMemBarrier(p->localMemBarrier),
+    countPages(p->countPages), barrier_id(0),
+    vrfToCoalescerBusWidth(p->vrf_to_coalescer_bus_width),
+    coalescerToVrfBusWidth(p->coalescer_to_vrf_bus_width),
+    req_tick_latency(p->mem_req_latency * p->clk_domain->clockPeriod()),
+    resp_tick_latency(p->mem_resp_latency * p->clk_domain->clockPeriod()),
+    _masterId(p->system->getMasterId(name() + ".ComputeUnit")),
+    lds(*p->localDataStore), globalSeqNum(0), wavefrontSize(p->wfSize)
+{
+    // this check will be eliminated once we have wavefront size support added
+    fatal_if(p->wfSize != VSZ, "Wavefront size parameter does not match VSZ");
+    // calculate how many cycles a vector load or store will need to transfer
+    // its data over the corresponding buses
+    numCyclesPerStoreTransfer = (uint32_t)ceil((double)(VSZ * sizeof(uint32_t))
+                                / (double)vrfToCoalescerBusWidth);
+
+    // NOTE(review): unlike the store path above, this uses integer
+    // division with no rounding up -- confirm that's intended
+    numCyclesPerLoadTransfer = (VSZ * sizeof(uint32_t))
+                               / coalescerToVrfBusWidth;
+
+    // per-SIMD, per-WF-slot, per-lane last-virtual-address history
+    lastVaddrWF.resize(numSIMDs);
+    wfList.resize(numSIMDs);
+
+    for (int j = 0; j < numSIMDs; ++j) {
+        lastVaddrWF[j].resize(p->n_wf);
+
+        for (int i = 0; i < p->n_wf; ++i) {
+            lastVaddrWF[j][i].resize(VSZ);
+
+            // wavefronts come in as a flat param list; slot them by SIMD
+            wfList[j].push_back(p->wavefronts[j * p->n_wf + i]);
+            wfList[j][i]->setParent(this);
+
+            for (int k = 0; k < VSZ; ++k) {
+                lastVaddrWF[j][i][k] = 0;
+            }
+        }
+    }
+
+    lastVaddrPhase.resize(numSIMDs);
+
+    for (int i = 0; i < numSIMDs; ++i) {
+        lastVaddrPhase[i] = LastVaddrWave();
+    }
+
+    lastVaddrCU = LastVaddrWave();
+
+    lds.setParent(this);
+
+    // select the wavefront scheduling policy from its string param
+    if (p->execPolicy == "OLDEST-FIRST") {
+        exec_policy = EXEC_POLICY::OLDEST;
+    } else if (p->execPolicy == "ROUND-ROBIN") {
+        exec_policy = EXEC_POLICY::RR;
+    } else {
+        fatal("Invalid WF execution policy (CU)\n");
+    }
+
+    // one memory port per lane
+    memPort.resize(VSZ);
+
+    // resize the tlbPort vectorArray
+    int tlbPort_width = perLaneTLB ? VSZ : 1;
+    tlbPort.resize(tlbPort_width);
+
+    // dump end-of-simulation state via an exit callback
+    cuExitCallback = new CUExitCallback(this);
+    registerExitCallback(cuExitCallback);
+
+    xactCasLoadMap.clear();
+    lastExecCycle.resize(numSIMDs, 0);
+
+    for (int i = 0; i < vrf.size(); ++i) {
+        vrf[i]->setParent(this);
+    }
+
+    // assumes all register files are of equal size
+    numVecRegsPerSimd = vrf[0]->numRegs();
+}
+
+// Tear down the CU: free wavefront slots, pipeline bookkeeping lists,
+// the exit callback, and the LDS port.
+ComputeUnit::~ComputeUnit()
+{
+    // Delete wavefront slots
+
+    // NOTE(review): iterates shader->n_wf here, but wfList was sized
+    // from p->n_wf in the constructor -- presumably the same value;
+    // confirm, since a mismatch would over/under-run wfList.
+    for (int j = 0; j < numSIMDs; ++j)
+        for (int i = 0; i < shader->n_wf; ++i) {
+            delete wfList[j][i];
+        }
+
+    readyList.clear();
+    waveStatusList.clear();
+    dispatchList.clear();
+    vectorAluInstAvail.clear();
+    delete cuExitCallback;
+    delete ldsPort;
+}
+
+// Copy kernel-wide (per-NDRange) state from the dispatch queue entry
+// into a wavefront: register-file sizes, workgroup/grid dimensions,
+// kernel args, and per-item private/spill/readonly memory info.
+void
+ComputeUnit::FillKernelState(Wavefront *w, NDRange *ndr)
+{
+    // update the wf's register-file allocation to the kernel's demands
+    w->resizeRegFiles(ndr->q.cRegCount, ndr->q.sRegCount, ndr->q.dRegCount);
+
+    w->workgroupsz[0] = ndr->q.wgSize[0];
+    w->workgroupsz[1] = ndr->q.wgSize[1];
+    w->workgroupsz[2] = ndr->q.wgSize[2];
+    // total work-items per workgroup (product of the three dimensions)
+    w->wg_sz = w->workgroupsz[0] * w->workgroupsz[1] * w->workgroupsz[2];
+    w->gridsz[0] = ndr->q.gdSize[0];
+    w->gridsz[1] = ndr->q.gdSize[1];
+    w->gridsz[2] = ndr->q.gdSize[2];
+    w->kernelArgs = ndr->q.args;
+    w->privSizePerItem = ndr->q.privMemPerItem;
+    w->spillSizePerItem = ndr->q.spillMemPerItem;
+    w->roBase = ndr->q.roMemStart;
+    w->roSize = ndr->q.roMemTotal;
+}
+
+// Initialize the saved context for wavefront number 'cnt' of a
+// workgroup: execution mask, barrier counters, private/spill memory
+// bases (carving the NDRange's pool as a side effect), PC/RPC, and the
+// workgroup's LDS chunk.
+void
+ComputeUnit::InitializeWFContext(WFContext *wfCtx, NDRange *ndr, int cnt,
+                                 int trueWgSize[], int trueWgSizeTotal,
+                                 LdsChunk *ldsChunk, uint64_t origSpillMemStart)
+{
+    wfCtx->cnt = cnt;
+
+    // enable only the lanes that map to real work-items; the last WF of
+    // a workgroup may be partially populated
+    VectorMask init_mask;
+    init_mask.reset();
+
+    for (int k = 0; k < VSZ; ++k) {
+        if (k + cnt * VSZ < trueWgSizeTotal)
+            init_mask[k] = 1;
+    }
+
+    wfCtx->init_mask = init_mask.to_ullong();
+    wfCtx->exec_mask = init_mask.to_ullong();
+
+    // barrier state starts clean for a freshly dispatched WF
+    for (int i = 0; i < VSZ; ++i) {
+        wfCtx->bar_cnt[i] = 0;
+    }
+
+    wfCtx->max_bar_cnt = 0;
+    wfCtx->old_barrier_cnt = 0;
+    wfCtx->barrier_cnt = 0;
+
+    // claim this WF's slice of private and spill memory; note these
+    // bump the NDRange's allocation cursors for the next WF
+    wfCtx->privBase = ndr->q.privMemStart;
+    ndr->q.privMemStart += ndr->q.privMemPerItem * VSZ;
+
+    wfCtx->spillBase = ndr->q.spillMemStart;
+    ndr->q.spillMemStart += ndr->q.spillMemPerItem * VSZ;
+
+    // start at instruction 0; UINT32_MAX marks "no reconvergence PC"
+    wfCtx->pc = 0;
+    wfCtx->rpc = UINT32_MAX;
+
+    // set the wavefront context to have a pointer to this section of the LDS
+    wfCtx->ldsChunk = ldsChunk;
+
+    // WG state
+    wfCtx->wg_id = ndr->globalWgId;
+    wfCtx->barrier_id = barrier_id;
+
+    // Kernel wide state
+    wfCtx->ndr = ndr;
+}
+
+// Apply any pending register-status updates whose timestamps have
+// expired (relative to the shader's tick count), then let each SIMD's
+// vector register file process its own events. timestampVec, regIdxVec,
+// and statusVec are parallel arrays and are kept in lock-step here.
+void
+ComputeUnit::updateEvents() {
+
+    uint32_t idx = 0;
+    while (idx < timestampVec.size()) {
+        if (timestampVec[idx] > shader->tick_cnt) {
+            // not due yet; move on to the next entry
+            ++idx;
+            continue;
+        }
+
+        // entry is due: mark the register, then drop the entry from all
+        // three parallel vectors; idx now names the next element, so it
+        // is deliberately not advanced
+        std::pair<uint32_t, uint32_t> regInfo = regIdxVec[idx];
+        vrf[regInfo.first]->markReg(regInfo.second, sizeof(uint32_t),
+                                    statusVec[idx]);
+        timestampVec.erase(timestampVec.begin() + idx);
+        regIdxVec.erase(regIdxVec.begin() + idx);
+        statusVec.erase(statusVec.begin() + idx);
+    }
+
+    for (int i = 0; i < numSIMDs; ++i) {
+        vrf[i]->updateEvents();
+    }
+}
+
+
+// Launch a wavefront: restore its saved context (masks, barrier state,
+// memory bases, PC), compute per-lane work-item IDs, register it with
+// the LDS, and start it executing.
+void
+ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
+                     int trueWgSizeTotal)
+{
+    // global (function-static) counter of all waves ever launched;
+    // shared across every CU instance
+    static int _n_wave = 0;
+    int cnt = wfCtx->cnt;
+    NDRange *ndr = wfCtx->ndr;
+
+    // Fill in Kernel state
+    FillKernelState(w, ndr);
+
+    w->kern_id = ndr->dispatchId;
+    w->dynwaveid = cnt;
+    w->init_mask = wfCtx->init_mask;
+
+    // map each lane to its 3-D work-item ID within the (clamped)
+    // workgroup, plus the flattened ID
+    for (int k = 0; k < VSZ; ++k) {
+        w->workitemid[0][k] = (k+cnt*VSZ) % trueWgSize[0];
+        w->workitemid[1][k] = ((k + cnt * VSZ) / trueWgSize[0]) % trueWgSize[1];
+        w->workitemid[2][k] = (k + cnt * VSZ) / (trueWgSize[0] * trueWgSize[1]);
+
+        w->workitemFlatId[k] = w->workitemid[2][k] * trueWgSize[0] *
+            trueWgSize[1] + w->workitemid[1][k] * trueWgSize[0] +
+            w->workitemid[0][k];
+    }
+
+    // restore barrier bookkeeping from the saved context
+    w->old_barrier_cnt = wfCtx->old_barrier_cnt;
+    w->barrier_cnt = wfCtx->barrier_cnt;
+    // number of WFs that must reach the barrier (ceil of items / lanes)
+    w->barrier_slots = divCeil(trueWgSizeTotal, VSZ);
+
+    for (int i = 0; i < VSZ; ++i) {
+        w->bar_cnt[i] = wfCtx->bar_cnt[i];
+    }
+
+    w->max_bar_cnt = wfCtx->max_bar_cnt;
+    w->privBase = wfCtx->privBase;
+    w->spillBase = wfCtx->spillBase;
+
+    // seed the reconvergence stack with the saved PC/RPC and exec mask
+    w->pushToReconvergenceStack(wfCtx->pc, wfCtx->rpc, wfCtx->exec_mask);
+
+    // WG state
+    w->wg_id = wfCtx->wg_id;
+    w->dispatchid = wfCtx->ndr->dispatchId;
+    // decompose the flat workgroup ID into 3-D coordinates
+    w->workgroupid[0] = w->wg_id % ndr->numWg[0];
+    w->workgroupid[1] = (w->wg_id / ndr->numWg[0]) % ndr->numWg[1];
+    w->workgroupid[2] = w->wg_id / (ndr->numWg[0] * ndr->numWg[1]);
+
+    w->barrier_id = wfCtx->barrier_id;
+    w->stalledAtBarrier = false;
+
+    // move this from the context into the actual wavefront
+    w->ldsChunk = wfCtx->ldsChunk;
+
+    // tell the LDS another WF of this workgroup is alive, so its chunk
+    // isn't released until all WFs finish
+    int32_t refCount M5_VAR_USED =
+        lds.increaseRefCounter(w->dispatchid, w->wg_id);
+    DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n",
+            cu_id, w->wg_id, refCount);
+
+    // discard any stale fetched instructions from a previous occupant
+    w->instructionBuffer.clear();
+
+    if (w->pendingFetch)
+        w->dropFetch = true;
+
+    // is this the last wavefront in the workgroup
+    // if set the spillWidth to be the remaining work-items
+    // so that the vector access is correct
+    if ((cnt + 1) * VSZ >= trueWgSizeTotal) {
+        w->spillWidth = trueWgSizeTotal - (cnt * VSZ);
+    } else {
+        w->spillWidth = VSZ;
+    }
+
+    DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
+            "WF[%d][%d]\n", _n_wave, barrier_id, cu_id, w->simdId, w->wfSlotId);
+
+    w->start(++_n_wave, ndr->q.code_ptr);
+}
+
+// Dispatch the workgroup described by ndr onto this CU: reserve its LDS
+// space, optionally inject a kernel-begin acquire fence, then carve the WG
+// into VSZ-wide wavefronts and assign each one to a free WF slot.
+void
+ComputeUnit::StartWorkgroup(NDRange *ndr)
+{
+    // reserve the LDS capacity allocated to the work group
+    // disambiguated by the dispatch ID and workgroup ID, which should be
+    // globally unique
+    LdsChunk *ldsChunk = lds.reserveSpace(ndr->dispatchId, ndr->globalWgId,
+                                          ndr->q.ldsSize);
+
+    // Send L1 cache acquire
+    // isKernel + isAcquire = Kernel Begin
+    if (shader->impl_kern_boundary_sync) {
+        GPUDynInstPtr gpuDynInst = std::make_shared<GPUDynInst>(nullptr,
+                                                                nullptr,
+                                                                nullptr, 0);
+
+        gpuDynInst->useContinuation = false;
+        gpuDynInst->memoryOrder = Enums::MEMORY_ORDER_SC_ACQUIRE;
+        gpuDynInst->scope = Enums::MEMORY_SCOPE_SYSTEM;
+        injectGlobalMemFence(gpuDynInst, true);
+    }
+
+    // Get true size of workgroup (after clamping to grid size)
+    int trueWgSize[3];
+    int trueWgSizeTotal = 1;
+
+    for (int d = 0; d < 3; ++d) {
+        // the last WG in each dimension may be partial
+        trueWgSize[d] = std::min(ndr->q.wgSize[d], ndr->q.gdSize[d] -
+                                 ndr->wgId[d] * ndr->q.wgSize[d]);
+
+        trueWgSizeTotal *= trueWgSize[d];
+    }
+
+    uint64_t origSpillMemStart = ndr->q.spillMemStart;
+    // calculate the number of 32-bit vector registers required by wavefront
+    int vregDemand = ndr->q.sRegCount + (2 * ndr->q.dRegCount);
+    int cnt = 0;    // number of WFs assigned so far
+
+    // Assign WFs by spreading them across SIMDs, 1 WF per SIMD at a time
+    for (int m = 0; m < shader->n_wf * numSIMDs; ++m) {
+        Wavefront *w = wfList[m % numSIMDs][m / numSIMDs];
+        // Check if this wavefront slot is available:
+        // It must be stopped and not waiting
+        // for a release to complete S_RETURNING
+        if (w->status == Wavefront::S_STOPPED) {
+            // if we have scheduled all work items then stop
+            // scheduling wavefronts
+            if (cnt * VSZ >= trueWgSizeTotal)
+                break;
+
+            // reserve vector registers for the scheduled wavefront
+            assert(vectorRegsReserved[m % numSIMDs] <= numVecRegsPerSimd);
+            uint32_t normSize = 0;
+
+            // allocate a contiguous VGPR region; normSize receives the
+            // actual (possibly rounded-up) region size
+            w->startVgprIndex = vrf[m % numSIMDs]->manager->
+                allocateRegion(vregDemand, &normSize);
+
+            w->reservedVectorRegs = normSize;
+            vectorRegsReserved[m % numSIMDs] += w->reservedVectorRegs;
+
+            WFContext wfCtx;
+
+            InitializeWFContext(&wfCtx, ndr, cnt, trueWgSize, trueWgSizeTotal,
+                                ldsChunk, origSpillMemStart);
+
+            StartWF(w, &wfCtx, trueWgSize, trueWgSizeTotal);
+            ++cnt;
+        }
+    }
+    // give the next workgroup a fresh barrier id
+    ++barrier_id;
+}
+
+// Check whether the workgroup described by ndr can be dispatched to this
+// CU right now.  Returns non-zero iff (a) there are enough free WF slots,
+// (b) each SIMD that would host one of the WG's wavefronts has enough free
+// VGPRs, and (c) there is enough free LDS space.
+int
+ComputeUnit::ReadyWorkgroup(NDRange *ndr)
+{
+    // Get true size of workgroup (after clamping to grid size)
+    int trueWgSize[3];
+    int trueWgSizeTotal = 1;
+
+    for (int d = 0; d < 3; ++d) {
+        trueWgSize[d] = std::min(ndr->q.wgSize[d], ndr->q.gdSize[d] -
+                                 ndr->wgId[d] * ndr->q.wgSize[d]);
+
+        trueWgSizeTotal *= trueWgSize[d];
+        DPRINTF(GPUDisp, "trueWgSize[%d] = %d\n", d, trueWgSize[d]);
+    }
+
+    DPRINTF(GPUDisp, "trueWgSizeTotal = %d\n", trueWgSizeTotal);
+
+    // calculate the number of 32-bit vector registers required by each
+    // work item of the work group
+    int vregDemandPerWI = ndr->q.sRegCount + (2 * ndr->q.dRegCount);
+    bool vregAvail = true;
+    // number of VSZ-wide wavefronts needed to cover the workgroup
+    int numWfs = (trueWgSizeTotal + VSZ - 1) / VSZ;
+    int freeWfSlots = 0;
+    // check if the total number of VGPRs required by all WFs of the WG
+    // fit in the VRFs of all SIMD units
+    assert((numWfs * vregDemandPerWI) <= (numSIMDs * numVecRegsPerSimd));
+    int numMappedWfs = 0;
+    std::vector<int> numWfsPerSimd;
+    numWfsPerSimd.resize(numSIMDs, 0);
+    // find how many free WF slots we have across all SIMDs
+    for (int j = 0; j < shader->n_wf; ++j) {
+        for (int i = 0; i < numSIMDs; ++i) {
+            if (wfList[i][j]->status == Wavefront::S_STOPPED) {
+                // count the number of free WF slots
+                ++freeWfSlots;
+                if (numMappedWfs < numWfs) {
+                    // count the WFs to be assigned per SIMD
+                    numWfsPerSimd[i]++;
+                }
+                numMappedWfs++;
+            }
+        }
+    }
+
+    // if there are enough free WF slots then find if there are enough
+    // free VGPRs per SIMD based on the WF->SIMD mapping
+    if (freeWfSlots >= numWfs) {
+        for (int j = 0; j < numSIMDs; ++j) {
+            // find if there are enough free VGPR regions in the SIMD's VRF
+            // to accommodate the WFs of the new WG that would be mapped to
+            // this SIMD unit
+            vregAvail = vrf[j]->manager->canAllocate(numWfsPerSimd[j],
+                                                     vregDemandPerWI);
+
+            // stop searching if there is at least one SIMD
+            // whose VRF does not have enough free VGPR pools.
+            // This is because a WG is scheduled only if ALL
+            // of its WFs can be scheduled
+            if (!vregAvail)
+                break;
+        }
+    }
+
+    DPRINTF(GPUDisp, "Free WF slots = %d, VGPR Availability = %d\n",
+            freeWfSlots, vregAvail);
+
+    if (!vregAvail) {
+        ++numTimesWgBlockedDueVgprAlloc;
+    }
+
+    // track workgroups that are blocked for lack of LDS space
+    if (!lds.canReserve(ndr->q.ldsSize)) {
+        wgBlockedDueLdsAllocation++;
+    }
+
+    // Return true if (a) there are enough free WF slots to submit the
+    // workgroup and (b) if there are enough VGPRs to schedule all WFs to
+    // their SIMD units and (c) if there is enough space in LDS
+    return freeWfSlots >= numWfs && vregAvail && lds.canReserve(ndr->q.ldsSize);
+}
+
+// Count the running wavefronts that have reached barrier _barrier_id with
+// barrier count bcnt and have no outstanding memory requests, and report
+// whether that count fills all bslots barrier slots.
+int
+ComputeUnit::AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots)
+{
+    DPRINTF(GPUSync, "CU%d: Checking for All At Barrier\n", cu_id);
+    int ccnt = 0;
+
+    for (int sid = 0; sid < numSIMDs; ++sid) {
+        for (int slot = 0; slot < shader->n_wf; ++slot) {
+            Wavefront *wf = wfList[sid][slot];
+            bool running = (wf->status == Wavefront::S_RUNNING);
+
+            if (running) {
+                DPRINTF(GPUSync, "Checking WF[%d][%d]\n", sid, slot);
+
+                DPRINTF(GPUSync, "wf->barrier_id = %d, _barrier_id = %d\n",
+                        wf->barrier_id, _barrier_id);
+
+                DPRINTF(GPUSync, "wf->barrier_cnt %d, bcnt = %d\n",
+                        wf->barrier_cnt, bcnt);
+            }
+
+            bool at_barrier = running && wf->barrier_id == _barrier_id &&
+                wf->barrier_cnt == bcnt && !wf->outstanding_reqs;
+
+            if (at_barrier) {
+                ++ccnt;
+
+                DPRINTF(GPUSync, "WF[%d][%d] at barrier, increment ccnt to "
+                        "%d\n", sid, slot, ccnt);
+            }
+        }
+    }
+
+    DPRINTF(GPUSync, "CU%d: returning allAtBarrier ccnt = %d, bslots = %d\n",
+            cu_id, ccnt, bslots);
+
+    return ccnt == bslots;
+}
+
+// Check if the current wavefront is blocked on additional resources.
+// Returns true iff xact-cas mode is enabled and the wavefront in slot
+// (simdId, wfSlotId) is queued behind a different wavefront in some
+// per-address xact_cas_ld queue.
+bool
+ComputeUnit::cedeSIMD(int simdId, int wfSlotId)
+{
+    bool cede = false;
+
+    // If --xact-cas-mode option is enabled in run.py, then xact_cas_ld
+    // magic instructions will impact the scheduling of wavefronts
+    if (xact_cas_mode) {
+        /*
+         * When a wavefront calls xact_cas_ld, it adds itself to a per address
+         * queue. All per address queues are managed by the xactCasLoadMap.
+         *
+         * A wavefront is not blocked if: it is not in ANY per address queue or
+         * if it is at the head of a per address queue.
+         */
+        // Iterate by const reference: the original code copied each map
+        // entry by value, which duplicated the entire waveIDQueue list on
+        // every iteration of this (per-cycle) scheduling check.
+        for (const auto &itMap : xactCasLoadMap) {
+            const std::list<waveIdentifier> &curWaveIDQueue =
+                itMap.second.waveIDQueue;
+
+            if (!curWaveIDQueue.empty()) {
+                for (const waveIdentifier &cur_wave : curWaveIDQueue) {
+                    if (cur_wave.simdId == simdId &&
+                        cur_wave.wfSlotId == wfSlotId) {
+                        // 2 possibilities
+                        // 1: this WF has a green light
+                        // 2: another WF has a green light
+                        const waveIdentifier &owner_wave =
+                            curWaveIDQueue.front();
+
+                        if (owner_wave.simdId != cur_wave.simdId ||
+                            owner_wave.wfSlotId != cur_wave.wfSlotId) {
+                            // possibility 2
+                            cede = true;
+                            break;
+                        } else {
+                            // possibility 1
+                            break;
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    return cede;
+}
+
+// Execute one clock worth of work on the ComputeUnit.
+void
+ComputeUnit::exec()
+{
+    updateEvents();
+    // Execute pipeline stages in reverse order to simulate
+    // the pipeline latency: each stage consumes what the earlier
+    // (later-executed) stage produced on a previous cycle.
+    globalMemoryPipe.exec();
+    localMemoryPipe.exec();
+    execStage.exec();
+    scheduleStage.exec();
+    scoreboardCheckStage.exec();
+    fetchStage.exec();
+
+    totalCycles++;
+}
+
+// One-time initialization of the CU's buses, wait-classes, per-SIMD
+// bookkeeping, pipeline stages, and statistics state.  Called after
+// construction, before simulation starts.
+void
+ComputeUnit::init()
+{
+    // Initialize CU Bus models
+    glbMemToVrfBus.init(&shader->tick_cnt, 1);
+    locMemToVrfBus.init(&shader->tick_cnt, 1);
+    nextGlbMemBus = 0;
+    nextLocMemBus = 0;
+    fatal_if(numGlbMemUnits > 1,
+             "No support for multiple Global Memory Pipelines exists!!!");
+    vrfToGlobalMemPipeBus.resize(numGlbMemUnits);
+    for (int j = 0; j < numGlbMemUnits; ++j) {
+        vrfToGlobalMemPipeBus[j] = WaitClass();
+        vrfToGlobalMemPipeBus[j].init(&shader->tick_cnt, 1);
+    }
+
+    fatal_if(numLocMemUnits > 1,
+             "No support for multiple Local Memory Pipelines exists!!!");
+    vrfToLocalMemPipeBus.resize(numLocMemUnits);
+    for (int j = 0; j < numLocMemUnits; ++j) {
+        vrfToLocalMemPipeBus[j] = WaitClass();
+        vrfToLocalMemPipeBus[j].init(&shader->tick_cnt, 1);
+    }
+    vectorRegsReserved.resize(numSIMDs, 0);
+    aluPipe.resize(numSIMDs);
+    // one wait-class per execution resource (SIMDs + mem pipelines)
+    wfWait.resize(numSIMDs + numLocMemUnits + numGlbMemUnits);
+
+    for (int i = 0; i < numSIMDs + numLocMemUnits + numGlbMemUnits; ++i) {
+        wfWait[i] = WaitClass();
+        wfWait[i].init(&shader->tick_cnt, 1);
+    }
+
+    for (int i = 0; i < numSIMDs; ++i) {
+        aluPipe[i] = WaitClass();
+        aluPipe[i].init(&shader->tick_cnt, 1);
+    }
+
+    // Setup space for call args
+    for (int j = 0; j < numSIMDs; ++j) {
+        for (int i = 0; i < shader->n_wf; ++i) {
+            wfList[j][i]->initCallArgMem(shader->funcargs_size);
+        }
+    }
+
+    // Initializing pipeline resources
+    readyList.resize(numSIMDs + numGlbMemUnits + numLocMemUnits);
+    waveStatusList.resize(numSIMDs);
+
+    // all WF slots start out BLOCKED until a workgroup is dispatched
+    for (int j = 0; j < numSIMDs; ++j) {
+        for (int i = 0; i < shader->n_wf; ++i) {
+            waveStatusList[j].push_back(
+                std::make_pair(wfList[j][i], BLOCKED));
+        }
+    }
+
+    // one dispatch slot per execution resource, initially EMPTY
+    for (int j = 0; j < (numSIMDs + numGlbMemUnits + numLocMemUnits); ++j) {
+        dispatchList.push_back(std::make_pair((Wavefront*)nullptr, EMPTY));
+    }
+
+    fetchStage.init(this);
+    scoreboardCheckStage.init(this);
+    scheduleStage.init(this);
+    execStage.init(this);
+    globalMemoryPipe.init(this);
+    localMemoryPipe.init(this);
+    // initialize state for statistics calculation
+    vectorAluInstAvail.resize(numSIMDs, false);
+    shrMemInstAvail = 0;
+    glbMemInstAvail = 0;
+}
+
+// Handle a timing response from Ruby on the data port.  Kernel-boundary
+// fence responses (kernel-end release / kernel-begin acquire) are consumed
+// here directly; all other responses are deferred via a MemRespEvent to
+// model the response pipeline latency.  Always returns true (the response
+// is always accepted).
+bool
+ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt)
+{
+    // Ruby has completed the memory op. Schedule the mem_resp_event at the
+    // appropriate cycle to process the timing memory response
+    // This delay represents the pipeline delay
+    SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
+    int index = sender_state->port_index;
+    GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
+
+    // Is the packet returned a Kernel End or Barrier
+    if (pkt->req->isKernel() && pkt->req->isRelease()) {
+        Wavefront *w =
+            computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId];
+
+        // Check if we are waiting on Kernel End Release
+        if (w->status == Wavefront::S_RETURNING) {
+            DPRINTF(GPUDisp, "CU%d: WF[%d][%d][wv=%d]: WG id completed %d\n",
+                    computeUnit->cu_id, w->simdId, w->wfSlotId,
+                    w->wfDynId, w->kern_id);
+
+            // tell the dispatcher this WF's workgroup work is done
+            computeUnit->shader->dispatcher->notifyWgCompl(w);
+            w->status = Wavefront::S_STOPPED;
+        } else {
+            w->outstanding_reqs--;
+        }
+
+        DPRINTF(GPUSync, "CU%d: WF[%d][%d]: barrier_cnt = %d\n",
+                computeUnit->cu_id, gpuDynInst->simdId,
+                gpuDynInst->wfSlotId, w->barrier_cnt);
+
+        // run any follow-on work attached to this fence
+        if (gpuDynInst->useContinuation) {
+            assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE);
+            gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
+                                         gpuDynInst);
+        }
+
+        delete pkt->senderState;
+        delete pkt->req;
+        delete pkt;
+        return true;
+    } else if (pkt->req->isKernel() && pkt->req->isAcquire()) {
+        // kernel-begin acquire: nothing to update, just continue if needed
+        if (gpuDynInst->useContinuation) {
+            assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE);
+            gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
+                                         gpuDynInst);
+        }
+
+        delete pkt->senderState;
+        delete pkt->req;
+        delete pkt;
+        return true;
+    }
+
+    // ordinary response: model the response pipeline latency by scheduling
+    // the actual processing resp_tick_latency ticks from now
+    ComputeUnit::DataPort::MemRespEvent *mem_resp_event =
+        new ComputeUnit::DataPort::MemRespEvent(computeUnit->memPort[index],
+                                                pkt);
+
+    DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x received!\n",
+            computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
+            index, pkt->req->getPaddr());
+
+    computeUnit->schedule(mem_resp_event,
+                          curTick() + computeUnit->resp_tick_latency);
+    return true;
+}
+
+// Ruby signalled that resources have freed up: replay the queued data-port
+// packets in order until one is rejected again.
+void
+ComputeUnit::DataPort::recvReqRetry()
+{
+    int n_pending = retries.size();
+
+    assert(n_pending > 0);
+
+    while (n_pending-- > 0) {
+        PacketPtr pkt = retries.front().first;
+        GPUDynInstPtr gpuDynInst M5_VAR_USED = retries.front().second;
+        DPRINTF(GPUMem, "CU%d: WF[%d][%d]: retry mem inst addr %#x\n",
+                computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
+                pkt->req->getPaddr());
+
+        /** Currently Ruby can return false due to conflicts for the particular
+         * cache block or address. Thus other requests should be allowed to
+         * pass and the data port should expect multiple retries. */
+        if (sendTimingReq(pkt)) {
+            DPRINTF(GPUMem, "successful!\n");
+            retries.pop_front();
+        } else {
+            DPRINTF(GPUMem, "failed again!\n");
+            break;
+        }
+    }
+}
+
+// Instruction-fetch response: hand the packet straight to the fetch stage.
+// Always accepts the response.
+bool
+ComputeUnit::SQCPort::recvTimingResp(PacketPtr pkt)
+{
+    computeUnit->fetchStage.processFetchReturn(pkt);
+    return true;
+}
+
+// The SQC (instruction cache) freed resources: replay queued fetch
+// packets in order until one is rejected again.
+void
+ComputeUnit::SQCPort::recvReqRetry()
+{
+    int n_pending = retries.size();
+
+    assert(n_pending > 0);
+
+    while (n_pending-- > 0) {
+        PacketPtr pkt = retries.front().first;
+        Wavefront *wavefront M5_VAR_USED = retries.front().second;
+        DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: retrying FETCH addr %#x\n",
+                computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
+                pkt->req->getPaddr());
+        if (sendTimingReq(pkt)) {
+            DPRINTF(GPUFetch, "successful!\n");
+            retries.pop_front();
+        } else {
+            DPRINTF(GPUFetch, "failed again!\n");
+            break;
+        }
+    }
+}
+
+// Issue a data memory request for one lane (index) of gpuDynInst.  The
+// packet first goes through address translation; in timing mode the
+// translated request is then scheduled onto the data port, while in
+// non-timing (atomic/functional) mode translation and the memory access
+// are both performed functionally right here.
+void
+ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt)
+{
+    // There must be a way around this check to do the globalMemStart...
+    Addr tmp_vaddr = pkt->req->getVaddr();
+
+    updatePageDivergenceDist(tmp_vaddr);
+
+    // rebuild the request's virtual-address fields before translation
+    pkt->req->setVirt(pkt->req->getAsid(), tmp_vaddr, pkt->req->getSize(),
+                      pkt->req->getFlags(), pkt->req->masterId(),
+                      pkt->req->getPC());
+
+    // figure out the type of the request to set read/write
+    BaseTLB::Mode TLB_mode;
+    assert(pkt->isRead() || pkt->isWrite());
+
+    // Check write before read for atomic operations
+    // since atomic operations should use BaseTLB::Write
+    if (pkt->isWrite()){
+        TLB_mode = BaseTLB::Write;
+    } else if (pkt->isRead()) {
+        TLB_mode = BaseTLB::Read;
+    } else {
+        fatal("pkt is not a read nor a write\n");
+    }
+
+    // tlbCycles accumulates (completion - start); subtract the start now,
+    // the response path adds the completion tick back in
+    tlbCycles -= curTick();
+    ++tlbRequests;
+
+    int tlbPort_index = perLaneTLB ? index : 0;
+
+    if (shader->timingSim) {
+        if (debugSegFault) {
+            // optional debugging: functionally pre-validate the access
+            // against the process page table before sending it out
+            Process *p = shader->gpuTc->getProcessPtr();
+            Addr vaddr = pkt->req->getVaddr();
+            unsigned size = pkt->getSize();
+
+            // reject accesses that straddle a 64-byte boundary
+            if ((vaddr + size - 1) % 64 < vaddr % 64) {
+                panic("CU%d: WF[%d][%d]: Access to addr %#x is unaligned!\n",
+                      cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, vaddr);
+            }
+
+            Addr paddr;
+
+            if (!p->pTable->translate(vaddr, paddr)) {
+                if (!p->fixupStackFault(vaddr)) {
+                    panic("CU%d: WF[%d][%d]: Fault on addr %#x!\n",
+                          cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
+                          vaddr);
+                }
+            }
+        }
+
+        // This is the SenderState needed upon return
+        pkt->senderState = new DTLBPort::SenderState(gpuDynInst, index);
+
+        // This is the senderState needed by the TLB hierarchy to function
+        TheISA::GpuTLB::TranslationState *translation_state =
+            new TheISA::GpuTLB::TranslationState(TLB_mode, shader->gpuTc, false,
+                                                 pkt->senderState);
+
+        pkt->senderState = translation_state;
+
+        if (functionalTLB) {
+            // translate immediately (zero-latency TLB), then schedule the
+            // memory request in timing mode
+            tlbPort[tlbPort_index]->sendFunctional(pkt);
+
+            // update the hitLevel distribution
+            int hit_level = translation_state->hitLevel;
+            assert(hit_level != -1);
+            hitsPerTLBLevel[hit_level]++;
+
+            // New SenderState for the memory access
+            X86ISA::GpuTLB::TranslationState *sender_state =
+                safe_cast<X86ISA::GpuTLB::TranslationState*>(pkt->senderState);
+
+            delete sender_state->tlbEntry;
+            delete sender_state->saved;
+            delete sender_state;
+
+            assert(pkt->req->hasPaddr());
+            assert(pkt->req->hasSize());
+
+            uint8_t *tmpData = pkt->getPtr<uint8_t>();
+
+            // this is necessary because the GPU TLB receives packets instead
+            // of requests. when the translation is complete, all relevent
+            // fields in the request will be populated, but not in the packet.
+            // here we create the new packet so we can set the size, addr,
+            // and proper flags.
+            PacketPtr oldPkt = pkt;
+            pkt = new Packet(oldPkt->req, oldPkt->cmd);
+            delete oldPkt;
+            pkt->dataStatic(tmpData);
+
+
+            // New SenderState for the memory access
+            pkt->senderState = new ComputeUnit::DataPort::SenderState(gpuDynInst,
+                                                                      index, nullptr);
+
+            gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index);
+            gpuDynInst->tlbHitLevel[index] = hit_level;
+
+
+            // translation is done. Schedule the mem_req_event at the
+            // appropriate cycle to send the timing memory request to ruby
+            ComputeUnit::DataPort::MemReqEvent *mem_req_event =
+                new ComputeUnit::DataPort::MemReqEvent(memPort[index], pkt);
+
+            DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data "
+                    "scheduled\n", cu_id, gpuDynInst->simdId,
+                    gpuDynInst->wfSlotId, index, pkt->req->getPaddr());
+
+            schedule(mem_req_event, curTick() + req_tick_latency);
+        } else if (tlbPort[tlbPort_index]->isStalled()) {
+            // TLB port is already stalled; queue behind earlier retries
+            assert(tlbPort[tlbPort_index]->retries.size() > 0);
+
+            DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
+                    "failed!\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
+                    tmp_vaddr);
+
+            tlbPort[tlbPort_index]->retries.push_back(pkt);
+        } else if (!tlbPort[tlbPort_index]->sendTimingReq(pkt)) {
+            // Stall the data port;
+            // No more packet will be issued till
+            // ruby indicates resources are freed by
+            // a recvReqRetry() call back on this port.
+            tlbPort[tlbPort_index]->stallPort();
+
+            DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
+                    "failed!\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
+                    tmp_vaddr);
+
+            tlbPort[tlbPort_index]->retries.push_back(pkt);
+        } else {
+            DPRINTF(GPUTLB,
+                    "CU%d: WF[%d][%d]: Translation for addr %#x sent!\n",
+                    cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, tmp_vaddr);
+        }
+    } else {
+        // non-timing path: do everything functionally, right now
+        if (pkt->cmd == MemCmd::MemFenceReq) {
+            gpuDynInst->statusBitVector = VectorMask(0);
+        } else {
+            // mark this lane's request as complete
+            gpuDynInst->statusBitVector &= (~(1ll << index));
+        }
+
+        // New SenderState for the memory access
+        delete pkt->senderState;
+
+        // Because it's atomic operation, only need TLB translation state
+        pkt->senderState = new TheISA::GpuTLB::TranslationState(TLB_mode,
+                                                                shader->gpuTc);
+
+        tlbPort[tlbPort_index]->sendFunctional(pkt);
+
+        // the addr of the packet is not modified, so we need to create a new
+        // packet, or otherwise the memory access will have the old virtual
+        // address sent in the translation packet, instead of the physical
+        // address returned by the translation.
+        PacketPtr new_pkt = new Packet(pkt->req, pkt->cmd);
+        new_pkt->dataStatic(pkt->getPtr<uint8_t>());
+
+        // Translation is done. It is safe to send the packet to memory.
+        memPort[0]->sendFunctional(new_pkt);
+
+        DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index %d: addr %#x\n", cu_id,
+                gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
+                new_pkt->req->getPaddr());
+
+        // safe_cast the senderState
+        TheISA::GpuTLB::TranslationState *sender_state =
+            safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
+
+        delete sender_state->tlbEntry;
+        delete new_pkt;
+        delete pkt->senderState;
+        delete pkt->req;
+        delete pkt;
+    }
+}
+
+// Schedule a synchronization (fence) packet for issue on the data port
+// after the request pipeline latency.
+void
+ComputeUnit::sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt)
+{
+    // attach the sender state the response path expects
+    pkt->senderState = new ComputeUnit::DataPort::SenderState(gpuDynInst, index,
+                                                              nullptr);
+
+    ComputeUnit::DataPort::MemReqEvent *mem_req_event =
+        new ComputeUnit::DataPort::MemReqEvent(memPort[index], pkt);
+
+    DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n",
+            cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
+            pkt->req->getPaddr());
+
+    schedule(mem_req_event, curTick() + req_tick_latency);
+}
+
+// Build and send a global memory fence packet for gpuDynInst.  If req is
+// null a fresh request is allocated; kernelLaunch marks the request with
+// the KERNEL flag so the memory system treats it as a kernel-boundary
+// acquire/release.
+void
+ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst, bool kernelLaunch,
+                                  Request* req)
+{
+    if (!req) {
+        req = new Request(0, 0, 0, 0, masterId(), 0, gpuDynInst->wfDynId, -1);
+    }
+    req->setPaddr(0);
+    if (kernelLaunch) {
+        req->setFlags(Request::KERNEL);
+    }
+
+    gpuDynInst->s_type = SEG_GLOBAL;
+
+    // for non-kernel MemFence operations, memorder flags are set depending
+    // on which type of request is currently being sent, so this
+    // should be set by the caller (e.g. if an inst has acq-rel
+    // semantics, it will send one acquire req an one release req)
+    gpuDynInst->setRequestFlags(req, kernelLaunch);
+
+    // a mem fence must correspond to an acquire/release request
+    assert(req->isAcquire() || req->isRelease());
+
+    // create packet
+    PacketPtr pkt = new Packet(req, MemCmd::MemFenceReq);
+
+    // set packet's sender state
+    pkt->senderState =
+        new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr);
+
+    // send the packet
+    sendSyncRequest(gpuDynInst, 0, pkt);
+}
+
+// Human-readable event name used by event-queue debug output.
+const char*
+ComputeUnit::DataPort::MemRespEvent::description() const
+{
+    return "ComputeUnit memory response event";
+}
+
+// Process a delayed memory response: update the per-lane status
+// bookkeeping on the owning GPUDynInst and, once every lane's response
+// has arrived, push the instruction into the appropriate global memory
+// pipeline response FIFO and run any attached continuation.
+void
+ComputeUnit::DataPort::MemRespEvent::process()
+{
+    DataPort::SenderState *sender_state =
+        safe_cast<DataPort::SenderState*>(pkt->senderState);
+
+    GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
+    ComputeUnit *compute_unit = dataPort->computeUnit;
+
+    assert(gpuDynInst);
+
+    DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Response for addr %#x, index %d\n",
+            compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
+            pkt->req->getPaddr(), dataPort->index);
+
+    Addr paddr = pkt->req->getPaddr();
+
+    if (pkt->cmd != MemCmd::MemFenceResp) {
+        // recover the lane index this response belongs to
+        int index = gpuDynInst->memStatusVector[paddr].back();
+
+        DPRINTF(GPUMem, "Response for addr %#x, index %d\n",
+                pkt->req->getPaddr(), index);
+
+        gpuDynInst->memStatusVector[paddr].pop_back();
+        gpuDynInst->pAddr = pkt->req->getPaddr();
+
+        if (pkt->isRead() || pkt->isWrite()) {
+
+            // small accesses track completion with a single bit per lane;
+            // larger ones also keep a per-lane outstanding-request count
+            if (gpuDynInst->n_reg <= MAX_REGS_FOR_NON_VEC_MEM_INST) {
+                gpuDynInst->statusBitVector &= (~(1ULL << index));
+            } else {
+                assert(gpuDynInst->statusVector[index] > 0);
+                gpuDynInst->statusVector[index]--;
+
+                if (!gpuDynInst->statusVector[index])
+                    gpuDynInst->statusBitVector &= (~(1ULL << index));
+            }
+
+            DPRINTF(GPUMem, "bitvector is now %#x\n",
+                    gpuDynInst->statusBitVector);
+
+            // all lanes done -> the whole instruction is complete
+            if (gpuDynInst->statusBitVector == VectorMask(0)) {
+                auto iter = gpuDynInst->memStatusVector.begin();
+                auto end = gpuDynInst->memStatusVector.end();
+
+                // sanity check: no lane should still be pending
+                while (iter != end) {
+                    assert(iter->second.empty());
+                    ++iter;
+                }
+
+                gpuDynInst->memStatusVector.clear();
+
+                if (gpuDynInst->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST)
+                    gpuDynInst->statusVector.clear();
+
+                // route loads/atomics to the load-response FIFO and
+                // stores to the store-response FIFO
+                if (gpuDynInst->m_op == Enums::MO_LD || MO_A(gpuDynInst->m_op)
+                    || MO_ANR(gpuDynInst->m_op)) {
+                    assert(compute_unit->globalMemoryPipe.isGMLdRespFIFOWrRdy());
+
+                    compute_unit->globalMemoryPipe.getGMLdRespFIFO()
+                        .push(gpuDynInst);
+                } else {
+                    assert(compute_unit->globalMemoryPipe.isGMStRespFIFOWrRdy());
+
+                    compute_unit->globalMemoryPipe.getGMStRespFIFO()
+                        .push(gpuDynInst);
+                }
+
+                DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n",
+                        compute_unit->cu_id, gpuDynInst->simdId,
+                        gpuDynInst->wfSlotId);
+
+                // after clearing the status vectors,
+                // see if there is a continuation to perform
+                // the continuation may generate more work for
+                // this memory request
+                if (gpuDynInst->useContinuation) {
+                    assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE);
+                    gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
+                                                 gpuDynInst);
+                }
+            }
+        }
+    } else {
+        // fence response: the whole fence completes at once
+        gpuDynInst->statusBitVector = VectorMask(0);
+
+        if (gpuDynInst->useContinuation) {
+            assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE);
+            gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
+                                         gpuDynInst);
+        }
+    }
+
+    delete pkt->senderState;
+    delete pkt->req;
+    delete pkt;
+}
+
+// Factory hook used by the generated params machinery to construct a
+// ComputeUnit from its Python-side configuration object.
+ComputeUnit*
+ComputeUnitParams::create()
+{
+    return new ComputeUnit(this);
+}
+
+// Handle a completed data-TLB translation: record TLB statistics, issue
+// optional functional prefetches for subsequent pages, then convert the
+// translated response back into a memory request and schedule it onto
+// the data port.  Always returns true.
+bool
+ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt)
+{
+    Addr line = pkt->req->getPaddr();
+
+    DPRINTF(GPUTLB, "CU%d: DTLBPort received %#x->%#x\n", computeUnit->cu_id,
+            pkt->req->getVaddr(), line);
+
+    assert(pkt->senderState);
+    // tlbCycles had the start tick subtracted when the request was sent;
+    // adding the completion tick accumulates the translation latency
+    computeUnit->tlbCycles += curTick();
+
+    // pop off the TLB translation state
+    TheISA::GpuTLB::TranslationState *translation_state =
+        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
+
+    // no PageFaults are permitted for data accesses
+    if (!translation_state->tlbEntry->valid) {
+        DTLBPort::SenderState *sender_state =
+            safe_cast<DTLBPort::SenderState*>(translation_state->saved);
+
+        Wavefront *w M5_VAR_USED =
+            computeUnit->wfList[sender_state->_gpuDynInst->simdId]
+            [sender_state->_gpuDynInst->wfSlotId];
+
+        DPRINTFN("Wave %d couldn't tranlate vaddr %#x\n", w->wfDynId,
+                 pkt->req->getVaddr());
+    }
+
+    assert(translation_state->tlbEntry->valid);
+
+    // update the hitLevel distribution
+    int hit_level = translation_state->hitLevel;
+    computeUnit->hitsPerTLBLevel[hit_level]++;
+
+    delete translation_state->tlbEntry;
+    assert(!translation_state->ports.size());
+    pkt->senderState = translation_state->saved;
+
+    // for prefetch pkt
+    BaseTLB::Mode TLB_mode = translation_state->tlbMode;
+
+    delete translation_state;
+
+    // use the original sender state to know how to close this transaction
+    DTLBPort::SenderState *sender_state =
+        safe_cast<DTLBPort::SenderState*>(pkt->senderState);
+
+    GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
+    int mp_index = sender_state->portIndex;
+    Addr vaddr = pkt->req->getVaddr();
+    gpuDynInst->memStatusVector[line].push_back(mp_index);
+    gpuDynInst->tlbHitLevel[mp_index] = hit_level;
+
+    // map the response command back to the request command that will be
+    // sent to memory
+    MemCmd requestCmd;
+
+    if (pkt->cmd == MemCmd::ReadResp) {
+        requestCmd = MemCmd::ReadReq;
+    } else if (pkt->cmd == MemCmd::WriteResp) {
+        requestCmd = MemCmd::WriteReq;
+    } else if (pkt->cmd == MemCmd::SwapResp) {
+        requestCmd = MemCmd::SwapReq;
+    } else {
+        panic("unsupported response to request conversion %s\n",
+              pkt->cmd.toString());
+    }
+
+    if (computeUnit->prefetchDepth) {
+        int simdId = gpuDynInst->simdId;
+        int wfSlotId = gpuDynInst->wfSlotId;
+        Addr last = 0;
+
+        // pick the previous vaddr to compute the stride from, based on
+        // the configured prefetch policy
+        switch(computeUnit->prefetchType) {
+          case Enums::PF_CU:
+            last = computeUnit->lastVaddrCU[mp_index];
+            break;
+          case Enums::PF_PHASE:
+            last = computeUnit->lastVaddrPhase[simdId][mp_index];
+            break;
+          case Enums::PF_WF:
+            last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];
+            // NOTE(review): no break here, so PF_WF falls through into
+            // default's break -- harmless, but confirm it is intentional
+          default:
+            break;
+        }
+
+        DPRINTF(GPUPrefetch, "CU[%d][%d][%d][%d]: %#x was last\n",
+                computeUnit->cu_id, simdId, wfSlotId, mp_index, last);
+
+        // stride in pages between the previous and current access
+        int stride = last ? (roundDown(vaddr, TheISA::PageBytes) -
+                             roundDown(last, TheISA::PageBytes)) >> TheISA::PageShift
+            : 0;
+
+        DPRINTF(GPUPrefetch, "Stride is %d\n", stride);
+
+        computeUnit->lastVaddrCU[mp_index] = vaddr;
+        computeUnit->lastVaddrPhase[simdId][mp_index] = vaddr;
+        computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr;
+
+        // PF_STRIDE uses the configured fixed stride instead
+        stride = (computeUnit->prefetchType == Enums::PF_STRIDE) ?
+            computeUnit->prefetchStride: stride;
+
+        DPRINTF(GPUPrefetch, "%#x to: CU[%d][%d][%d][%d]\n", vaddr,
+                computeUnit->cu_id, simdId, wfSlotId, mp_index);
+
+        DPRINTF(GPUPrefetch, "Prefetching from %#x:", vaddr);
+
+        // Prefetch Next few pages atomically
+        for (int pf = 1; pf <= computeUnit->prefetchDepth; ++pf) {
+            DPRINTF(GPUPrefetch, "%d * %d: %#x\n", pf, stride,
+                    vaddr+stride*pf*TheISA::PageBytes);
+
+            if (!stride)
+                break;
+
+            Request *prefetch_req = new Request(0, vaddr + stride * pf *
+                                                TheISA::PageBytes,
+                                                sizeof(uint8_t), 0,
+                                                computeUnit->masterId(),
+                                                0, 0, 0);
+
+            PacketPtr prefetch_pkt = new Packet(prefetch_req, requestCmd);
+            uint8_t foo = 0;    // dummy data target; only the TLB walk matters
+            prefetch_pkt->dataStatic(&foo);
+
+            // Because it's atomic operation, only need TLB translation state
+            prefetch_pkt->senderState =
+                new TheISA::GpuTLB::TranslationState(TLB_mode,
+                                                     computeUnit->shader->gpuTc,
+                                                     true);
+
+            // Currently prefetches are zero-latency, hence the sendFunctional
+            sendFunctional(prefetch_pkt);
+
+            /* safe_cast the senderState */
+            TheISA::GpuTLB::TranslationState *tlb_state =
+                safe_cast<TheISA::GpuTLB::TranslationState*>(
+                    prefetch_pkt->senderState);
+
+
+            delete tlb_state->tlbEntry;
+            delete tlb_state;
+            delete prefetch_pkt->req;
+            delete prefetch_pkt;
+        }
+    }
+
+    // First we must convert the response cmd back to a request cmd so that
+    // the request can be sent through the cu's master port
+    PacketPtr new_pkt = new Packet(pkt->req, requestCmd);
+    new_pkt->dataStatic(pkt->getPtr<uint8_t>());
+    delete pkt->senderState;
+    delete pkt;
+
+    // New SenderState for the memory access
+    new_pkt->senderState =
+        new ComputeUnit::DataPort::SenderState(gpuDynInst, mp_index,
+                                               nullptr);
+
+    // translation is done. Schedule the mem_req_event at the appropriate
+    // cycle to send the timing memory request to ruby
+    ComputeUnit::DataPort::MemReqEvent *mem_req_event =
+        new ComputeUnit::DataPort::MemReqEvent(computeUnit->memPort[mp_index],
+                                               new_pkt);
+
+    DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data scheduled\n",
+            computeUnit->cu_id, gpuDynInst->simdId,
+            gpuDynInst->wfSlotId, mp_index, new_pkt->req->getPaddr());
+
+    computeUnit->schedule(mem_req_event, curTick() +
+                          computeUnit->req_tick_latency);
+
+    return true;
+}
+
+// Human-readable event name used by event-queue debug output.
+const char*
+ComputeUnit::DataPort::MemReqEvent::description() const
+{
+    return "ComputeUnit memory request event";
+}
+
+// Issue the deferred memory request to Ruby; if the port rejects it,
+// queue the packet (with its instruction) for replay on recvReqRetry().
+void
+ComputeUnit::DataPort::MemReqEvent::process()
+{
+    SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
+    GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
+    ComputeUnit *compute_unit M5_VAR_USED = dataPort->computeUnit;
+
+    bool sent = dataPort->sendTimingReq(pkt);
+
+    if (sent) {
+        DPRINTF(GPUPort,
+                "CU%d: WF[%d][%d]: index %d, addr %#x data req sent!\n",
+                compute_unit->cu_id, gpuDynInst->simdId,
+                gpuDynInst->wfSlotId, dataPort->index,
+                pkt->req->getPaddr());
+    } else {
+        dataPort->retries.push_back(std::make_pair(pkt, gpuDynInst));
+
+        DPRINTF(GPUPort,
+                "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n",
+                compute_unit->cu_id, gpuDynInst->simdId,
+                gpuDynInst->wfSlotId, dataPort->index,
+                pkt->req->getPaddr());
+    }
+}
+
+/*
+ * The initial translation request could have been rejected,
+ * if <retries> queue is not Retry sending the translation
+ * request. sendRetry() is called from the peer port whenever
+ * a translation completes.
+ */
+void
+ComputeUnit::DTLBPort::recvReqRetry()
+{
+ int len = retries.size();
+
+ DPRINTF(GPUTLB, "CU%d: DTLB recvReqRetry - %d pending requests\n",
+ computeUnit->cu_id, len);
+
+ assert(len > 0);
+ assert(isStalled());
+ // recvReqRetry is an indication that the resource on which this
+ // port was stalling on is freed. So, remove the stall first
+ unstallPort();
+
+ for (int i = 0; i < len; ++i) {
+ PacketPtr pkt = retries.front();
+ Addr vaddr M5_VAR_USED = pkt->req->getVaddr();
+ DPRINTF(GPUTLB, "CU%d: retrying D-translaton for address%#x", vaddr);
+
+ if (!sendTimingReq(pkt)) {
+ // Stall port
+ stallPort();
+ DPRINTF(GPUTLB, ": failed again\n");
+ break;
+ } else {
+ DPRINTF(GPUTLB, ": successful\n");
+ retries.pop_front();
+ }
+ }
+}
+
+// Handle a completed instruction-TLB translation.  On success the same
+// packet (converted back to a ReadReq) is forwarded to the fetch stage;
+// on a failed translation the pending fetch is simply dropped.  Always
+// returns true.
+bool
+ComputeUnit::ITLBPort::recvTimingResp(PacketPtr pkt)
+{
+    Addr line M5_VAR_USED = pkt->req->getPaddr();
+    DPRINTF(GPUTLB, "CU%d: ITLBPort received %#x->%#x\n",
+            computeUnit->cu_id, pkt->req->getVaddr(), line);
+
+    assert(pkt->senderState);
+
+    // pop off the TLB translation state
+    TheISA::GpuTLB::TranslationState *translation_state =
+        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
+
+    bool success = translation_state->tlbEntry->valid;
+    delete translation_state->tlbEntry;
+    assert(!translation_state->ports.size());
+    pkt->senderState = translation_state->saved;
+    delete translation_state;
+
+    // use the original sender state to know how to close this transaction
+    ITLBPort::SenderState *sender_state =
+        safe_cast<ITLBPort::SenderState*>(pkt->senderState);
+
+    // get the wavefront associated with this translation request
+    Wavefront *wavefront = sender_state->wavefront;
+    delete pkt->senderState;
+
+    if (success) {
+        // pkt is reused in fetch(), don't delete it here. However, we must
+        // reset the command to be a request so that it can be sent through
+        // the cu's master port
+        assert(pkt->cmd == MemCmd::ReadResp);
+        pkt->cmd = MemCmd::ReadReq;
+
+        computeUnit->fetchStage.fetch(pkt, wavefront);
+    } else {
+        // translation failed: abandon this fetch attempt
+        if (wavefront->dropFetch) {
+            assert(wavefront->instructionBuffer.empty());
+            wavefront->dropFetch = false;
+        }
+
+        wavefront->pendingFetch = 0;
+    }
+
+    return true;
+}
+
+/*
+ * The initial translation request could have been rejected, if
+ * <retries> queue is not empty. Retry sending the translation
+ * request. sendRetry() is called from the peer port whenever
+ * a translation completes.
+ */
+void
+ComputeUnit::ITLBPort::recvReqRetry()
+{
+    int len = retries.size();
+
+    // bug fix: the format string has two specifiers (CU%d and %d) but only
+    // len was passed, so the CU id printed garbage and len printed cu_id
+    DPRINTF(GPUTLB, "CU%d: ITLB recvReqRetry - %d pending requests\n",
+            computeUnit->cu_id, len);
+
+    assert(len > 0);
+    assert(isStalled());
+
+    // recvReqRetry is an indication that the resource on which this
+    // port was stalling on is freed. So, remove the stall first
+    unstallPort();
+
+    for (int i = 0; i < len; ++i) {
+        PacketPtr pkt = retries.front();
+        Addr vaddr M5_VAR_USED = pkt->req->getVaddr();
+        // bug fix: missing cu_id argument, "translaton" typo, and missing
+        // space before the address
+        DPRINTF(GPUTLB, "CU%d: retrying I-translation for address %#x",
+                computeUnit->cu_id, vaddr);
+
+        if (!sendTimingReq(pkt)) {
+            stallPort(); // Stall port
+            DPRINTF(GPUTLB, ": failed again\n");
+            break;
+        } else {
+            DPRINTF(GPUTLB, ": successful\n");
+            retries.pop_front();
+        }
+    }
+}
+
+void
+ComputeUnit::regStats()
+{
+    // Register all per-CU statistics with the stats framework and define
+    // the derived formulas (latency, IPC, VPC, ALU inst count).
+    tlbCycles
+        .name(name() + ".tlb_cycles")
+        .desc("total number of cycles for all uncoalesced requests")
+        ;
+
+    tlbRequests
+        .name(name() + ".tlb_requests")
+        .desc("number of uncoalesced requests")
+        ;
+
+    tlbLatency
+        .name(name() + ".avg_translation_latency")
+        .desc("Avg. translation latency for data translations")
+        ;
+
+    tlbLatency = tlbCycles / tlbRequests;
+
+    hitsPerTLBLevel
+        .init(4)
+        .name(name() + ".TLB_hits_distribution")
+        // fix: unbalanced parenthesis in the description string
+        .desc("TLB hits distribution (0 for page table, x for Lx-TLB)")
+        ;
+
+    // fixed number of TLB levels
+    for (int i = 0; i < 4; ++i) {
+        if (!i)
+            hitsPerTLBLevel.subname(i,"page_table");
+        else
+            hitsPerTLBLevel.subname(i, csprintf("L%d_TLB",i));
+    }
+
+    execRateDist
+        .init(0, 10, 2)
+        .name(name() + ".inst_exec_rate")
+        .desc("Instruction Execution Rate: Number of executed vector "
+              "instructions per cycle")
+        ;
+
+    ldsBankConflictDist
+        .init(0, VSZ, 2)
+        .name(name() + ".lds_bank_conflicts")
+        .desc("Number of bank conflicts per LDS memory packet")
+        ;
+
+    ldsBankAccesses
+        .name(name() + ".lds_bank_access_cnt")
+        .desc("Total number of LDS bank accesses")
+        ;
+
+    pageDivergenceDist
+        // A wavefront can touch 1 to VSZ pages per memory instruction.
+        // The number of pages per bin can be configured (here it's 4).
+        .init(1, VSZ, 4)
+        .name(name() + ".page_divergence_dist")
+        .desc("pages touched per wf (over all mem. instr.)")
+        ;
+
+    controlFlowDivergenceDist
+        .init(1, VSZ, 4)
+        .name(name() + ".warp_execution_dist")
+        // fix: "oval all" typo in the description string
+        .desc("number of lanes active per instruction (over all instructions)")
+        ;
+
+    activeLanesPerGMemInstrDist
+        .init(1, VSZ, 4)
+        .name(name() + ".gmem_lanes_execution_dist")
+        .desc("number of active lanes per global memory instruction")
+        ;
+
+    activeLanesPerLMemInstrDist
+        .init(1, VSZ, 4)
+        .name(name() + ".lmem_lanes_execution_dist")
+        .desc("number of active lanes per local memory instruction")
+        ;
+
+    numInstrExecuted
+        .name(name() + ".num_instr_executed")
+        .desc("number of instructions executed")
+        ;
+
+    numVecOpsExecuted
+        .name(name() + ".num_vec_ops_executed")
+        .desc("number of vec ops executed (e.g. VSZ/inst)")
+        ;
+
+    totalCycles
+        .name(name() + ".num_total_cycles")
+        .desc("number of cycles the CU ran for")
+        ;
+
+    ipc
+        .name(name() + ".ipc")
+        .desc("Instructions per cycle (this CU only)")
+        ;
+
+    vpc
+        .name(name() + ".vpc")
+        .desc("Vector Operations per cycle (this CU only)")
+        ;
+
+    numALUInstsExecuted
+        .name(name() + ".num_alu_insts_executed")
+        .desc("Number of dynamic non-GM memory insts executed")
+        ;
+
+    wgBlockedDueLdsAllocation
+        .name(name() + ".wg_blocked_due_lds_alloc")
+        .desc("Workgroup blocked due to LDS capacity")
+        ;
+
+    ipc = numInstrExecuted / totalCycles;
+    vpc = numVecOpsExecuted / totalCycles;
+
+    numTimesWgBlockedDueVgprAlloc
+        .name(name() + ".times_wg_blocked_due_vgpr_alloc")
+        .desc("Number of times WGs are blocked due to VGPR allocation per SIMD")
+        ;
+
+    dynamicGMemInstrCnt
+        .name(name() + ".global_mem_instr_cnt")
+        .desc("dynamic global memory instructions count")
+        ;
+
+    dynamicLMemInstrCnt
+        .name(name() + ".local_mem_instr_cnt")
+        // fix: "intruction" typo in the description string
+        .desc("dynamic local memory instruction count")
+        ;
+
+    numALUInstsExecuted = numInstrExecuted - dynamicGMemInstrCnt -
+        dynamicLMemInstrCnt;
+
+    completedWfs
+        .name(name() + ".num_completed_wfs")
+        .desc("number of completed wavefronts")
+        ;
+
+    numCASOps
+        .name(name() + ".num_CAS_ops")
+        .desc("number of compare and swap operations")
+        ;
+
+    numFailedCASOps
+        .name(name() + ".num_failed_CAS_ops")
+        .desc("number of compare and swap operations that failed")
+        ;
+
+    // register stats of pipeline stages
+    fetchStage.regStats();
+    scoreboardCheckStage.regStats();
+    scheduleStage.regStats();
+    execStage.regStats();
+
+    // register stats of memory pipeline
+    globalMemoryPipe.regStats();
+    localMemoryPipe.regStats();
+}
+
+void
+ComputeUnit::updatePageDivergenceDist(Addr addr)
+{
+    // Bump the access count for the page this address falls on.
+    // std::map::operator[] value-initializes the counter to zero on the
+    // first touch, so a plain increment covers both the new-page and
+    // already-seen-page cases.
+    Addr page_addr = roundDown(addr, TheISA::PageBytes);
+    ++pagesTouched[page_addr];
+}
+
+void
+ComputeUnit::CUExitCallback::process()
+{
+    // On simulation exit, dump per-page access counts for this CU,
+    // but only when page counting was enabled.
+    if (!computeUnit->countPages) {
+        return;
+    }
+
+    std::ostream *out = simout.create(computeUnit->name().c_str());
+
+    // CSV header followed by one row per touched page
+    *out << "page, wavefront accesses, workitem accesses" << std::endl;
+
+    for (const auto &entry : computeUnit->pageAccesses) {
+        *out << std::hex << entry.first << ",";
+        *out << std::dec << entry.second.first << ",";
+        *out << std::dec << entry.second.second << std::endl;
+    }
+}
+
+bool
+ComputeUnit::isDone() const
+{
+    // The CU is done only when every SIMD has fully drained...
+    for (int simd = 0; simd < numSIMDs; ++simd) {
+        if (!isSimdDone(simd)) {
+            return false;
+        }
+    }
+
+    // ...and every VRF<->memory pipe bus is ready...
+    bool glb_buses_rdy = true;
+    for (int unit = 0; unit < numGlbMemUnits; ++unit) {
+        glb_buses_rdy &= vrfToGlobalMemPipeBus[unit].rdy();
+    }
+
+    bool loc_buses_rdy = true;
+    for (int unit = 0; unit < numLocMemUnits; ++unit) {
+        loc_buses_rdy &= vrfToLocalMemPipeBus[unit].rdy();
+    }
+
+    // ...and both memory pipelines have writable FIFOs and ready
+    // memory-to-VRF buses.
+    return globalMemoryPipe.isGMLdRespFIFOWrRdy() &&
+           globalMemoryPipe.isGMStRespFIFOWrRdy() &&
+           globalMemoryPipe.isGMReqFIFOWrRdy() &&
+           localMemoryPipe.isLMReqFIFOWrRdy() &&
+           localMemoryPipe.isLMRespFIFOWrRdy() &&
+           locMemToVrfBus.rdy() && glbMemToVrfBus.rdy() &&
+           loc_buses_rdy && glb_buses_rdy;
+}
+
+// Return the reference count for the given (dispatch, workgroup) pair;
+// simply delegates to the LDS, which owns the per-workgroup counters.
+int32_t
+ComputeUnit::getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const
+{
+    return lds.getRefCounter(dispatchId, wgId);
+}
+
+bool
+ComputeUnit::isSimdDone(uint32_t simdId) const
+{
+    assert(simdId < numSIMDs);
+
+    // any busy VRF->memory pipe bus means outstanding work
+    for (int unit = 0; unit < numGlbMemUnits; ++unit) {
+        if (!vrfToGlobalMemPipeBus[unit].rdy()) {
+            return false;
+        }
+    }
+
+    for (int unit = 0; unit < numLocMemUnits; ++unit) {
+        if (!vrfToLocalMemPipeBus[unit].rdy()) {
+            return false;
+        }
+    }
+
+    // this SIMD's vector ALU pipeline must be ready
+    if (!aluPipe[simdId].rdy()) {
+        return false;
+    }
+
+    // every wavefront slot on this SIMD must have stopped
+    for (int slot = 0; slot < shader->n_wf; ++slot) {
+        if (wfList[simdId][slot]->status != Wavefront::S_STOPPED) {
+            return false;
+        }
+    }
+
+    return true;
+}
+
+/**
+ * send a general request to the LDS
+ * make sure to look at the return value here as your request might be
+ * NACK'd and returning false means that you have to have some backup plan
+ */
+bool
+ComputeUnit::sendToLds(GPUDynInstPtr gpuDynInst)
+{
+    // this is just a request to carry the GPUDynInstPtr
+    // back and forth
+    Request *newRequest = new Request();
+    newRequest->setPaddr(0x0);
+
+    // ReadReq is not evaluated by the LDS but the Packet ctor requires this
+    PacketPtr newPacket = new Packet(newRequest, MemCmd::ReadReq);
+
+    // This is the SenderState needed upon return
+    newPacket->senderState = new LDSPort::SenderState(gpuDynInst);
+
+    // On success, LDSPort::recvTimingResp() deletes the packet, request,
+    // and sender state.  A false return means the send was NACK'd; the
+    // packet is queued on the LDS port's retry list in that case.
+    return ldsPort->sendTimingReq(newPacket);
+}
+
+/**
+ * get the result of packets sent to the LDS when they return
+ */
+bool
+ComputeUnit::LDSPort::recvTimingResp(PacketPtr packet)
+{
+    // recover the dynamic instruction this LDS response belongs to
+    const ComputeUnit::LDSPort::SenderState *senderState =
+        dynamic_cast<ComputeUnit::LDSPort::SenderState *>(packet->senderState);
+
+    fatal_if(!senderState, "did not get the right sort of sender state");
+
+    GPUDynInstPtr gpuDynInst = senderState->getMemInst();
+
+    // the packet and request were created in sendToLds(); the transaction
+    // is complete, so release them here
+    delete packet->senderState;
+    delete packet->req;
+    delete packet;
+
+    // hand the completed instruction to the local memory pipeline
+    computeUnit->localMemoryPipe.getLMRespFIFO().push(gpuDynInst);
+    return true;
+}
+
+/**
+ * attempt to send this packet, either the port is already stalled, the request
+ * is nack'd and must stall or the request goes through
+ * when a request cannot be sent, add it to the retries queue
+ */
+bool
+ComputeUnit::LDSPort::sendTimingReq(PacketPtr pkt)
+{
+    // Three outcomes: (1) the port is already stalled, so queue the packet
+    // behind the earlier retries; (2) the peer NACKs the send, so stall the
+    // port and queue the packet; (3) the send succeeds.  A false return
+    // tells the caller the packet will be re-sent from recvReqRetry().
+    ComputeUnit::LDSPort::SenderState *sender_state =
+        dynamic_cast<ComputeUnit::LDSPort::SenderState*>(pkt->senderState);
+    fatal_if(!sender_state, "packet without a valid sender state");
+
+    GPUDynInstPtr gpuDynInst M5_VAR_USED = sender_state->getMemInst();
+
+    if (isStalled()) {
+        // a stalled port must already have at least one queued retry;
+        // preserve ordering by queueing behind it
+        fatal_if(retries.empty(), "must have retries waiting to be stalled");
+
+        retries.push(pkt);
+
+        DPRINTF(GPUPort, "CU%d: WF[%d][%d]: LDS send failed!\n",
+                computeUnit->cu_id, gpuDynInst->simdId,
+                gpuDynInst->wfSlotId);
+        return false;
+    } else if (!MasterPort::sendTimingReq(pkt)) {
+        // need to stall the LDS port until a recvReqRetry() is received
+        // this indicates that there is more space
+        stallPort();
+        retries.push(pkt);
+
+        DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req failed!\n",
+                computeUnit->cu_id, gpuDynInst->simdId,
+                gpuDynInst->wfSlotId, pkt->req->getPaddr());
+        return false;
+    } else {
+        DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req sent!\n",
+                computeUnit->cu_id, gpuDynInst->simdId,
+                gpuDynInst->wfSlotId, pkt->req->getPaddr());
+        return true;
+    }
+}
+
+/**
+ * the bus is telling the port that there is now space so retrying stalled
+ * requests should work now
+ * this allows the port to have a request be nack'd and then have the receiver
+ * say when there is space, rather than simply retrying the send every cycle
+ */
+void
+ComputeUnit::LDSPort::recvReqRetry()
+{
+    auto queueSize = retries.size();
+
+    DPRINTF(GPUPort, "CU%d: LDSPort recvReqRetry - %d pending requests\n",
+            computeUnit->cu_id, queueSize);
+
+    fatal_if(queueSize < 1,
+             "why was there a recvReqRetry() with no pending reqs?");
+    fatal_if(!isStalled(),
+             "recvReqRetry() happened when the port was not stalled");
+
+    unstallPort();
+
+    // drain the retry queue in order until a send fails again
+    while (!retries.empty()) {
+        PacketPtr packet = retries.front();
+
+        DPRINTF(GPUPort, "CU%d: retrying LDS send\n", computeUnit->cu_id);
+
+        if (!MasterPort::sendTimingReq(packet)) {
+            // Stall port
+            stallPort();
+            DPRINTF(GPUPort, ": LDS send failed again\n");
+            break;
+        } else {
+            // bug fix: this trace used the GPUTLB debug flag; every other
+            // LDS port trace in this function uses GPUPort
+            DPRINTF(GPUPort, ": LDS send successful\n");
+            retries.pop();
+        }
+    }
+}
diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh
new file mode 100644
index 000000000..f47c27a0a
--- /dev/null
+++ b/src/gpu-compute/compute_unit.hh
@@ -0,0 +1,767 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos, Anthony Gutierrez
+ */
+
+#ifndef __COMPUTE_UNIT_HH__
+#define __COMPUTE_UNIT_HH__
+
+#include <deque>
+#include <list>
+#include <map>
+#include <queue>
+#include <unordered_map>
+#include <vector>
+
+#include "base/callback.hh"
+#include "base/statistics.hh"
+#include "base/types.hh"
+#include "enums/PrefetchType.hh"
+#include "gpu-compute/exec_stage.hh"
+#include "gpu-compute/fetch_stage.hh"
+#include "gpu-compute/global_memory_pipeline.hh"
+#include "gpu-compute/local_memory_pipeline.hh"
+#include "gpu-compute/qstruct.hh"
+#include "gpu-compute/schedule_stage.hh"
+#include "gpu-compute/scoreboard_check_stage.hh"
+#include "mem/mem_object.hh"
+#include "mem/port.hh"
+
+static const int MAX_REGS_FOR_NON_VEC_MEM_INST = 1;
+static const int MAX_WIDTH_FOR_MEM_INST = 32;
+
+class NDRange;
+class Shader;
+class VectorRegisterFile;
+
+struct ComputeUnitParams;
+
+// Wavefront execution scheduling policy: pick the oldest ready wave, or
+// cycle through waves round-robin (see rrNextMemID/rrNextALUWp below).
+enum EXEC_POLICY
+{
+    OLDEST = 0,
+    RR
+};
+
+// List of execution units
+enum EXEC_UNIT
+{
+    SIMD0 = 0,
+    SIMD1,
+    SIMD2,
+    SIMD3,
+    GLBMEM_PIPE,
+    LDSMEM_PIPE,
+    NUM_UNITS
+};
+
+// Combinations of TLB and cache hit/miss outcomes for a request
+// (presumably used to classify accesses for statistics -- confirm)
+enum TLB_CACHE
+{
+    TLB_MISS_CACHE_MISS = 0,
+    TLB_MISS_CACHE_HIT,
+    TLB_HIT_CACHE_MISS,
+    TLB_HIT_CACHE_HIT
+};
+
+class ComputeUnit : public MemObject
+{
+  public:
+    // the CU's execution pipeline stages, in program order
+    FetchStage fetchStage;
+    ScoreboardCheckStage scoreboardCheckStage;
+    ScheduleStage scheduleStage;
+    ExecStage execStage;
+    GlobalMemPipeline globalMemoryPipe;
+    LocalMemPipeline localMemoryPipe;
+
+    // Buffers used to communicate between various pipeline stages
+
+    // List of waves which are ready to be scheduled.
+    // Each execution resource has a ready list. readyList is
+    // used to communicate between scoreboardCheck stage and
+    // schedule stage
+    // TODO: make enum to index readyList
+    std::vector<std::vector<Wavefront*>> readyList;
+
+    // Stores the status of waves. A READY implies the
+    // wave is ready to be scheduled this cycle and
+    // is already present in the readyList. waveStatusList is
+    // used to communicate between scoreboardCheck stage and
+    // schedule stage
+    // TODO: convert std::pair to a class to increase readability
+    std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>> waveStatusList;
+
+    // List of waves which will be dispatched to
+    // each execution resource. A FILLED implies
+    // dispatch list is non-empty and
+    // execution unit has something to execute
+    // this cycle. Currently, the dispatch list of
+    // an execution resource can hold only one wave because
+    // an execution resource can execute only one wave in a cycle.
+    // dispatchList is used to communicate between schedule
+    // and exec stage
+    // TODO: convert std::pair to a class to increase readability
+    std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> dispatchList;
+
+    int rrNextMemID; // used by RR WF exec policy to cycle through WF's
+    int rrNextALUWp;
+    typedef ComputeUnitParams Params;
+    std::vector<std::vector<Wavefront*>> wfList;
+    int cu_id;
+
+    // array of vector register files, one per SIMD
+    std::vector<VectorRegisterFile*> vrf;
+    // Number of vector ALU units (SIMDs) in CU
+    int numSIMDs;
+    // number of pipe stages for bypassing data to next dependent single
+    // precision vector instruction inside the vector ALU pipeline
+    int spBypassPipeLength;
+    // number of pipe stages for bypassing data to next dependent double
+    // precision vector instruction inside the vector ALU pipeline
+    int dpBypassPipeLength;
+    // number of cycles per issue period
+    int issuePeriod;
+
+    // Number of global and local memory execution resources in CU
+    int numGlbMemUnits;
+    int numLocMemUnits;
+    // tracks the last cycle a vector instruction was executed on a SIMD
+    std::vector<uint64_t> lastExecCycle;
+
+    // true if we allow a separate TLB per lane
+    bool perLaneTLB;
+    // if 0, TLB prefetching is off.
+    int prefetchDepth;
+    // if fixed-stride prefetching, this is the stride.
+    int prefetchStride;
+
+    // per-lane record of the last virtual address each lane translated;
+    // zero-initialized on construction
+    class LastVaddrWave
+    {
+      public:
+        Addr vaddrs[VSZ];
+        Addr& operator[](int idx) {
+            return vaddrs[idx];
+        }
+
+        LastVaddrWave() {
+            for (int i = 0; i < VSZ; ++i)
+                vaddrs[i] = 0;
+        }
+    };
+
+    LastVaddrWave lastVaddrCU;
+    std::vector<LastVaddrWave> lastVaddrPhase;
+    std::vector<std::vector<std::vector<Addr>>> lastVaddrWF;
+    Enums::PrefetchType prefetchType;
+    EXEC_POLICY exec_policy;
+
+    bool xact_cas_mode;
+    bool debugSegFault;
+    // if true, translations are performed functionally rather than in timing
+    // -- presumably for debugging; confirm against the .cc
+    bool functionalTLB;
+    bool localMemBarrier;
+
+    /*
+     * for Counting page accesses
+     *
+     * cuExitCallback inherits from Callback. When you register a callback
+     * function as an exit callback, it will get added to an exit callback
+     * queue, such that on simulation exit, all callbacks in the callback
+     * queue will have their process() function called.
+     */
+    bool countPages;
+
+    Shader *shader;
+    uint32_t barrier_id;
+    // vector of Vector ALU (MACC) pipelines
+    std::vector<WaitClass> aluPipe;
+    // minimum issue period per SIMD unit (in cycles)
+    std::vector<WaitClass> wfWait;
+
+    // Resource control for Vector Register File->Global Memory pipe buses
+    std::vector<WaitClass> vrfToGlobalMemPipeBus;
+    // Resource control for Vector Register File->Local Memory pipe buses
+    std::vector<WaitClass> vrfToLocalMemPipeBus;
+    int nextGlbMemBus;
+    int nextLocMemBus;
+    // Resource control for global memory to VRF data/address bus
+    WaitClass glbMemToVrfBus;
+    // Resource control for local memory to VRF data/address bus
+    WaitClass locMemToVrfBus;
+
+    uint32_t vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes
+    uint32_t coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes
+    uint32_t numCyclesPerStoreTransfer; // number of cycles per vector store
+    uint32_t numCyclesPerLoadTransfer; // number of cycles per vector load
+
+    Tick req_tick_latency;
+    Tick resp_tick_latency;
+
+    // number of vector registers being reserved for each SIMD unit
+    std::vector<int> vectorRegsReserved;
+    // number of vector registers per SIMD unit
+    uint32_t numVecRegsPerSimd;
+    // Support for scheduling VGPR status update events
+    std::vector<std::pair<uint32_t, uint32_t> > regIdxVec;
+    std::vector<uint64_t> timestampVec;
+    std::vector<uint8_t> statusVec;
+
+    // Record a pending VGPR status change at time `when`.  Operands wider
+    // than 4 bytes occupy two consecutive registers (wrapping at
+    // numVecRegsPerSimd), so both registers are recorded.
+    void
+    registerEvent(uint32_t simdId,
+                  uint32_t regIdx,
+                  uint32_t operandSize,
+                  uint64_t when,
+                  uint8_t newStatus) {
+        regIdxVec.push_back(std::make_pair(simdId, regIdx));
+        timestampVec.push_back(when);
+        statusVec.push_back(newStatus);
+        if (operandSize > 4) {
+            regIdxVec.push_back(std::make_pair(simdId,
+                                               ((regIdx + 1) %
+                                                numVecRegsPerSimd)));
+            timestampVec.push_back(when);
+            statusVec.push_back(newStatus);
+        }
+    }
+
+    void updateEvents();
+
+    // this hash map will keep track of page divergence
+    // per memory instruction per wavefront. The hash map
+    // is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc.
+    std::map<Addr, int> pagesTouched;
+
+    ComputeUnit(const Params *p);
+    ~ComputeUnit();
+    int spBypassLength() { return spBypassPipeLength; };
+    int dpBypassLength() { return dpBypassPipeLength; };
+    int storeBusLength() { return numCyclesPerStoreTransfer; };
+    int loadBusLength() { return numCyclesPerLoadTransfer; };
+    int wfSize() const { return wavefrontSize; };
+
+    void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
+    void exec();
+    void initiateFetch(Wavefront *wavefront);
+    void fetch(PacketPtr pkt, Wavefront *wavefront);
+    void FillKernelState(Wavefront *w, NDRange *ndr);
+
+    void StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
+                 int trueWgSizeTotal);
+
+    void InitializeWFContext(WFContext *wfCtx, NDRange *ndr, int cnt,
+                             int trueWgSize[], int trueWgSizeTotal,
+                             LdsChunk *ldsChunk, uint64_t origSpillMemStart);
+
+    void StartWorkgroup(NDRange *ndr);
+    int ReadyWorkgroup(NDRange *ndr);
+
+    bool isVecAlu(int unitId) { return unitId >= SIMD0 && unitId <= SIMD3; }
+    bool isGlbMem(int unitId) { return unitId == GLBMEM_PIPE; }
+    bool isShrMem(int unitId) { return unitId == LDSMEM_PIPE; }
+    int GlbMemUnitId() { return GLBMEM_PIPE; }
+    int ShrMemUnitId() { return LDSMEM_PIPE; }
+    int nextGlbRdBus() { return (++nextGlbMemBus) % numGlbMemUnits; }
+    int nextLocRdBus() { return (++nextLocMemBus) % numLocMemUnits; }
+    /* This function cycles through all the wavefronts in all the phases to see
+     * if all of the wavefronts which should be associated with one barrier
+     * (denoted with _barrier_id), are all at the same barrier in the program
+     * (denoted by bcnt). When the number at the barrier matches bslots, then
+     * return true.
+     */
+    int AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots);
+    bool cedeSIMD(int simdId, int wfSlotId);
+
+    template<typename c0, typename c1> void doSmReturn(GPUDynInstPtr gpuDynInst);
+    virtual void init();
+    void sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
+    void sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
+    void injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
+                              bool kernelLaunch=true,
+                              RequestPtr req=nullptr);
+    void handleMemPacket(PacketPtr pkt, int memport_index);
+    bool processTimingPacket(PacketPtr pkt);
+    void processFetchReturn(PacketPtr pkt);
+    void updatePageDivergenceDist(Addr addr);
+
+    MasterID masterId() { return _masterId; }
+
+    bool isDone() const;
+    bool isSimdDone(uint32_t) const;
+
+  protected:
+    MasterID _masterId;
+
+    LdsState &lds;
+
+  public:
+    // the following stats compute the avg. TLB accesslatency per
+    // uncoalesced request (only for data)
+    Stats::Scalar tlbRequests;
+    Stats::Scalar tlbCycles;
+    Stats::Formula tlbLatency;
+    // hitsPerTLBLevel[x] are the hits in Level x TLB. x = 0 is the page table.
+    Stats::Vector hitsPerTLBLevel;
+
+    Stats::Scalar ldsBankAccesses;
+    Stats::Distribution ldsBankConflictDist;
+
+    // over all memory instructions executed over all wavefronts
+    // how many touched 0-4 pages, 4-8, ..., 60-64 pages
+    Stats::Distribution pageDivergenceDist;
+    Stats::Scalar dynamicGMemInstrCnt;
+    Stats::Scalar dynamicLMemInstrCnt;
+
+    Stats::Scalar wgBlockedDueLdsAllocation;
+    // Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are active
+    // when the instruction is committed, this number is still incremented by 1
+    Stats::Scalar numInstrExecuted;
+    // Number of cycles among successive instruction executions across all
+    // wavefronts of the same CU
+    Stats::Distribution execRateDist;
+    // number of individual vector operations executed
+    Stats::Scalar numVecOpsExecuted;
+    // Total cycles that something is running on the GPU
+    Stats::Scalar totalCycles;
+    Stats::Formula vpc; // vector ops per cycle
+    Stats::Formula ipc; // vector instructions per cycle
+    Stats::Distribution controlFlowDivergenceDist;
+    Stats::Distribution activeLanesPerGMemInstrDist;
+    Stats::Distribution activeLanesPerLMemInstrDist;
+    // number of vector ALU instructions received
+    Stats::Formula numALUInstsExecuted;
+    // number of times a WG can not start due to lack of free VGPRs in SIMDs
+    Stats::Scalar numTimesWgBlockedDueVgprAlloc;
+    Stats::Scalar numCASOps;
+    Stats::Scalar numFailedCASOps;
+    Stats::Scalar completedWfs;
+    // flag per vector SIMD unit that is set when there is at least one
+    // WV that has a vector ALU instruction as the oldest in its
+    // Instruction Buffer: Defined in the Scoreboard stage, consumed
+    // by the Execute stage.
+    std::vector<bool> vectorAluInstAvail;
+    // number of available (oldest) LDS instructions that could have
+    // been issued to the LDS at a specific issue slot
+    int shrMemInstAvail;
+    // number of available Global memory instructions that could have
+    // been issued to TCP at a specific issue slot
+    int glbMemInstAvail;
+
+    void
+    regStats();
+
+    LdsState &
+    getLds() const
+    {
+        return lds;
+    }
+
+    int32_t
+    getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const;
+
+    bool
+    sendToLds(GPUDynInstPtr gpuDynInst) __attribute__((warn_unused_result));
+
+    // per-page (wavefront accesses, workitem accesses) counters, dumped by
+    // CUExitCallback::process() at simulation exit
+    typedef std::unordered_map<Addr, std::pair<int, int>> pageDataStruct;
+    pageDataStruct pageAccesses;
+
+    // exit callback that dumps pageAccesses when countPages is set
+    class CUExitCallback : public Callback
+    {
+      private:
+        ComputeUnit *computeUnit;
+
+      public:
+        virtual ~CUExitCallback() { }
+
+        CUExitCallback(ComputeUnit *_cu)
+        {
+            computeUnit = _cu;
+        }
+
+        virtual void
+        process();
+    };
+
+    CUExitCallback *cuExitCallback;
+
+    /** Data access Port **/
+    class DataPort : public MasterPort
+    {
+      public:
+        DataPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
+            : MasterPort(_name, _cu), computeUnit(_cu),
+              index(_index) { }
+
+        bool snoopRangeSent;
+
+        struct SenderState : public Packet::SenderState
+        {
+            GPUDynInstPtr _gpuDynInst;
+            int port_index;
+            Packet::SenderState *saved;
+
+            SenderState(GPUDynInstPtr gpuDynInst, PortID _port_index,
+                        Packet::SenderState *sender_state=nullptr)
+                : _gpuDynInst(gpuDynInst),
+                  port_index(_port_index),
+                  saved(sender_state) { }
+        };
+
+        // self-deleting event used to schedule a memory request send
+        class MemReqEvent : public Event
+        {
+          private:
+            DataPort *dataPort;
+            PacketPtr pkt;
+
+          public:
+            MemReqEvent(DataPort *_data_port, PacketPtr _pkt)
+                : Event(), dataPort(_data_port), pkt(_pkt)
+            {
+                setFlags(Event::AutoDelete);
+            }
+
+            void process();
+            const char *description() const;
+        };
+
+        // self-deleting event used to schedule a memory response handling
+        class MemRespEvent : public Event
+        {
+          private:
+            DataPort *dataPort;
+            PacketPtr pkt;
+
+          public:
+            MemRespEvent(DataPort *_data_port, PacketPtr _pkt)
+                : Event(), dataPort(_data_port), pkt(_pkt)
+            {
+                setFlags(Event::AutoDelete);
+            }
+
+            void process();
+            const char *description() const;
+        };
+
+        std::deque<std::pair<PacketPtr, GPUDynInstPtr>> retries;
+
+      protected:
+        ComputeUnit *computeUnit;
+        int index;
+
+        virtual bool recvTimingResp(PacketPtr pkt);
+        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
+        virtual void recvFunctional(PacketPtr pkt) { }
+        virtual void recvRangeChange() { }
+        virtual void recvReqRetry();
+
+        virtual void
+        getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
+        {
+            resp.clear();
+            snoop = true;
+        }
+
+    };
+
+    // Instruction cache access port
+    class SQCPort : public MasterPort
+    {
+      public:
+        SQCPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
+            : MasterPort(_name, _cu), computeUnit(_cu),
+              index(_index) { }
+
+        bool snoopRangeSent;
+
+        struct SenderState : public Packet::SenderState
+        {
+            Wavefront *wavefront;
+            Packet::SenderState *saved;
+
+            SenderState(Wavefront *_wavefront, Packet::SenderState
+                        *sender_state=nullptr)
+                : wavefront(_wavefront), saved(sender_state) { }
+        };
+
+        std::deque<std::pair<PacketPtr, Wavefront*>> retries;
+
+      protected:
+        ComputeUnit *computeUnit;
+        int index;
+
+        virtual bool recvTimingResp(PacketPtr pkt);
+        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
+        virtual void recvFunctional(PacketPtr pkt) { }
+        virtual void recvRangeChange() { }
+        virtual void recvReqRetry();
+
+        virtual void
+        getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
+        {
+            resp.clear();
+            snoop = true;
+        }
+    };
+
+    /** Data TLB port **/
+    class DTLBPort : public MasterPort
+    {
+      public:
+        DTLBPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
+            : MasterPort(_name, _cu), computeUnit(_cu),
+              index(_index), stalled(false)
+        { }
+
+        bool isStalled() { return stalled; }
+        void stallPort() { stalled = true; }
+        void unstallPort() { stalled = false; }
+
+        /**
+         * here we queue all the translation requests that were
+         * not successfully sent.
+         */
+        std::deque<PacketPtr> retries;
+
+        /** SenderState is information carried along with the packet
+         * throughout the TLB hierarchy
+         */
+        struct SenderState: public Packet::SenderState
+        {
+            // the memInst that this is associated with
+            GPUDynInstPtr _gpuDynInst;
+
+            // the lane in the memInst this is associated with, so we send
+            // the memory request down the right port
+            int portIndex;
+
+            // constructor used for packets involved in timing accesses
+            SenderState(GPUDynInstPtr gpuDynInst, PortID port_index)
+                : _gpuDynInst(gpuDynInst), portIndex(port_index) { }
+
+        };
+
+      protected:
+        ComputeUnit *computeUnit;
+        int index;
+        bool stalled;
+
+        virtual bool recvTimingResp(PacketPtr pkt);
+        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
+        virtual void recvFunctional(PacketPtr pkt) { }
+        virtual void recvRangeChange() { }
+        virtual void recvReqRetry();
+    };
+
+    // Instruction TLB port (one per I-cache)
+    class ITLBPort : public MasterPort
+    {
+      public:
+        ITLBPort(const std::string &_name, ComputeUnit *_cu)
+            : MasterPort(_name, _cu), computeUnit(_cu), stalled(false) { }
+
+
+        bool isStalled() { return stalled; }
+        void stallPort() { stalled = true; }
+        void unstallPort() { stalled = false; }
+
+        /**
+         * here we queue all the translation requests that were
+         * not successfully sent.
+         */
+        std::deque<PacketPtr> retries;
+
+        /** SenderState is information carried along with the packet
+         * throughout the TLB hierarchy
+         */
+        struct SenderState: public Packet::SenderState
+        {
+            // The wavefront associated with this request
+            Wavefront *wavefront;
+
+            SenderState(Wavefront *_wavefront) : wavefront(_wavefront) { }
+        };
+
+      protected:
+        ComputeUnit *computeUnit;
+        bool stalled;
+
+        virtual bool recvTimingResp(PacketPtr pkt);
+        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
+        virtual void recvFunctional(PacketPtr pkt) { }
+        virtual void recvRangeChange() { }
+        virtual void recvReqRetry();
+    };
+
+    /**
+     * the port intended to communicate between the CU and its LDS
+     */
+    class LDSPort : public MasterPort
+    {
+      public:
+        LDSPort(const std::string &_name, ComputeUnit *_cu, PortID _id)
+            : MasterPort(_name, _cu, _id), computeUnit(_cu)
+        {
+        }
+
+        bool isStalled() const { return stalled; }
+        void stallPort() { stalled = true; }
+        void unstallPort() { stalled = false; }
+
+        /**
+         * here we queue all the requests that were
+         * not successfully sent.
+         */
+        std::queue<PacketPtr> retries;
+
+        /**
+         * SenderState is information carried along with the packet, esp. the
+         * GPUDynInstPtr
+         */
+        class SenderState: public Packet::SenderState
+        {
+          protected:
+            // The actual read/write/atomic request that goes with this command
+            GPUDynInstPtr _gpuDynInst = nullptr;
+
+          public:
+            SenderState(GPUDynInstPtr gpuDynInst):
+              _gpuDynInst(gpuDynInst)
+            {
+            }
+
+            GPUDynInstPtr
+            getMemInst() const
+            {
+                return _gpuDynInst;
+            }
+        };
+
+        virtual bool
+        sendTimingReq(PacketPtr pkt);
+
+      protected:
+
+        bool stalled = false; ///< whether or not it is stalled
+
+        ComputeUnit *computeUnit;
+
+        virtual bool
+        recvTimingResp(PacketPtr pkt);
+
+        virtual Tick
+        recvAtomic(PacketPtr pkt) { return 0; }
+
+        virtual void
+        recvFunctional(PacketPtr pkt)
+        {
+        }
+
+        virtual void
+        recvRangeChange()
+        {
+        }
+
+        virtual void
+        recvReqRetry();
+    };
+
+    /** The port to access the Local Data Store
+     *  Can be connected to a LDS object
+     */
+    LDSPort *ldsPort = nullptr;
+
+    LDSPort *
+    getLdsPort() const
+    {
+        return ldsPort;
+    }
+
+    /** The memory port for SIMD data accesses.
+     *  Can be connected to PhysMem for Ruby for timing simulations
+     */
+    std::vector<DataPort*> memPort;
+    // port to the TLB hierarchy (i.e., the L1 TLB)
+    std::vector<DTLBPort*> tlbPort;
+    // port to the SQC (i.e. the I-cache)
+    SQCPort *sqcPort;
+    // port to the SQC TLB (there's a separate TLB for each I-cache)
+    ITLBPort *sqcTLBPort;
+
+    // Port factory: ports are created lazily the first time the named
+    // port is requested during elaboration.
+    virtual BaseMasterPort&
+    getMasterPort(const std::string &if_name, PortID idx)
+    {
+        if (if_name == "memory_port") {
+            memPort[idx] = new DataPort(csprintf("%s-port%d", name(), idx),
+                                        this, idx);
+            return *memPort[idx];
+        } else if (if_name == "translation_port") {
+            tlbPort[idx] = new DTLBPort(csprintf("%s-port%d", name(), idx),
+                                        this, idx);
+            return *tlbPort[idx];
+        } else if (if_name == "sqc_port") {
+            sqcPort = new SQCPort(csprintf("%s-port%d", name(), idx),
+                                  this, idx);
+            return *sqcPort;
+        } else if (if_name == "sqc_tlb_port") {
+            sqcTLBPort = new ITLBPort(csprintf("%s-port", name()), this);
+            return *sqcTLBPort;
+        } else if (if_name == "ldsPort") {
+            // the CU has exactly one LDS port
+            if (ldsPort) {
+                fatal("an LDS port was already allocated");
+            }
+            ldsPort = new LDSPort(csprintf("%s-port", name()), this, idx);
+            return *ldsPort;
+        } else {
+            panic("incorrect port name");
+        }
+    }
+
+    // xact_cas_load()
+    class waveIdentifier
+    {
+      public:
+        waveIdentifier() { }
+        waveIdentifier(int _simdId, int _wfSlotId)
+            : simdId(_simdId), wfSlotId(_wfSlotId) { }
+
+        int simdId;
+        int wfSlotId;
+    };
+
+    class waveQueue
+    {
+      public:
+        std::list<waveIdentifier> waveIDQueue;
+    };
+    std::map<unsigned, waveQueue> xactCasLoadMap;
+
+    uint64_t getAndIncSeqNum() { return globalSeqNum++; }
+
+  private:
+    uint64_t globalSeqNum;
+    int wavefrontSize;
+};
+
+#endif // __COMPUTE_UNIT_HH__
diff --git a/src/gpu-compute/condition_register_state.cc b/src/gpu-compute/condition_register_state.cc
new file mode 100644
index 000000000..f3f2d2927
--- /dev/null
+++ b/src/gpu-compute/condition_register_state.cc
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos
+ */
+
+#include "gpu-compute/condition_register_state.hh"
+
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/gpu_static_inst.hh"
+#include "gpu-compute/shader.hh"
+#include "gpu-compute/wavefront.hh"
+
+// Construct an unattached, empty condition-register file. Storage is
+// allocated later via init(), and the owning CU is set via setParent().
+ConditionRegisterState::ConditionRegisterState()
+    : computeUnit(nullptr)
+{
+    c_reg.clear();
+    busy.clear();
+}
+
+/**
+ * Attach this register file to its parent compute unit and derive the
+ * stat/debug name from the CU's name.
+ */
+void
+ConditionRegisterState::setParent(ComputeUnit *_computeUnit)
+{
+    computeUnit = _computeUnit;
+    _name = computeUnit->name() + ".CondRegState";
+}
+
+/**
+ * Allocate _size condition registers, all initially marked not busy.
+ */
+void
+ConditionRegisterState::init(uint32_t _size)
+{
+    c_reg.resize(_size);
+    busy.resize(_size, 0);
+}
+
+/**
+ * Handle the condition-register side effects of issuing instruction ii
+ * from wavefront w: every condition-register destination operand is
+ * marked busy now and an event is scheduled to mark it ready again once
+ * the scalar-pipeline bypass latency has elapsed.
+ */
+void
+ConditionRegisterState::exec(GPUStaticInst *ii, Wavefront *w)
+{
+    // iterate over all operands
+    for (auto i = 0; i < ii->getNumOperands(); ++i) {
+        // is this a condition register destination operand?
+        if (ii->isCondRegister(i) && ii->isDstOperand(i)) {
+            // mark the register as busy
+            markReg(ii->getRegisterIndex(i), 1);
+            uint32_t pipeLen = w->computeUnit->spBypassLength();
+
+            // schedule an event for marking the register as ready
+            w->computeUnit->
+                registerEvent(w->simdId, ii->getRegisterIndex(i),
+                              ii->getOperandSize(i),
+                              w->computeUnit->shader->tick_cnt +
+                              w->computeUnit->shader->ticks(pipeLen), 0);
+        }
+    }
+}
diff --git a/src/gpu-compute/condition_register_state.hh b/src/gpu-compute/condition_register_state.hh
new file mode 100644
index 000000000..139874a66
--- /dev/null
+++ b/src/gpu-compute/condition_register_state.hh
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos
+ */
+
+#ifndef __CONDITION_REGISTER_STATE_HH__
+#define __CONDITION_REGISTER_STATE_HH__
+
+#include <string>
+#include <vector>
+
+#include "gpu-compute/misc.hh"
+
+class ComputeUnit;
+class GPUStaticInst;
+class Shader;
+class Wavefront;
+
+// Condition Register State (used only when executing HSAIL)
+// Condition Register State (used only when executing HSAIL)
+class ConditionRegisterState
+{
+  public:
+    ConditionRegisterState();
+    void init(uint32_t _size);
+    const std::string name() const { return _name; }
+    void setParent(ComputeUnit *_computeUnit);
+    void regStats() { }
+
+    /**
+     * Read the single-bit condition register regIdx for lane threadId,
+     * widened to type T (yields 0 or 1).
+     *
+     * Bug fix: the previous implementation reinterpreted the address of
+     * a local bool as a T* and dereferenced it. For any T wider than one
+     * byte that reads past the end of the one-byte object (undefined
+     * behavior, garbage in the upper bytes). A plain value conversion
+     * produces the intended 0/1 result for all integer widths.
+     */
+    template<typename T>
+    T
+    read(int regIdx, int threadId)
+    {
+        return static_cast<T>(c_reg[regIdx][threadId]);
+    }
+
+    /**
+     * Write the low bit of value into condition register regIdx for
+     * lane threadId.
+     */
+    template<typename T>
+    void
+    write(int regIdx, int threadId, T value)
+    {
+        c_reg[regIdx][threadId] = (bool)(value & 0x01);
+    }
+
+    // Set the busy flag of register regIdx (1 = busy, 0 = ready).
+    void
+    markReg(int regIdx, uint8_t value)
+    {
+        busy.at(regIdx) = value;
+    }
+
+    // Return the busy flag of register idx.
+    uint8_t
+    regBusy(int idx)
+    {
+        uint8_t status = busy.at(idx);
+        return status;
+    }
+
+    int numRegs() { return c_reg.size(); }
+    void exec(GPUStaticInst *ii, Wavefront *w);
+
+  private:
+    ComputeUnit* computeUnit;
+    std::string _name;
+    // Condition Register state
+    std::vector<VectorMask> c_reg;
+    // flag indicating if a register is busy
+    std::vector<uint8_t> busy;
+};
+
+#endif
diff --git a/src/gpu-compute/dispatcher.cc b/src/gpu-compute/dispatcher.cc
new file mode 100644
index 000000000..55e4be72a
--- /dev/null
+++ b/src/gpu-compute/dispatcher.cc
@@ -0,0 +1,394 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Brad Beckmann, Marc Orr
+ */
+
+
+#include "gpu-compute/dispatcher.hh"
+
+#include "cpu/base.hh"
+#include "debug/GPUDisp.hh"
+#include "gpu-compute/cl_driver.hh"
+#include "gpu-compute/cl_event.hh"
+#include "gpu-compute/shader.hh"
+#include "gpu-compute/wavefront.hh"
+#include "mem/packet_access.hh"
+
+GpuDispatcher *GpuDispatcher::instance = nullptr;
+
+/**
+ * The dispatcher is the interface between the simulated host and the
+ * GPU: it exposes a PIO register space the driver writes kernel-launch
+ * packets into, and hands workgroups to the shader. The constructor
+ * wires up the shader/driver handshakes, schedules the first tick, and
+ * allocates the translation (TLB) port.
+ */
+GpuDispatcher::GpuDispatcher(const Params *p)
+    : DmaDevice(p), _masterId(p->system->getMasterId(name() + ".disp")),
+      pioAddr(p->pio_addr), pioSize(4096), pioDelay(p->pio_latency),
+      dispatchCount(0), dispatchActive(false), cpu(p->cpu),
+      shader(p->shader_pointer), driver(p->cl_driver), tickEvent(this)
+{
+    shader->handshake(this);
+    driver->handshake(this);
+
+    ndRange.wg_disp_rem = false;
+    ndRange.globalWgId = 0;
+
+    schedule(&tickEvent, 0);
+
+    // translation port for the dispatcher
+    // bug fix: the format string previously contained a %d specifier with
+    // no matching argument; the dispatcher has a single TLB port, so no
+    // index belongs in the name
+    tlbPort = new TLBPort(csprintf("%s-port", name()), this);
+
+    num_kernelLaunched
+        .name(name() + ".num_kernel_launched")
+        .desc("number of kernel launched")
+        ;
+}
+
+/**
+ * Factory used by the parameter system. The dispatcher is a singleton
+ * so glue code can reach it later via GpuDispatcher::getInstance().
+ */
+GpuDispatcher *GpuDispatcherParams::create()
+{
+    GpuDispatcher *dispatcher = new GpuDispatcher(this);
+    GpuDispatcher::setInstance(dispatcher);
+
+    return GpuDispatcher::getInstance();
+}
+
+/**
+ * Checkpoint the dispatcher. Only the pending tick-event time is saved
+ * (0 if none is scheduled); checkpointing while workgroups are still in
+ * flight is not supported and aborts the simulation.
+ */
+void
+GpuDispatcher::serialize(CheckpointOut &cp) const
+{
+    Tick event_tick = 0;
+
+    if (ndRange.wg_disp_rem)
+        fatal("Checkpointing not supported during active workgroup execution");
+
+    if (tickEvent.scheduled())
+        event_tick = tickEvent.when();
+
+    SERIALIZE_SCALAR(event_tick);
+
+}
+
+/**
+ * Restore the dispatcher from a checkpoint: re-schedule the tick event
+ * at its saved time. An event_tick of 0 means no event was pending.
+ */
+void
+GpuDispatcher::unserialize(CheckpointIn &cp)
+{
+    Tick event_tick;
+
+    if (tickEvent.scheduled())
+        deschedule(&tickEvent);
+
+    UNSERIALIZE_SCALAR(event_tick);
+
+    if (event_tick)
+        schedule(&tickEvent, event_tick);
+}
+
+/**
+ * Report the dispatcher's PIO register space (a single range starting
+ * at pioAddr, pioSize bytes long) to the memory system.
+ */
+AddrRangeList
+GpuDispatcher::getAddrRanges() const
+{
+    DPRINTF(GPUDisp, "dispatcher registering addr range at %#x size %#x\n",
+            pioAddr, pioSize);
+
+    AddrRangeList ranges;
+    ranges.push_back(RangeSize(pioAddr, pioSize));
+
+    return ranges;
+}
+
+/**
+ * Handle a PIO read from the dispatcher's register space. The first
+ * 8 bytes hold the "dispatch active" flag; offsets past that read back
+ * the current task (HsaQueueEntry) image.
+ */
+Tick
+GpuDispatcher::read(PacketPtr pkt)
+{
+    assert(pkt->getAddr() >= pioAddr);
+    assert(pkt->getAddr() < pioAddr + pioSize);
+
+    int offset = pkt->getAddr() - pioAddr;
+    pkt->allocate();
+
+    DPRINTF(GPUDisp, " read register %#x size=%d\n", offset, pkt->getSize());
+
+    if (offset < 8) {
+        assert(!offset);
+        assert(pkt->getSize() == 8);
+
+        uint64_t retval = dispatchActive;
+        pkt->set(retval);
+    } else {
+        offset -= 8;
+        // bug fix: the bound was '<', which spuriously rejected a read
+        // ending exactly at the end of the queue entry; '<=' matches the
+        // equivalent bounds check in write()
+        assert(offset + pkt->getSize() <= sizeof(HsaQueueEntry));
+        char *curTaskPtr = (char*)&curTask;
+
+        // bug fix: the destination was fetched as getPtr<const void*>(),
+        // a pointer to const used as a memcpy destination; fetch the
+        // packet's writable payload as raw bytes instead
+        memcpy(pkt->getPtr<uint8_t>(), curTaskPtr + offset, pkt->getSize());
+    }
+
+    pkt->makeAtomicResponse();
+
+    return pioDelay;
+}
+
+/**
+ * Handle a PIO write to the dispatcher's register space. A write to
+ * offset 0 is the "launch" doorbell: it snapshots curTask into a fresh
+ * NDRange entry, queues the kernel id for dispatch, and kicks the tick
+ * event. Writes past the first 8 bytes populate the curTask
+ * (HsaQueueEntry) image piecewise.
+ */
+Tick
+GpuDispatcher::write(PacketPtr pkt)
+{
+    assert(pkt->getAddr() >= pioAddr);
+    assert(pkt->getAddr() < pioAddr + pioSize);
+
+    int offset = pkt->getAddr() - pioAddr;
+
+#if TRACING_ON
+    // decode the written value for the trace only
+    uint64_t data_val = 0;
+
+    switch (pkt->getSize()) {
+      case 1:
+        data_val = pkt->get<uint8_t>();
+        break;
+      case 2:
+        data_val = pkt->get<uint16_t>();
+        break;
+      case 4:
+        data_val = pkt->get<uint32_t>();
+        break;
+      case 8:
+        data_val = pkt->get<uint64_t>();
+        break;
+      default:
+        DPRINTF(GPUDisp, "bad size %d\n", pkt->getSize());
+    }
+
+    DPRINTF(GPUDisp, "write register %#x value %#x size=%d\n", offset, data_val,
+            pkt->getSize());
+#endif
+    if (!offset) {
+        // kernel ids are assigned sequentially across all launches
+        static int nextId = 0;
+
+        // The depends field of the qstruct, which was previously unused, is
+        // used to communicate with simulated application.
+        if (curTask.depends) {
+            HostState hs;
+            shader->ReadMem((uint64_t)(curTask.depends), &hs,
+                            sizeof(HostState), 0);
+
+            // update event start time (in nano-seconds)
+            uint64_t start = curTick() / 1000;
+
+            shader->WriteMem((uint64_t)(&((_cl_event*)hs.event)->start),
+                             &start, sizeof(uint64_t), 0);
+        }
+
+        // launch kernel
+        ++num_kernelLaunched;
+
+        NDRange *ndr = &(ndRangeMap[nextId]);
+        // copy dispatch info
+        ndr->q = curTask;
+
+        // update the numDispTask polled by the runtime
+        accessUserVar(cpu, (uint64_t)(curTask.numDispLeft), 0, 1);
+
+        ndr->numWgTotal = 1;
+
+        // workgroup counts per dimension; total is their product
+        for (int i = 0; i < 3; ++i) {
+            ndr->wgId[i] = 0;
+            ndr->numWg[i] = divCeil(curTask.gdSize[i], curTask.wgSize[i]);
+            ndr->numWgTotal *= ndr->numWg[i];
+        }
+
+        ndr->numWgCompleted = 0;
+        ndr->globalWgId = 0;
+        ndr->wg_disp_rem = true;
+        ndr->execDone = false;
+        ndr->addrToNotify = (volatile bool*)curTask.addrToNotify;
+        ndr->numDispLeft = (volatile uint32_t*)curTask.numDispLeft;
+        ndr->dispatchId = nextId;
+        ndr->curTid = pkt->req->threadId();
+        DPRINTF(GPUDisp, "launching kernel %d\n",nextId);
+        execIds.push(nextId);
+        ++nextId;
+
+        dispatchActive = true;
+
+        if (!tickEvent.scheduled()) {
+            schedule(&tickEvent, curTick() + shader->ticks(1));
+        }
+    } else {
+        // populate current task struct
+        // first 64 bits are launch reg
+        offset -= 8;
+        assert(offset < sizeof(HsaQueueEntry));
+        char *curTaskPtr = (char*)&curTask;
+        // NOTE(review): getPtr<const void*> names a pointer-to-pointer
+        // element type and only works here as a raw source address;
+        // getPtr<uint8_t> would be the clearer choice — confirm before
+        // changing, as read() uses the same idiom.
+        memcpy(curTaskPtr + offset, pkt->getPtr<const void*>(), pkt->getSize());
+    }
+
+    pkt->makeAtomicResponse();
+
+    return pioDelay;
+}
+
+
+/**
+ * Return the dispatcher's single TLB port for "translation_port";
+ * delegate every other name to the base DMA device.
+ */
+BaseMasterPort&
+GpuDispatcher::getMasterPort(const std::string &if_name, PortID idx)
+{
+    if (if_name != "translation_port")
+        return DmaDevice::getMasterPort(if_name, idx);
+
+    return *tlbPort;
+}
+
+/**
+ * Main dispatch pass: walk the queue of launched kernels and hand as
+ * many remaining workgroups as possible to the shader. A kernel whose
+ * next workgroup does not fit is re-queued so another kernel (possibly
+ * with smaller workgroups) can make progress. Afterwards, wake the host
+ * CPU if any kernels completed this cycle.
+ */
+void
+GpuDispatcher::exec()
+{
+    int fail_count = 0;
+
+    // There are potentially multiple outstanding kernel launches.
+    // It is possible that the workgroups in a different kernel
+    // can fit on the GPU even if another kernel's workgroups cannot
+    DPRINTF(GPUDisp, "Launching %d Kernels\n", execIds.size());
+
+    // stop once every still-queued kernel has failed this pass
+    while (execIds.size() > fail_count) {
+        int execId = execIds.front();
+
+        while (ndRangeMap[execId].wg_disp_rem) {
+            //update the thread context
+            shader->updateThreadContext(ndRangeMap[execId].curTid);
+
+            // attempt to dispatch_workgroup
+            if (!shader->dispatch_workgroups(&ndRangeMap[execId])) {
+                // if we failed try the next kernel,
+                // it may have smaller workgroups.
+                // put it on the queue to retry later
+                DPRINTF(GPUDisp, "kernel %d failed to launch\n", execId);
+                execIds.push(execId);
+                ++fail_count;
+                break;
+            }
+        }
+        // let's try the next kernel_id
+        execIds.pop();
+    }
+
+    DPRINTF(GPUDisp, "Returning %d Kernels\n", doneIds.size());
+
+    if (doneIds.size() && cpu) {
+        shader->hostWakeUp(cpu);
+    }
+
+    while (doneIds.size()) {
+        // wakeup the CPU if any Kernels completed this cycle
+        DPRINTF(GPUDisp, "WorkGroup %d completed\n", doneIds.front());
+        doneIds.pop();
+    }
+}
+
+/**
+ * Called when wavefront w's workgroup completes. When the last
+ * workgroup of its kernel finishes, mark the kernel done, notify the
+ * host (completion flag, numDispLeft counter, event end timestamp) and
+ * queue the kernel id for the host wake-up pass in exec(). Always
+ * re-arms the tick event so freed resources are re-used promptly.
+ */
+void
+GpuDispatcher::notifyWgCompl(Wavefront *w)
+{
+    int kern_id = w->kern_id;
+    DPRINTF(GPUDisp, "notify WgCompl %d\n",kern_id);
+    assert(ndRangeMap[kern_id].dispatchId == kern_id);
+    ndRangeMap[kern_id].numWgCompleted++;
+
+    if (ndRangeMap[kern_id].numWgCompleted == ndRangeMap[kern_id].numWgTotal) {
+        ndRangeMap[kern_id].execDone = true;
+        doneIds.push(kern_id);
+
+        if (ndRangeMap[kern_id].addrToNotify) {
+            accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].addrToNotify), 1,
+                          0);
+        }
+
+        accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].numDispLeft), 0, -1);
+
+        // update event end time (in nano-seconds)
+        if (ndRangeMap[kern_id].q.depends) {
+            HostState *host_state = (HostState*)ndRangeMap[kern_id].q.depends;
+            uint64_t event;
+            shader->ReadMem((uint64_t)(&host_state->event), &event,
+                            sizeof(uint64_t), 0);
+
+            uint64_t end = curTick() / 1000;
+
+            shader->WriteMem((uint64_t)(&((_cl_event*)event)->end), &end,
+                             sizeof(uint64_t), 0);
+        }
+    }
+
+    if (!tickEvent.scheduled()) {
+        schedule(&tickEvent, curTick() + shader->ticks(1));
+    }
+}
+
+/**
+ * Ensure a dispatch pass is pending: schedule the tick event one shader
+ * cycle from now unless it is already scheduled.
+ */
+void
+GpuDispatcher::scheduleDispatch()
+{
+    if (tickEvent.scheduled())
+        return;
+
+    schedule(&tickEvent, curTick() + shader->ticks(1));
+}
+
+/**
+ * Update an int-sized user-space variable through the shader's memory
+ * access path. With off != 0 the variable is read, incremented by off,
+ * and written back; with off == 0 it is overwritten with val. Panics
+ * if there is no host CPU to perform the access for.
+ */
+void
+GpuDispatcher::accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off)
+{
+    if (cpu) {
+        if (off) {
+            shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::ReadReq,
+                              true);
+            val += off;
+        }
+
+        shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::WriteReq, true);
+    } else {
+        panic("Cannot find host");
+    }
+}
+
+// Dispatcher tick events run at CPU tick priority.
+GpuDispatcher::TickEvent::TickEvent(GpuDispatcher *_dispatcher)
+    : Event(CPU_Tick_Pri), dispatcher(_dispatcher)
+{
+}
+
+// Run one dispatch pass when the event fires.
+void
+GpuDispatcher::TickEvent::process()
+{
+    dispatcher->exec();
+}
+
+// Human-readable event name used in trace/debug output.
+const char*
+GpuDispatcher::TickEvent::description() const
+{
+    return "GPU Dispatcher tick";
+}
+
+// helper functions for driver to retrieve GPU attributes
+// Number of compute units in the shader (reported to the driver).
+int
+GpuDispatcher::getNumCUs()
+{
+    return shader->cuList.size();
+}
+
+// Record the size of the kernel function arguments in the shader.
+void
+GpuDispatcher::setFuncargsSize(int funcargs_size)
+{
+    shader->funcargs_size = funcargs_size;
+}
diff --git a/src/gpu-compute/dispatcher.hh b/src/gpu-compute/dispatcher.hh
new file mode 100644
index 000000000..76f932655
--- /dev/null
+++ b/src/gpu-compute/dispatcher.hh
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Brad Beckmann, Marc Orr
+ */
+
+#ifndef __GPU_DISPATCHER_HH__
+#define __GPU_DISPATCHER_HH__
+
+#include <queue>
+#include <vector>
+
+#include "base/statistics.hh"
+#include "dev/dma_device.hh"
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/ndrange.hh"
+#include "gpu-compute/qstruct.hh"
+#include "mem/port.hh"
+#include "params/GpuDispatcher.hh"
+
+class BaseCPU;
+class Shader;
+
+/**
+ * The GPU dispatcher: a DMA device whose PIO register space receives
+ * kernel-launch packets from the simulated driver and which hands the
+ * resulting workgroups to the shader for execution.
+ */
+class GpuDispatcher : public DmaDevice
+{
+  public:
+    typedef GpuDispatcherParams Params;
+
+    // Periodic event driving the dispatch loop (GpuDispatcher::exec).
+    class TickEvent : public Event
+    {
+      private:
+        GpuDispatcher *dispatcher;
+
+      public:
+        TickEvent(GpuDispatcher *);
+        void process();
+        const char *description() const;
+    };
+
+    MasterID masterId() { return _masterId; }
+
+  protected:
+    MasterID _masterId;
+
+    // Base and length of PIO register space
+    Addr pioAddr;
+    Addr pioSize;
+    Tick pioDelay;
+
+    // image of the most recently written kernel launch packet
+    HsaQueueEntry curTask;
+
+    // per-kernel-id dispatch state
+    std::unordered_map<int, NDRange> ndRangeMap;
+    NDRange ndRange;
+
+    // list of kernel_ids to launch
+    std::queue<int> execIds;
+    // list of kernel_ids that have finished
+    std::queue<int> doneIds;
+
+    uint64_t dispatchCount;
+    // is there a kernel in execution?
+    bool dispatchActive;
+
+    BaseCPU *cpu;
+    Shader *shader;
+    ClDriver *driver;
+    TickEvent tickEvent;
+
+    // singleton instance; see setInstance()/getInstance() below
+    static GpuDispatcher *instance;
+
+    // syscall emulation mode can have only 1 application running(?)
+    // else we have to do some pid based tagging
+    // unused
+    typedef std::unordered_map<uint64_t, uint64_t> TranslationBuffer;
+    TranslationBuffer tlb;
+
+  public:
+    /*statistics*/
+    Stats::Scalar num_kernelLaunched;
+    GpuDispatcher(const Params *p);
+
+    ~GpuDispatcher() { }
+
+    // run one dispatch pass (driven by tickEvent)
+    void exec();
+    virtual void serialize(CheckpointOut &cp) const;
+    virtual void unserialize(CheckpointIn &cp);
+    // called by a CU when one of wavefront w's workgroups completes
+    void notifyWgCompl(Wavefront *w);
+    // ensure a dispatch pass is scheduled
+    void scheduleDispatch();
+    // read-modify-write (off != 0) or overwrite an int-sized user var
+    void accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off);
+
+    // using singleton so that glue code can pass pointer locations
+    // to the dispatcher. when there are multiple dispatchers, we can
+    // call something like getInstance(index)
+    static void
+    setInstance(GpuDispatcher *_instance)
+    {
+        instance = _instance;
+    }
+
+    static GpuDispatcher* getInstance() { return instance; }
+
+    // Master port towards the TLB hierarchy; all receive handlers are
+    // no-ops (responses are accepted and discarded).
+    class TLBPort : public MasterPort
+    {
+      public:
+
+        TLBPort(const std::string &_name, GpuDispatcher *_dispatcher)
+            : MasterPort(_name, _dispatcher), dispatcher(_dispatcher) { }
+
+      protected:
+        GpuDispatcher *dispatcher;
+
+        virtual bool recvTimingResp(PacketPtr pkt) { return true; }
+        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
+        virtual void recvFunctional(PacketPtr pkt) { }
+        virtual void recvRangeChange() { }
+        virtual void recvReqRetry() { }
+
+    };
+
+    TLBPort *tlbPort;
+
+    virtual BaseMasterPort& getMasterPort(const std::string &if_name,
+                                          PortID idx);
+
+    AddrRangeList getAddrRanges() const;
+    Tick read(PacketPtr pkt);
+    Tick write(PacketPtr pkt);
+
+    // helper functions to retrieve/set GPU attributes
+    int getNumCUs();
+    void setFuncargsSize(int funcargs_size);
+};
+
+#endif // __GPU_DISPATCHER_HH__
diff --git a/src/gpu-compute/exec_stage.cc b/src/gpu-compute/exec_stage.cc
new file mode 100644
index 000000000..c2b95f85e
--- /dev/null
+++ b/src/gpu-compute/exec_stage.cc
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos, Sooraj Puthoor
+ */
+
+#include "gpu-compute/exec_stage.hh"
+
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/wavefront.hh"
+
+/**
+ * The exec stage owns one issue slot per execution resource: numSIMDs
+ * vector ALUs plus the global- and shared-memory pipelines. Pointers to
+ * the parent CU's state are bound later, in init().
+ */
+ExecStage::ExecStage(const ComputeUnitParams *p) : numSIMDs(p->num_SIMDs),
+    numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes),
+    vectorAluInstAvail(nullptr), glbMemInstAvail(nullptr),
+    shrMemInstAvail(nullptr), lastTimeInstExecuted(false),
+    thisTimeInstExecuted(false), instrExecuted (false),
+    executionResourcesUsed(0)
+{
+    numTransActiveIdle = 0;
+    idle_dur = 0;
+}
+
+/**
+ * Late binding to the parent compute unit: cache pointers to the CU's
+ * dispatch list and instruction-availability counters/flags that the
+ * statistics collection reads each cycle.
+ */
+void
+ExecStage::init(ComputeUnit *cu)
+{
+    computeUnit = cu;
+    _name = computeUnit->name() + ".ExecStage";
+    dispatchList = &computeUnit->dispatchList;
+    vectorAluInstAvail = &(computeUnit->vectorAluInstAvail);
+    glbMemInstAvail= &(computeUnit->glbMemInstAvail);
+    shrMemInstAvail= &(computeUnit->shrMemInstAvail);
+    idle_dur = 0;
+}
+
+/**
+ * Update issue statistics for execution resource unitId at a given
+ * point in the exec pass: IdleExec (nothing issued to the unit even
+ * though work was available), BusyExec (an instruction was issued), or
+ * PostExec (end-of-cycle bookkeeping: active/idle transitions, the
+ * idle-duration histogram, and per-cycle issue counts; unitId is
+ * ignored for PostExec).
+ */
+void
+ExecStage::collectStatistics(enum STAT_STATUS stage, int unitId) {
+    if (stage == IdleExec) {
+        // count cycles of no vector ALU instruction executed
+        // even if one was the oldest in a WV of that vector SIMD unit
+        if (computeUnit->isVecAlu(unitId) && vectorAluInstAvail->at(unitId)) {
+            numCyclesWithNoInstrTypeIssued[unitId]++;
+        }
+
+        // count cycles of no global memory (vector) instruction executed
+        // even if one was the oldest in a WV of that vector SIMD unit
+        if (computeUnit->isGlbMem(unitId) && *glbMemInstAvail > 0) {
+            numCyclesWithNoInstrTypeIssued[unitId]++;
+            (*glbMemInstAvail)--;
+        }
+
+        // count cycles of no shared memory (vector) instruction executed
+        // even if one was the oldest in a WV of that vector SIMD unit
+        if (computeUnit->isShrMem(unitId) && *shrMemInstAvail > 0) {
+            numCyclesWithNoInstrTypeIssued[unitId]++;
+            (*shrMemInstAvail)--;
+        }
+    } else if (stage == BusyExec) {
+        // count the number of cycles an instruction to a specific unit
+        // was issued
+        numCyclesWithInstrTypeIssued[unitId]++;
+        thisTimeInstExecuted = true;
+        instrExecuted = true;
+        ++executionResourcesUsed;
+    } else if (stage == PostExec) {
+        // count the number of transitions from active to idle
+        if (lastTimeInstExecuted && !thisTimeInstExecuted) {
+            ++numTransActiveIdle;
+        }
+
+        // sample the idle-period length when leaving idle, otherwise
+        // keep accumulating idle cycles
+        if (!lastTimeInstExecuted && thisTimeInstExecuted) {
+            idleDur.sample(idle_dur);
+            idle_dur = 0;
+        } else if (!thisTimeInstExecuted) {
+            idle_dur++;
+        }
+
+        lastTimeInstExecuted = thisTimeInstExecuted;
+        // track the number of cycles we either issued one vector instruction
+        // or issued no instructions at all
+        if (instrExecuted) {
+            numCyclesWithInstrIssued++;
+        } else {
+            numCyclesWithNoIssue++;
+        }
+
+        spc.sample(executionResourcesUsed);
+    }
+}
+
+// Reset the per-cycle statistic trackers at the start of an exec pass.
+void
+ExecStage::initStatistics()
+{
+    thisTimeInstExecuted = false;
+    instrExecuted = false;
+    executionResourcesUsed = 0;
+}
+
+/**
+ * Execute one instruction from each execution resource whose dispatch
+ * list slot is FILLED, clearing the slot afterwards, and collect this
+ * cycle's statistics.
+ */
+void
+ExecStage::exec()
+{
+    initStatistics();
+
+    for (int unitId = 0; unitId < (numSIMDs + numMemUnits); ++unitId) {
+        // if dispatch list for this execution resource is empty,
+        // skip this execution resource this cycle
+        if (dispatchList->at(unitId).second == EMPTY) {
+            collectStatistics(IdleExec, unitId);
+            continue;
+        }
+
+        collectStatistics(BusyExec, unitId);
+        // execute an instruction for the WF
+        dispatchList->at(unitId).first->exec();
+        // clear the dispatch list entry
+        dispatchList->at(unitId).second = EMPTY;
+        dispatchList->at(unitId).first = (Wavefront*)nullptr;
+    }
+
+    collectStatistics(PostExec, 0);
+}
+
+/**
+ * Register the exec stage's statistics. Per-unit vectors are sized for
+ * all execution resources and given subnames: ALU0..ALUn for the SIMD
+ * units, then GM (global memory) and LM (local memory) pipelines.
+ */
+void
+ExecStage::regStats()
+{
+    numTransActiveIdle
+       .name(name() + ".num_transitions_active_to_idle")
+        .desc("number of CU transitions from active to idle")
+        ;
+
+    numCyclesWithNoIssue
+        .name(name() + ".num_cycles_with_no_issue")
+        .desc("number of cycles the CU issues nothing")
+        ;
+
+    numCyclesWithInstrIssued
+        .name(name() + ".num_cycles_with_instr_issued")
+        .desc("number of cycles the CU issued at least one instruction")
+        ;
+
+    spc
+        .init(0, numSIMDs + numMemUnits, 1)
+        .name(name() + ".spc")
+        .desc("Execution units active per cycle (Exec unit=SIMD,MemPipe)")
+        ;
+
+    idleDur
+        .init(0,75,5)
+        .name(name() + ".idle_duration_in_cycles")
+        .desc("duration of idle periods in cycles")
+        ;
+
+    numCyclesWithInstrTypeIssued
+        .init(numSIMDs + numMemUnits)
+        .name(name() + ".num_cycles_with_instrtype_issue")
+        .desc("Number of cycles at least one instruction of specific type "
+              "issued")
+        ;
+
+    numCyclesWithNoInstrTypeIssued
+        .init(numSIMDs + numMemUnits)
+        .name(name() + ".num_cycles_with_instr_type_no_issue")
+        .desc("Number of cycles no instruction of specific type issued")
+        ;
+
+    for (int i = 0; i < numSIMDs; ++i) {
+        numCyclesWithInstrTypeIssued.subname(i, csprintf("ALU%d",i));
+        numCyclesWithNoInstrTypeIssued.subname(i, csprintf("ALU%d",i));
+    }
+
+    numCyclesWithInstrTypeIssued.subname(numSIMDs, csprintf("GM"));
+    numCyclesWithNoInstrTypeIssued.subname(numSIMDs, csprintf("GM"));
+    numCyclesWithInstrTypeIssued.subname(numSIMDs + 1, csprintf("LM"));
+    numCyclesWithNoInstrTypeIssued.subname(numSIMDs + 1, csprintf("LM"));
+}
diff --git a/src/gpu-compute/exec_stage.hh b/src/gpu-compute/exec_stage.hh
new file mode 100644
index 000000000..2de74366b
--- /dev/null
+++ b/src/gpu-compute/exec_stage.hh
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos, Sooraj Puthoor
+ */
+
+#ifndef __EXEC_STAGE_HH__
+#define __EXEC_STAGE_HH__
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "sim/stats.hh"
+
+class ComputeUnit;
+class Wavefront;
+struct ComputeUnitParams;
+
+enum STAT_STATUS
+{
+ IdleExec,
+ BusyExec,
+ PostExec
+};
+
+enum DISPATCH_STATUS
+{
+ EMPTY = 0,
+ FILLED
+};
+
+// Execution stage.
+// Each execution resource executes the
+// wave which is in its dispatch list.
+// The schedule stage is responsible for
+// adding a wave into each execution resource's
+// dispatch list.
+
+class ExecStage
+{
+ public:
+ ExecStage(const ComputeUnitParams* params);
+ ~ExecStage() { }
+ void init(ComputeUnit *cu);
+ void exec();
+
+ std::string name() { return _name; }
+ void regStats();
+ // number of idle cycles
+ Stats::Scalar numCyclesWithNoIssue;
+ // number of busy cycles
+ Stats::Scalar numCyclesWithInstrIssued;
+ // number of cycles (per execution unit) during which at least one
+ // instruction was issued to that unit
+ Stats::Vector numCyclesWithInstrTypeIssued;
+ // number of idle cycles (per execution unit) during which the unit issued
+ // no instruction targeting that unit, even though there is at least one
+ // Wavefront with such an instruction as the oldest
+ Stats::Vector numCyclesWithNoInstrTypeIssued;
+ // SIMDs active per cycle
+ Stats::Distribution spc;
+
+ private:
+ void collectStatistics(enum STAT_STATUS stage, int unitId);
+ void initStatistics();
+ ComputeUnit *computeUnit;
+ uint32_t numSIMDs;
+
+ // Number of memory execution resources;
+ // both global and local memory execution resources in CU
+ uint32_t numMemUnits;
+
+ // List of waves which will be dispatched to
+ // each execution resource. A FILLED implies
+ // dispatch list is non-empty and
+ // execution unit has something to execute
+ // this cycle. Currently, the dispatch list of
+ // an execution resource can hold only one wave because
+ // an execution resource can execute only one wave in a cycle.
+ // dispatchList is used to communicate between schedule
+ // and exec stage
+ std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> *dispatchList;
+ // flag per vector SIMD unit that is set when there is at least one
+ // WV that has a vector ALU instruction as the oldest in its
+ // Instruction Buffer
+ std::vector<bool> *vectorAluInstAvail;
+ int *glbMemInstAvail;
+ int *shrMemInstAvail;
+ bool lastTimeInstExecuted;
+ bool thisTimeInstExecuted;
+ bool instrExecuted;
+ Stats::Scalar numTransActiveIdle;
+ Stats::Distribution idleDur;
+ uint32_t executionResourcesUsed;
+ uint64_t idle_dur;
+ std::string _name;
+};
+
+#endif // __EXEC_STAGE_HH__
diff --git a/src/gpu-compute/fetch_stage.cc b/src/gpu-compute/fetch_stage.cc
new file mode 100644
index 000000000..1f5e6ded3
--- /dev/null
+++ b/src/gpu-compute/fetch_stage.cc
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez, Sooraj Puthoor
+ */
+
+#include "gpu-compute/fetch_stage.hh"
+
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/wavefront.hh"
+
+FetchStage::FetchStage(const ComputeUnitParams* p) : numSIMDs(p->num_SIMDs),
+ computeUnit(nullptr)
+{
+ for (int j = 0; j < numSIMDs; ++j) {
+ FetchUnit newFetchUnit(p);
+ fetchUnit.push_back(newFetchUnit);
+ }
+}
+
+FetchStage::~FetchStage()
+{
+ fetchUnit.clear();
+}
+
+void
+FetchStage::init(ComputeUnit *cu)
+{
+ computeUnit = cu;
+ _name = computeUnit->name() + ".FetchStage";
+
+ for (int j = 0; j < numSIMDs; ++j) {
+ fetchUnit[j].bindWaveList(&computeUnit->wfList[j]);
+ fetchUnit[j].init(computeUnit);
+ }
+}
+
+void
+FetchStage::exec()
+{
+ for (int j = 0; j < numSIMDs; ++j) {
+ fetchUnit[j].exec();
+ }
+}
+
+void
+FetchStage::processFetchReturn(PacketPtr pkt)
+{
+ ComputeUnit::SQCPort::SenderState *sender_state =
+ safe_cast<ComputeUnit::SQCPort::SenderState*>(pkt->senderState);
+
+ Wavefront *wavefront = sender_state->wavefront;
+
+ const unsigned num_instructions = pkt->req->getSize() /
+ sizeof(TheGpuISA::RawMachInst);
+
+ instFetchInstReturned.sample(num_instructions);
+ uint32_t simdId = wavefront->simdId;
+ fetchUnit[simdId].processFetchReturn(pkt);
+}
+
+void
+FetchStage::fetch(PacketPtr pkt, Wavefront *wavefront)
+{
+ fetchUnit[wavefront->simdId].fetch(pkt, wavefront);
+}
+
+void
+FetchStage::regStats()
+{
+ instFetchInstReturned
+ .init(1, 32, 1)
+ .name(name() + ".inst_fetch_instr_returned")
+ .desc("For each instruction fetch request received record how many "
+ "instructions you got from it")
+ ;
+}
diff --git a/src/gpu-compute/fetch_stage.hh b/src/gpu-compute/fetch_stage.hh
new file mode 100644
index 000000000..ce7faa8ac
--- /dev/null
+++ b/src/gpu-compute/fetch_stage.hh
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez, Sooraj Puthoor
+ */
+
+#ifndef __FETCH_STAGE_HH__
+#define __FETCH_STAGE_HH__
+
+#include <string>
+#include <vector>
+
+#include "gpu-compute/fetch_unit.hh"
+
+// Instruction fetch stage.
+// All dispatched wavefronts for all SIMDS are analyzed for the
+// need to fetch instructions. From the fetch eligible waves,
+// one wave is selected from each SIMD and fetch is initiated
+// for the selected waves.
+
+class ComputeUnit;
+class Wavefront;
+
+class FetchStage
+{
+ public:
+ FetchStage(const ComputeUnitParams* params);
+ ~FetchStage();
+ void init(ComputeUnit *cu);
+ void exec();
+ void processFetchReturn(PacketPtr pkt);
+ void fetch(PacketPtr pkt, Wavefront *wave);
+
+ // Stats related variables and methods
+ std::string name() { return _name; }
+ void regStats();
+ Stats::Distribution instFetchInstReturned;
+
+ private:
+ uint32_t numSIMDs;
+ ComputeUnit *computeUnit;
+
+ // List of fetch units. A fetch unit is
+ // instantiated per SIMD
+ std::vector<FetchUnit> fetchUnit;
+ std::string _name;
+};
+
+#endif // __FETCH_STAGE_HH__
diff --git a/src/gpu-compute/fetch_unit.cc b/src/gpu-compute/fetch_unit.cc
new file mode 100644
index 000000000..1f0a7d78e
--- /dev/null
+++ b/src/gpu-compute/fetch_unit.cc
@@ -0,0 +1,293 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Brad Beckmann, Sooraj Puthoor
+ */
+
+#include "gpu-compute/fetch_unit.hh"
+
+#include "debug/GPUFetch.hh"
+#include "debug/GPUPort.hh"
+#include "debug/GPUTLB.hh"
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/gpu_static_inst.hh"
+#include "gpu-compute/shader.hh"
+#include "gpu-compute/wavefront.hh"
+#include "mem/ruby/system/RubySystem.hh"
+
+uint32_t FetchUnit::globalFetchUnitID;
+
+FetchUnit::FetchUnit(const ComputeUnitParams* params) :
+ timingSim(true),
+ computeUnit(nullptr),
+ fetchScheduler(params),
+ waveList(nullptr)
+{
+}
+
+FetchUnit::~FetchUnit()
+{
+ fetchQueue.clear();
+ fetchStatusQueue.clear();
+}
+
+void
+FetchUnit::init(ComputeUnit *cu)
+{
+ computeUnit = cu;
+ timingSim = computeUnit->shader->timingSim;
+ fetchQueue.clear();
+ fetchStatusQueue.resize(computeUnit->shader->n_wf);
+
+ for (int j = 0; j < computeUnit->shader->n_wf; ++j) {
+ fetchStatusQueue[j] = std::make_pair(waveList->at(j), false);
+ }
+
+ fetchScheduler.bindList(&fetchQueue);
+}
+
+void
+FetchUnit::exec()
+{
+ // re-evaluate waves which are marked as not ready for fetch
+ for (int j = 0; j < computeUnit->shader->n_wf; ++j) {
+ // Following code assumes 64-bit operation and all insts are
+ // represented by 64-bit pointers to inst objects.
+ Wavefront *curWave = fetchStatusQueue[j].first;
+ assert (curWave);
+
+ // The wavefront has to be active, the IB occupancy has to be
+ // 4 or less instructions and it can not have any branches to
+ // prevent speculative instruction fetches
+ if (!fetchStatusQueue[j].second) {
+ if (curWave->status == Wavefront::S_RUNNING &&
+ curWave->instructionBuffer.size() <= 4 &&
+ !curWave->instructionBufferHasBranch() &&
+ !curWave->pendingFetch) {
+ fetchQueue.push_back(curWave);
+ fetchStatusQueue[j].second = true;
+ }
+ }
+ }
+
+ // Fetch only if there is some wave ready to be fetched
+ // An empty fetchQueue will cause the scheduler to panic
+ if (fetchQueue.size()) {
+ Wavefront *waveToBeFetched = fetchScheduler.chooseWave();
+ waveToBeFetched->pendingFetch = true;
+ fetchStatusQueue[waveToBeFetched->wfSlotId].second = false;
+ initiateFetch(waveToBeFetched);
+ }
+}
+
+void
+FetchUnit::initiateFetch(Wavefront *wavefront)
+{
+ // calculate the virtual address to fetch from the SQC
+ Addr vaddr = wavefront->pc() + wavefront->instructionBuffer.size();
+ vaddr = wavefront->base_ptr + vaddr * sizeof(GPUStaticInst*);
+
+ DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Initiating fetch translation: %#x\n",
+ computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, vaddr);
+
+ // Since this is an instruction prefetch, if you're split then just finish
+ // out the current line.
+ unsigned block_size = RubySystem::getBlockSizeBytes();
+ // check for split accesses
+ Addr split_addr = roundDown(vaddr + block_size - 1, block_size);
+ unsigned size = block_size;
+
+ if (split_addr > vaddr) {
+ // misaligned access, just grab the rest of the line
+ size = split_addr - vaddr;
+ }
+
+ // set up virtual request
+ Request *req = new Request(0, vaddr, size, Request::INST_FETCH,
+ computeUnit->masterId(), 0, 0, 0);
+
+ PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
+ // This fetchBlock is kind of faux right now - because the translations so
+ // far don't actually return Data
+ uint64_t fetchBlock;
+ pkt->dataStatic(&fetchBlock);
+
+ if (timingSim) {
+ // SenderState needed on Return
+ pkt->senderState = new ComputeUnit::ITLBPort::SenderState(wavefront);
+
+ // Sender State needed by TLB hierarchy
+ pkt->senderState =
+ new TheISA::GpuTLB::TranslationState(BaseTLB::Execute,
+ computeUnit->shader->gpuTc,
+ false, pkt->senderState);
+
+ if (computeUnit->sqcTLBPort->isStalled()) {
+ assert(computeUnit->sqcTLBPort->retries.size() > 0);
+
+ DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n",
+ vaddr);
+
+ computeUnit->sqcTLBPort->retries.push_back(pkt);
+ } else if (!computeUnit->sqcTLBPort->sendTimingReq(pkt)) {
+ // Stall the data port;
+ // No more packet is issued till
+ // ruby indicates resources are freed by
+ // a recvReqRetry() call back on this port.
+ computeUnit->sqcTLBPort->stallPort();
+
+ DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n",
+ vaddr);
+
+ computeUnit->sqcTLBPort->retries.push_back(pkt);
+ } else {
+ DPRINTF(GPUTLB, "sent FETCH translation request for %#x\n", vaddr);
+ }
+ } else {
+ pkt->senderState =
+ new TheISA::GpuTLB::TranslationState(BaseTLB::Execute,
+ computeUnit->shader->gpuTc);
+
+ computeUnit->sqcTLBPort->sendFunctional(pkt);
+
+ TheISA::GpuTLB::TranslationState *sender_state =
+ safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
+
+ delete sender_state->tlbEntry;
+ delete sender_state;
+ // fetch the instructions from the SQC when we operate in
+ // functional mode only
+ fetch(pkt, wavefront);
+ }
+}
+
+void
+FetchUnit::fetch(PacketPtr pkt, Wavefront *wavefront)
+{
+ assert(pkt->req->hasPaddr());
+ assert(pkt->req->hasSize());
+
+ DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch Access: %#x\n",
+ computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
+ pkt->req->getPaddr());
+
+ // this is necessary because the GPU TLB receives packets instead of
+ // requests. when the translation is complete, all relevant fields in the
+ // request will be populated, but not in the packet. here we create the
+ // new packet so we can set the size, addr, and proper flags.
+ PacketPtr oldPkt = pkt;
+ pkt = new Packet(oldPkt->req, oldPkt->cmd);
+ delete oldPkt;
+
+ TheGpuISA::RawMachInst *data =
+ new TheGpuISA::RawMachInst[pkt->req->getSize() /
+ sizeof(TheGpuISA::RawMachInst)];
+
+ pkt->dataDynamic<TheGpuISA::RawMachInst>(data);
+
+ // New SenderState for the memory access
+ pkt->senderState = new ComputeUnit::SQCPort::SenderState(wavefront);
+
+ if (timingSim) {
+ // translation is done. Send the appropriate timing memory request.
+
+ if (!computeUnit->sqcPort->sendTimingReq(pkt)) {
+ computeUnit->sqcPort->retries.push_back(std::make_pair(pkt,
+ wavefront));
+
+ DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Fetch addr %#x failed!\n",
+ computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
+ pkt->req->getPaddr());
+ } else {
+ DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Fetch addr %#x sent!\n",
+ computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
+ pkt->req->getPaddr());
+ }
+ } else {
+ computeUnit->sqcPort->sendFunctional(pkt);
+ processFetchReturn(pkt);
+ }
+}
+
+void
+FetchUnit::processFetchReturn(PacketPtr pkt)
+{
+ ComputeUnit::SQCPort::SenderState *sender_state =
+ safe_cast<ComputeUnit::SQCPort::SenderState*>(pkt->senderState);
+
+ Wavefront *wavefront = sender_state->wavefront;
+
+ DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch addr %#x returned "
+ "%d bytes, %d instructions!\n", computeUnit->cu_id,
+ wavefront->simdId, wavefront->wfSlotId, pkt->req->getPaddr(),
+ pkt->req->getSize(), pkt->req->getSize() /
+ sizeof(TheGpuISA::RawMachInst));
+
+ if (wavefront->dropFetch) {
+ assert(wavefront->instructionBuffer.empty());
+ wavefront->dropFetch = false;
+ } else {
+ TheGpuISA::RawMachInst *inst_index_ptr =
+ (TheGpuISA::RawMachInst*)pkt->getPtr<uint8_t>();
+
+ assert(wavefront->instructionBuffer.size() <= 4);
+
+ for (int i = 0; i < pkt->req->getSize() /
+ sizeof(TheGpuISA::RawMachInst); ++i) {
+ GPUStaticInst *inst_ptr = decoder.decode(inst_index_ptr[i]);
+
+ assert(inst_ptr);
+ DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: added %s\n",
+ computeUnit->cu_id, wavefront->simdId,
+ wavefront->wfSlotId, inst_ptr->disassemble());
+
+ GPUDynInstPtr gpuDynInst =
+ std::make_shared<GPUDynInst>(computeUnit, wavefront, inst_ptr,
+ computeUnit->getAndIncSeqNum());
+
+ wavefront->instructionBuffer.push_back(gpuDynInst);
+ }
+ }
+
+ wavefront->pendingFetch = false;
+
+ delete pkt->senderState;
+ delete pkt->req;
+ delete pkt;
+}
+
+void
+FetchUnit::bindWaveList(std::vector<Wavefront*> *wave_list)
+{
+ waveList = wave_list;
+}
diff --git a/src/gpu-compute/fetch_unit.hh b/src/gpu-compute/fetch_unit.hh
new file mode 100644
index 000000000..c7c6afb3c
--- /dev/null
+++ b/src/gpu-compute/fetch_unit.hh
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Brad Beckmann, Sooraj Puthoor
+ */
+
+#ifndef __FETCH_UNIT_HH__
+#define __FETCH_UNIT_HH__
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arch/gpu_decoder.hh"
+#include "base/statistics.hh"
+#include "config/the_gpu_isa.hh"
+#include "gpu-compute/scheduler.hh"
+#include "mem/packet.hh"
+
+class ComputeUnit;
+class Wavefront;
+
+class FetchUnit
+{
+ public:
+ FetchUnit(const ComputeUnitParams* params);
+ ~FetchUnit();
+ void init(ComputeUnit *cu);
+ void exec();
+ void bindWaveList(std::vector<Wavefront*> *list);
+ void initiateFetch(Wavefront *wavefront);
+ void fetch(PacketPtr pkt, Wavefront *wavefront);
+ void processFetchReturn(PacketPtr pkt);
+ static uint32_t globalFetchUnitID;
+
+ private:
+ bool timingSim;
+ ComputeUnit *computeUnit;
+ TheGpuISA::Decoder decoder;
+
+ // Fetch scheduler; Selects one wave from
+ // the fetch queue for instruction fetching.
+ // The selection is made according to
+ // a scheduling policy
+ Scheduler fetchScheduler;
+
+ // Stores the list of waves that are
+ // ready to be fetched this cycle
+ std::vector<Wavefront*> fetchQueue;
+
+ // Stores the fetch status of all waves dispatched to this SIMD.
+ // TRUE implies the wave is ready to fetch and is already
+ // moved to fetchQueue
+ std::vector<std::pair<Wavefront*, bool>> fetchStatusQueue;
+
+ // Pointer to list of waves dispatched on to this SIMD unit
+ std::vector<Wavefront*> *waveList;
+};
+
+#endif // __FETCH_UNIT_HH__
diff --git a/src/gpu-compute/global_memory_pipeline.cc b/src/gpu-compute/global_memory_pipeline.cc
new file mode 100644
index 000000000..913327412
--- /dev/null
+++ b/src/gpu-compute/global_memory_pipeline.cc
@@ -0,0 +1,242 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos, Sooraj Puthoor
+ */
+
+#include "gpu-compute/global_memory_pipeline.hh"
+
+#include "debug/GPUMem.hh"
+#include "debug/GPUReg.hh"
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/shader.hh"
+#include "gpu-compute/vector_register_file.hh"
+#include "gpu-compute/wavefront.hh"
+
+GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams* p) :
+ computeUnit(nullptr), gmQueueSize(p->global_mem_queue_size),
+ inflightStores(0), inflightLoads(0)
+{
+}
+
+void
+GlobalMemPipeline::init(ComputeUnit *cu)
+{
+ computeUnit = cu;
+ globalMemSize = computeUnit->shader->globalMemSize;
+ _name = computeUnit->name() + ".GlobalMemPipeline";
+}
+
+void
+GlobalMemPipeline::exec()
+{
+ // apply any returned global memory operations
+ GPUDynInstPtr m = !gmReturnedLoads.empty() ? gmReturnedLoads.front() :
+ !gmReturnedStores.empty() ? gmReturnedStores.front() : nullptr;
+
+ bool accessVrf = true;
+ // check the VRF to see if the operands of a load (or load component
+ // of an atomic) are accessible
+ if ((m) && (m->m_op==Enums::MO_LD || MO_A(m->m_op))) {
+ Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId];
+
+ accessVrf =
+ w->computeUnit->vrf[m->simdId]->
+ vrfOperandAccessReady(m->seqNum(), w, m,
+ VrfAccessType::WRITE);
+ }
+
+ if ((!gmReturnedStores.empty() || !gmReturnedLoads.empty()) &&
+ m->latency.rdy() && computeUnit->glbMemToVrfBus.rdy() &&
+ accessVrf && m->statusBitVector == VectorMask(0) &&
+ (computeUnit->shader->coissue_return ||
+ computeUnit->wfWait.at(m->pipeId).rdy())) {
+
+ if (m->v_type == VT_32 && m->m_type == Enums::M_U8)
+ doGmReturn<uint32_t, uint8_t>(m);
+ else if (m->v_type == VT_32 && m->m_type == Enums::M_U16)
+ doGmReturn<uint32_t, uint16_t>(m);
+ else if (m->v_type == VT_32 && m->m_type == Enums::M_U32)
+ doGmReturn<uint32_t, uint32_t>(m);
+ else if (m->v_type == VT_32 && m->m_type == Enums::M_S8)
+ doGmReturn<int32_t, int8_t>(m);
+ else if (m->v_type == VT_32 && m->m_type == Enums::M_S16)
+ doGmReturn<int32_t, int16_t>(m);
+ else if (m->v_type == VT_32 && m->m_type == Enums::M_S32)
+ doGmReturn<int32_t, int32_t>(m);
+ else if (m->v_type == VT_32 && m->m_type == Enums::M_F16)
+ doGmReturn<float, Float16>(m);
+ else if (m->v_type == VT_32 && m->m_type == Enums::M_F32)
+ doGmReturn<float, float>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_U8)
+ doGmReturn<uint64_t, uint8_t>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_U16)
+ doGmReturn<uint64_t, uint16_t>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_U32)
+ doGmReturn<uint64_t, uint32_t>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_U64)
+ doGmReturn<uint64_t, uint64_t>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_S8)
+ doGmReturn<int64_t, int8_t>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_S16)
+ doGmReturn<int64_t, int16_t>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_S32)
+ doGmReturn<int64_t, int32_t>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_S64)
+ doGmReturn<int64_t, int64_t>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_F16)
+ doGmReturn<double, Float16>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_F32)
+ doGmReturn<double, float>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_F64)
+ doGmReturn<double, double>(m);
+ }
+
+ // If pipeline has executed a global memory instruction
+ // execute global memory packets and issue global
+ // memory packets to DTLB
+ if (!gmIssuedRequests.empty()) {
+ GPUDynInstPtr mp = gmIssuedRequests.front();
+ if (mp->m_op == Enums::MO_LD ||
+ (mp->m_op >= Enums::MO_AAND && mp->m_op <= Enums::MO_AMIN) ||
+ (mp->m_op >= Enums::MO_ANRAND && mp->m_op <= Enums::MO_ANRMIN)) {
+
+ if (inflightLoads >= gmQueueSize) {
+ return;
+ } else {
+ ++inflightLoads;
+ }
+ } else {
+ if (inflightStores >= gmQueueSize) {
+ return;
+ } else {
+ ++inflightStores;
+ }
+ }
+
+ mp->initiateAcc(mp);
+ gmIssuedRequests.pop();
+
+ DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = %s\n",
+ computeUnit->cu_id, mp->simdId, mp->wfSlotId,
+ Enums::MemOpTypeStrings[mp->m_op]);
+ }
+}
+
+template<typename c0, typename c1>
+void
+GlobalMemPipeline::doGmReturn(GPUDynInstPtr m)
+{
+ Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId];
+
+ // Return data to registers
+ if (m->m_op == Enums::MO_LD || MO_A(m->m_op) || MO_ANR(m->m_op)) {
+ gmReturnedLoads.pop();
+ assert(inflightLoads > 0);
+ --inflightLoads;
+
+ if (m->m_op == Enums::MO_LD || MO_A(m->m_op)) {
+ std::vector<uint32_t> regVec;
+ // iterate over number of destination register operands since
+ // this is a load or atomic operation
+ for (int k = 0; k < m->n_reg; ++k) {
+ assert((sizeof(c1) * m->n_reg) <= MAX_WIDTH_FOR_MEM_INST);
+ int dst = m->dst_reg + k;
+
+ if (m->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST)
+ dst = m->dst_reg_vec[k];
+ // virtual->physical VGPR mapping
+ int physVgpr = w->remap(dst, sizeof(c0), 1);
+ // save the physical VGPR index
+ regVec.push_back(physVgpr);
+ c1 *p1 = &((c1*)m->d_data)[k * VSZ];
+
+ for (int i = 0; i < VSZ; ++i) {
+ if (m->exec_mask[i]) {
+ DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: "
+ "$%s%d <- %d global ld done (src = wavefront "
+ "ld inst)\n", w->computeUnit->cu_id, w->simdId,
+ w->wfSlotId, i, sizeof(c0) == 4 ? "s" : "d",
+ dst, *p1);
+ // write the value into the physical VGPR. This is a
+ // purely functional operation. No timing is modeled.
+ w->computeUnit->vrf[w->simdId]->write<c0>(physVgpr,
+ *p1, i);
+ }
+ ++p1;
+ }
+ }
+
+ // Schedule the write operation of the load data on the VRF.
+ // This simply models the timing aspect of the VRF write operation.
+ // It does not modify the physical VGPR.
+ loadVrfBankConflictCycles +=
+ w->computeUnit->vrf[w->simdId]->exec(m->seqNum(),
+ w, regVec, sizeof(c0),
+ m->time);
+ }
+ } else {
+ gmReturnedStores.pop();
+ assert(inflightStores > 0);
+ --inflightStores;
+ }
+
+ // Decrement outstanding register count
+ computeUnit->shader->ScheduleAdd(&w->outstanding_reqs, m->time, -1);
+
+ if (m->m_op == Enums::MO_ST || MO_A(m->m_op) || MO_ANR(m->m_op) ||
+ MO_H(m->m_op)) {
+ computeUnit->shader->ScheduleAdd(&w->outstanding_reqs_wr_gm, m->time,
+ -1);
+ }
+
+ if (m->m_op == Enums::MO_LD || MO_A(m->m_op) || MO_ANR(m->m_op)) {
+ computeUnit->shader->ScheduleAdd(&w->outstanding_reqs_rd_gm, m->time,
+ -1);
+ }
+
+ // Mark write bus busy for appropriate amount of time
+ computeUnit->glbMemToVrfBus.set(m->time);
+ if (!computeUnit->shader->coissue_return)
+ w->computeUnit->wfWait.at(m->pipeId).set(m->time);
+}
+
+void
+GlobalMemPipeline::regStats()
+{
+ loadVrfBankConflictCycles
+ .name(name() + ".load_vrf_bank_conflict_cycles")
+ .desc("total number of cycles GM data are delayed before updating "
+ "the VRF")
+ ;
+}
diff --git a/src/gpu-compute/global_memory_pipeline.hh b/src/gpu-compute/global_memory_pipeline.hh
new file mode 100644
index 000000000..ed49f6f6b
--- /dev/null
+++ b/src/gpu-compute/global_memory_pipeline.hh
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos, Sooraj Puthoor
+ */
+
+#ifndef __GLOBAL_MEMORY_PIPELINE_HH__
+#define __GLOBAL_MEMORY_PIPELINE_HH__
+
+#include <queue>
+#include <string>
+
+#include "gpu-compute/misc.hh"
+#include "params/ComputeUnit.hh"
+#include "sim/stats.hh"
+
+/*
+ * @file global_memory_pipeline.hh
+ *
+ * The global memory pipeline issues newly created global memory packets
+ * from the pipeline to DTLB. The exec() method of the memory packet issues
+ * the packet to the DTLB if there is space available in the return fifo.
+ * This stage also retires previously issued loads and stores that have
+ * returned from the memory sub-system.
+ */
+
+class ComputeUnit;
+
+class GlobalMemPipeline
+{
+ public:
+ GlobalMemPipeline(const ComputeUnitParams *params);
+ void init(ComputeUnit *cu);
+ void exec();
+
+ template<typename c0, typename c1> void doGmReturn(GPUDynInstPtr m);
+
+ std::queue<GPUDynInstPtr> &getGMReqFIFO() { return gmIssuedRequests; }
+ std::queue<GPUDynInstPtr> &getGMStRespFIFO() { return gmReturnedStores; }
+ std::queue<GPUDynInstPtr> &getGMLdRespFIFO() { return gmReturnedLoads; }
+
+ bool
+ isGMLdRespFIFOWrRdy() const
+ {
+ return gmReturnedLoads.size() < gmQueueSize;
+ }
+
+ bool
+ isGMStRespFIFOWrRdy() const
+ {
+ return gmReturnedStores.size() < gmQueueSize;
+ }
+
+ bool
+ isGMReqFIFOWrRdy(uint32_t pendReqs=0) const
+ {
+ return (gmIssuedRequests.size() + pendReqs) < gmQueueSize;
+ }
+
+ const std::string &name() const { return _name; }
+ void regStats();
+
+ private:
+ ComputeUnit *computeUnit;
+ std::string _name;
+ int gmQueueSize;
+
+ // number of cycles of delaying the update of a VGPR that is the
+ // target of a load instruction (or the load component of an atomic)
+ // The delay is due to VRF bank conflicts
+ Stats::Scalar loadVrfBankConflictCycles;
+ // Counters to track the inflight loads and stores
+ // so that we can provide the proper backpressure
+ // on the number of inflight memory operations.
+ int inflightStores;
+ int inflightLoads;
+
+ // The size of global memory.
+ int globalMemSize;
+
+ // Global Memory Request FIFO: all global memory requests
+ // are issued to this FIFO from the memory pipelines
+ std::queue<GPUDynInstPtr> gmIssuedRequests;
+
+ // Global Store Response FIFO: all responses of global memory
+ // stores are sent to this FIFO from TCP
+ std::queue<GPUDynInstPtr> gmReturnedStores;
+
+ // Global Load Response FIFO: all responses of global memory
+ // loads are sent to this FIFO from TCP
+ std::queue<GPUDynInstPtr> gmReturnedLoads;
+};
+
+#endif // __GLOBAL_MEMORY_PIPELINE_HH__
diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc
new file mode 100644
index 000000000..83e348dbe
--- /dev/null
+++ b/src/gpu-compute/gpu_dyn_inst.cc
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#include "gpu-compute/gpu_dyn_inst.hh"
+
+#include "debug/GPUMem.hh"
+#include "gpu-compute/gpu_static_inst.hh"
+#include "gpu-compute/shader.hh"
+#include "gpu-compute/wavefront.hh"
+
+// Construct the dynamic instance of a static GPU instruction. The memory
+// operation defaults to MO_UNDEF / no memory order until a memory pipeline
+// fills them in; -1 in tlbHitLevel marks lanes with no recorded TLB hit.
+GPUDynInst::GPUDynInst(ComputeUnit *_cu, Wavefront *_wf,
+                       GPUStaticInst *_staticInst, uint64_t instSeqNum)
+    : GPUExecContext(_cu, _wf), m_op(Enums::MO_UNDEF),
+      memoryOrder(Enums::MEMORY_ORDER_NONE), useContinuation(false),
+      statusBitVector(0), staticInst(_staticInst), _seqNum(instSeqNum)
+{
+    tlbHitLevel.assign(VSZ, -1);
+}
+
+void
+GPUDynInst::execute()
+{
+    // Wrap a freshly constructed dynamic-instruction instance in a
+    // shared_ptr before handing it to the static instruction, so anything
+    // the static instruction passes the pointer to can keep it alive
+    // beyond this call. NOTE(review): this is a *new* GPUDynInst (with a
+    // reset tlbHitLevel/status state), not a shared_ptr to *this --
+    // presumably intentional; confirm against the callers.
+    GPUDynInstPtr gpuDynInst = std::make_shared<GPUDynInst>(cu, wf, staticInst,
+                                                            _seqNum);
+    staticInst->execute(gpuDynInst);
+}
+
+// The operand-count / operand-property queries below simply forward to
+// the underlying static instruction, which owns the decoded operand info.
+int
+GPUDynInst::numSrcRegOperands()
+{
+    return staticInst->numSrcRegOperands();
+}
+
+int
+GPUDynInst::numDstRegOperands()
+{
+    return staticInst->numDstRegOperands();
+}
+
+int
+GPUDynInst::getNumOperands()
+{
+    return staticInst->getNumOperands();
+}
+
+bool
+GPUDynInst::isVectorRegister(int operandIdx)
+{
+    return staticInst->isVectorRegister(operandIdx);
+}
+
+bool
+GPUDynInst::isScalarRegister(int operandIdx)
+{
+    // Fixed copy-paste bug: this previously forwarded to
+    // isVectorRegister(), so scalar-register queries silently returned
+    // the vector-register answer. GPUStaticInst declares a separate
+    // virtual isScalarRegister(), which is what must be consulted here.
+    return staticInst->isScalarRegister(operandIdx);
+}
+
+// Per-operand register queries: forwarded to the static instruction.
+int
+GPUDynInst::getRegisterIndex(int operandIdx)
+{
+    return staticInst->getRegisterIndex(operandIdx);
+}
+
+int
+GPUDynInst::getOperandSize(int operandIdx)
+{
+    return staticInst->getOperandSize(operandIdx);
+}
+
+bool
+GPUDynInst::isDstOperand(int operandIdx)
+{
+    return staticInst->isDstOperand(operandIdx);
+}
+
+bool
+GPUDynInst::isSrcOperand(int operandIdx)
+{
+    return staticInst->isSrcOperand(operandIdx);
+}
+
+bool
+GPUDynInst::isArgLoad()
+{
+    return staticInst->isArgLoad();
+}
+
+const std::string&
+GPUDynInst::disassemble() const
+{
+    return staticInst->disassemble();
+}
+
+// Sequence number assigned at construction time.
+uint64_t
+GPUDynInst::seqNum() const
+{
+    return _seqNum;
+}
+
+// Operation type (ALU, branch, memory, ...) of the static instruction.
+Enums::OpType
+GPUDynInst::opType()
+{
+    return staticInst->o_type;
+}
+
+// For flat accesses: the storage class the access actually resolved to.
+Enums::StorageClassType
+GPUDynInst::executedAs()
+{
+    return staticInst->executed_as;
+}
+
+// Process a memory instruction and (if necessary) submit timing request
+void
+GPUDynInst::initiateAcc(GPUDynInstPtr gpuDynInst)
+{
+    DPRINTF(GPUMem, "CU%d: WF[%d][%d]: mempacket status bitvector=%#x\n",
+            cu->cu_id, simdId, wfSlotId, exec_mask);
+
+    staticInst->initiateAcc(gpuDynInst);
+    // Reset the access timestamp; presumably re-stamped when the request
+    // is actually issued -- TODO confirm against the memory pipelines.
+    time = 0;
+}
+
+// Whether the underlying static instruction is a scalar (vs vector) op.
+bool
+GPUDynInst::scalarOp() const
+{
+    return staticInst->scalarOp();
+}
+
+// Update the owning compute unit's dynamic memory-instruction statistics.
+// For global accesses this also samples the page-divergence histogram and
+// folds the per-page touch counts gathered during address generation into
+// the CU-wide page-access table, then clears the scratch map.
+void
+GPUDynInst::updateStats()
+{
+    if (staticInst->isLocalMem()) {
+        // access to LDS (shared) memory
+        cu->dynamicLMemInstrCnt++;
+    } else {
+        // access to global memory
+
+        // update PageDivergence histogram
+        int number_pages_touched = cu->pagesTouched.size();
+        assert(number_pages_touched);
+        cu->pageDivergenceDist.sample(number_pages_touched);
+
+        // Iterate by const reference (the original copied each map entry
+        // per iteration) and keep the insertion result in loop scope.
+        for (const auto &page : cu->pagesTouched) {
+            // see if this page has been touched before. if not, this also
+            // inserts the page into the table.
+            auto ret = cu->pageAccesses
+                .insert(ComputeUnit::pageDataStruct::value_type(page.first,
+                        std::make_pair(1, page.second)));
+
+            // if yes, then update the stats
+            if (!ret.second) {
+                ret.first->second.first++;
+                ret.first->second.second += page.second;
+            }
+        }
+
+        cu->pagesTouched.clear();
+
+        // total number of memory instructions (dynamic)
+        // Atomics are counted as a single memory instruction.
+        // this is # memory instructions per wavefronts, not per workitem
+        cu->dynamicGMemInstrCnt++;
+    }
+}
diff --git a/src/gpu-compute/gpu_dyn_inst.hh b/src/gpu-compute/gpu_dyn_inst.hh
new file mode 100644
index 000000000..e44d8f80d
--- /dev/null
+++ b/src/gpu-compute/gpu_dyn_inst.hh
@@ -0,0 +1,464 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#ifndef __GPU_DYN_INST_HH__
+#define __GPU_DYN_INST_HH__
+
+#include <cstdint>
+#include <string>
+
+#include "enums/GenericMemoryOrder.hh"
+#include "enums/GenericMemoryScope.hh"
+#include "enums/MemOpType.hh"
+#include "enums/MemType.hh"
+#include "enums/OpType.hh"
+#include "enums/StorageClassType.hh"
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/gpu_exec_context.hh"
+
+class GPUStaticInst;
+
+// Atomic fetch-and-AND: *b &= a.
+template<typename T>
+class AtomicOpAnd : public TypedAtomicOpFunctor<T>
+{
+  public:
+    T a;
+
+    AtomicOpAnd(T _a) : a(_a) { }
+    void execute(T *b) { *b &= a; }
+};
+
+// Atomic fetch-and-OR: *b |= a.
+template<typename T>
+class AtomicOpOr : public TypedAtomicOpFunctor<T>
+{
+  public:
+    T a;
+    AtomicOpOr(T _a) : a(_a) { }
+    void execute(T *b) { *b |= a; }
+};
+
+// Atomic fetch-and-XOR: *b ^= a.
+template<typename T>
+class AtomicOpXor : public TypedAtomicOpFunctor<T>
+{
+  public:
+    T a;
+    AtomicOpXor(T _a) : a(_a) {}
+    void execute(T *b) { *b ^= a; }
+};
+
+// Compare-and-swap: if *b equals the compare value c, store the swap
+// value s. Successful and failed attempts are counted on the owning
+// compute unit; in xact_cas_mode the CU's transactional load map is
+// cleared after every CAS.
+template<typename T>
+class AtomicOpCAS : public TypedAtomicOpFunctor<T>
+{
+  public:
+    T c;
+    T s;
+
+    ComputeUnit *computeUnit;
+
+    AtomicOpCAS(T _c, T _s, ComputeUnit *compute_unit)
+      : c(_c), s(_s), computeUnit(compute_unit) { }
+
+    void
+    execute(T *b)
+    {
+        computeUnit->numCASOps++;
+
+        if (*b == c) {
+            *b = s;
+        } else {
+            computeUnit->numFailedCASOps++;
+        }
+
+        if (computeUnit->xact_cas_mode) {
+            computeUnit->xactCasLoadMap.clear();
+        }
+    }
+};
+
+// Atomic exchange: *b = a (old value observed by the caller).
+template<typename T>
+class AtomicOpExch : public TypedAtomicOpFunctor<T>
+{
+  public:
+    T a;
+    AtomicOpExch(T _a) : a(_a) { }
+    void execute(T *b) { *b = a; }
+};
+
+// Atomic fetch-and-add: *b += a.
+template<typename T>
+class AtomicOpAdd : public TypedAtomicOpFunctor<T>
+{
+  public:
+    T a;
+    AtomicOpAdd(T _a) : a(_a) { }
+    void execute(T *b) { *b += a; }
+};
+
+// Atomic fetch-and-subtract: *b -= a.
+template<typename T>
+class AtomicOpSub : public TypedAtomicOpFunctor<T>
+{
+  public:
+    T a;
+    AtomicOpSub(T _a) : a(_a) { }
+    void execute(T *b) { *b -= a; }
+};
+
+// Atomic increment by one.
+template<typename T>
+class AtomicOpInc : public TypedAtomicOpFunctor<T>
+{
+  public:
+    AtomicOpInc() { }
+    void execute(T *b) { *b += 1; }
+};
+
+// Atomic decrement by one.
+template<typename T>
+class AtomicOpDec : public TypedAtomicOpFunctor<T>
+{
+  public:
+    AtomicOpDec() {}
+    void execute(T *b) { *b -= 1; }
+};
+
+// Atomic maximum: *b = max(*b, a).
+template<typename T>
+class AtomicOpMax : public TypedAtomicOpFunctor<T>
+{
+  public:
+    T a;
+    AtomicOpMax(T _a) : a(_a) { }
+
+    void
+    execute(T *b)
+    {
+        if (a > *b)
+            *b = a;
+    }
+};
+
+// Atomic minimum: *b = min(*b, a).
+template<typename T>
+class AtomicOpMin : public TypedAtomicOpFunctor<T>
+{
+  public:
+    T a;
+    AtomicOpMin(T _a) : a(_a) {}
+
+    void
+    execute(T *b)
+    {
+        if (a < *b)
+            *b = a;
+    }
+};
+
+// Range tests over Enums::MemOpType: MO_A* are atomics that return the
+// old value and MO_ANR* their no-return variants (see setRequestFlags).
+// NOTE(review): confirm what the MO_H* "H" prefix denotes -- it is not
+// used anywhere in this file.
+#define MO_A(a) ((a)>=Enums::MO_AAND && (a)<=Enums::MO_AMIN)
+#define MO_ANR(a) ((a)>=Enums::MO_ANRAND && (a)<=Enums::MO_ANRMIN)
+#define MO_H(a) ((a)>=Enums::MO_HAND && (a)<=Enums::MO_HMIN)
+
+// Width of the destination vector register(s) written on a memory return.
+typedef enum
+{
+    VT_32,
+    VT_64,
+} vgpr_type;
+
+// Memory segment a request targets (maps onto Request space-config flags
+// in GPUDynInst::setRequestFlags).
+typedef enum
+{
+    SEG_PRIVATE,
+    SEG_SPILL,
+    SEG_GLOBAL,
+    SEG_SHARED,
+    SEG_READONLY,
+    SEG_FLAT
+} seg_type;
+
+// Dynamic instance of a GPU instruction: pairs a GPUStaticInst with the
+// per-execution state (addresses, data, execution mask, scheduling ids,
+// memory-request metadata) needed to run it on a compute unit.
+class GPUDynInst : public GPUExecContext
+{
+  public:
+    GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *_staticInst,
+               uint64_t instSeqNum);
+
+    void execute();
+    int numSrcRegOperands();
+    int numDstRegOperands();
+    int getNumOperands();
+    bool isVectorRegister(int operandIdx);
+    bool isScalarRegister(int operandIdx);
+    int getRegisterIndex(int operandIdx);
+    int getOperandSize(int operandIdx);
+    bool isDstOperand(int operandIdx);
+    bool isSrcOperand(int operandIdx);
+    bool isArgLoad();
+
+    const std::string &disassemble() const;
+
+    uint64_t seqNum() const;
+
+    Enums::OpType opType();
+    Enums::StorageClassType executedAs();
+
+    // The address of the memory operation
+    Addr addr[VSZ];
+    Addr pAddr;
+
+    // The data to get written
+    uint8_t d_data[VSZ * 16];
+    // Additional data (for atomics)
+    uint8_t a_data[VSZ * 8];
+    // Additional data (for atomics)
+    uint8_t x_data[VSZ * 8];
+    // The execution mask
+    VectorMask exec_mask;
+
+    // The memory type (M_U32, M_S32, ...)
+    Enums::MemType m_type;
+    // The memory operation (MO_LD, MO_ST, ...)
+    Enums::MemOpType m_op;
+    Enums::GenericMemoryOrder memoryOrder;
+
+    // Scope of the request
+    Enums::GenericMemoryScope scope;
+    // The memory segment (SEG_SHARED, SEG_GLOBAL, ...)
+    seg_type s_type;
+    // The equivalency class
+    int equiv;
+    // The return VGPR type (VT_32 or VT_64)
+    vgpr_type v_type;
+    // Number of VGPR's accessed (1, 2, or 4)
+    int n_reg;
+    // The return VGPR index
+    int dst_reg;
+    // There can be max 4 dest regs
+    int dst_reg_vec[4];
+    // SIMD where the WF of the memory instruction has been mapped to
+    int simdId;
+    // unique id of the WF where the memory instruction belongs to
+    int wfDynId;
+    // The kernel id of the requesting wf
+    int kern_id;
+    // The CU id of the requesting wf
+    int cu_id;
+    // HW slot id where the WF is mapped to inside a SIMD unit
+    int wfSlotId;
+    // execution pipeline id where the memory instruction has been scheduled
+    int pipeId;
+    // The execution time of this operation
+    Tick time;
+    // The latency of this operation
+    WaitClass latency;
+    // A list of bank conflicts for the 4 cycles.
+    uint32_t bc[4];
+
+    // A pointer to ROM
+    uint8_t *rom;
+    // The size of the READONLY segment
+    int sz_rom;
+
+    // Initiate the specified memory operation, by creating a
+    // memory request and sending it off to the memory system.
+    void initiateAcc(GPUDynInstPtr gpuDynInst);
+
+    void updateStats();
+
+    GPUStaticInst* staticInstruction() { return staticInst; }
+
+    // Is the instruction a scalar or vector op?
+    bool scalarOp() const;
+
+    /*
+     * Loads/stores/atomics may have acquire/release semantics associated
+     * with them. Some protocols want to see the acquire/release as separate
+     * requests from the load/store/atomic. We implement that separation
+     * using continuations (i.e., a function pointer with an object associated
+     * with it). When, for example, the front-end generates a store with
+     * release semantics, we will first issue a normal store and set the
+     * continuation in the GPUDynInst to a function that generates a
+     * release request. That continuation will be called when the normal
+     * store completes (in ComputeUnit::DataPort::recvTimingResponse). The
+     * continuation will be called in the context of the same GPUDynInst
+     * that generated the initial store.
+     */
+    std::function<void(GPUStaticInst*, GPUDynInstPtr)> execContinuation;
+
+    // when true, call execContinuation when response arrives
+    bool useContinuation;
+
+    // Build the atomic functor matching the requested memory operation;
+    // reg1 is only consulted for CAS (the swap value).
+    template<typename c0> AtomicOpFunctor*
+    makeAtomicOpFunctor(c0 *reg0, c0 *reg1, Enums::MemOpType op)
+    {
+        using namespace Enums;
+
+        switch(op) {
+          case MO_AAND:
+          case MO_ANRAND:
+            return new AtomicOpAnd<c0>(*reg0);
+          case MO_AOR:
+          case MO_ANROR:
+            return new AtomicOpOr<c0>(*reg0);
+          case MO_AXOR:
+          case MO_ANRXOR:
+            return new AtomicOpXor<c0>(*reg0);
+          case MO_ACAS:
+          case MO_ANRCAS:
+            return new AtomicOpCAS<c0>(*reg0, *reg1, cu);
+          case MO_AEXCH:
+          case MO_ANREXCH:
+            return new AtomicOpExch<c0>(*reg0);
+          case MO_AADD:
+          case MO_ANRADD:
+            return new AtomicOpAdd<c0>(*reg0);
+          case MO_ASUB:
+          case MO_ANRSUB:
+            return new AtomicOpSub<c0>(*reg0);
+          case MO_AINC:
+          case MO_ANRINC:
+            return new AtomicOpInc<c0>();
+          case MO_ADEC:
+          case MO_ANRDEC:
+            return new AtomicOpDec<c0>();
+          case MO_AMAX:
+          case MO_ANRMAX:
+            return new AtomicOpMax<c0>(*reg0);
+          case MO_AMIN:
+          case MO_ANRMIN:
+            return new AtomicOpMin<c0>(*reg0);
+          default:
+            panic("Unrecognized atomic operation");
+        }
+    }
+
+    // Translate this instruction's segment, scope, memory order, and
+    // atomic kind into the corresponding Request flags.
+    void
+    setRequestFlags(Request *req, bool setMemOrder=true)
+    {
+        // currently these are the easy scopes to deduce
+        switch (s_type) {
+          case SEG_PRIVATE:
+            req->setMemSpaceConfigFlags(Request::PRIVATE_SEGMENT);
+            break;
+          case SEG_SPILL:
+            req->setMemSpaceConfigFlags(Request::SPILL_SEGMENT);
+            break;
+          case SEG_GLOBAL:
+            req->setMemSpaceConfigFlags(Request::GLOBAL_SEGMENT);
+            break;
+          case SEG_READONLY:
+            req->setMemSpaceConfigFlags(Request::READONLY_SEGMENT);
+            break;
+          case SEG_SHARED:
+            req->setMemSpaceConfigFlags(Request::GROUP_SEGMENT);
+            break;
+          case SEG_FLAT:
+            // TODO: translate flat accesses to the segment they actually
+            // target. Until then fail loudly in all build types; the
+            // previous assert(false) is compiled out in fast builds and
+            // fell through to the misleading "Bad segment type" panic.
+            panic("Flat segment requests not yet supported");
+            break;
+          default:
+            panic("Bad segment type");
+            break;
+        }
+
+        switch (scope) {
+          case Enums::MEMORY_SCOPE_NONE:
+          case Enums::MEMORY_SCOPE_WORKITEM:
+            break;
+          case Enums::MEMORY_SCOPE_WAVEFRONT:
+            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
+                                        Request::WAVEFRONT_SCOPE);
+            break;
+          case Enums::MEMORY_SCOPE_WORKGROUP:
+            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
+                                        Request::WORKGROUP_SCOPE);
+            break;
+          case Enums::MEMORY_SCOPE_DEVICE:
+            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
+                                        Request::DEVICE_SCOPE);
+            break;
+          case Enums::MEMORY_SCOPE_SYSTEM:
+            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
+                                        Request::SYSTEM_SCOPE);
+            break;
+          default:
+            panic("Bad scope type");
+            break;
+        }
+
+        if (setMemOrder) {
+            // set acquire and release flags
+            switch (memoryOrder){
+              case Enums::MEMORY_ORDER_SC_ACQUIRE:
+                req->setFlags(Request::ACQUIRE);
+                break;
+              case Enums::MEMORY_ORDER_SC_RELEASE:
+                req->setFlags(Request::RELEASE);
+                break;
+              case Enums::MEMORY_ORDER_SC_ACQUIRE_RELEASE:
+                req->setFlags(Request::ACQUIRE | Request::RELEASE);
+                break;
+              default:
+                break;
+            }
+        }
+
+        // set atomic type
+        // currently, the instruction generator only produces atomic return
+        // but a magic instruction can produce atomic no return
+        if (m_op == Enums::MO_AADD || m_op == Enums::MO_ASUB ||
+            m_op == Enums::MO_AAND || m_op == Enums::MO_AOR ||
+            m_op == Enums::MO_AXOR || m_op == Enums::MO_AMAX ||
+            m_op == Enums::MO_AMIN || m_op == Enums::MO_AINC ||
+            m_op == Enums::MO_ADEC || m_op == Enums::MO_AEXCH ||
+            m_op == Enums::MO_ACAS) {
+            req->setFlags(Request::ATOMIC_RETURN_OP);
+        } else if (m_op == Enums::MO_ANRADD || m_op == Enums::MO_ANRSUB ||
+                   m_op == Enums::MO_ANRAND || m_op == Enums::MO_ANROR ||
+                   m_op == Enums::MO_ANRXOR || m_op == Enums::MO_ANRMAX ||
+                   m_op == Enums::MO_ANRMIN || m_op == Enums::MO_ANRINC ||
+                   m_op == Enums::MO_ANRDEC || m_op == Enums::MO_ANREXCH ||
+                   m_op == Enums::MO_ANRCAS) {
+            req->setFlags(Request::ATOMIC_NO_RETURN_OP);
+        }
+    }
+
+    // Map returned packets and the addresses they satisfy with which lane they
+    // were requested from
+    typedef std::unordered_map<Addr, std::vector<int>> StatusVector;
+    StatusVector memStatusVector;
+
+    // Track the status of memory requests per lane, a bit per lane
+    VectorMask statusBitVector;
+    // for ld_v# or st_v#
+    std::vector<int> statusVector;
+    std::vector<int> tlbHitLevel;
+
+  private:
+    GPUStaticInst *staticInst;
+    uint64_t _seqNum;
+};
+
+#endif // __GPU_DYN_INST_HH__
diff --git a/src/gpu-compute/gpu_exec_context.cc b/src/gpu-compute/gpu_exec_context.cc
new file mode 100644
index 000000000..4af69c41e
--- /dev/null
+++ b/src/gpu-compute/gpu_exec_context.cc
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#include "gpu-compute/gpu_exec_context.hh"
+
+// Thin execution-context base: records the compute unit and wavefront an
+// instruction executes on; neither pointer is owned by this object.
+GPUExecContext::GPUExecContext(ComputeUnit *_cu, Wavefront *_wf)
+    : cu(_cu), wf(_wf)
+{
+}
+
+// Compute unit this context is bound to.
+ComputeUnit*
+GPUExecContext::computeUnit()
+{
+    return cu;
+}
+
+// Wavefront this context is bound to.
+Wavefront*
+GPUExecContext::wavefront()
+{
+    return wf;
+}
diff --git a/src/gpu-compute/gpu_exec_context.hh b/src/gpu-compute/gpu_exec_context.hh
new file mode 100644
index 000000000..a3deb9b8f
--- /dev/null
+++ b/src/gpu-compute/gpu_exec_context.hh
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#ifndef __GPU_EXEC_CONTEXT_HH__
+#define __GPU_EXEC_CONTEXT_HH__
+
+class ComputeUnit;
+class Wavefront;
+
+// Minimal base class tying a GPU instruction's execution to its compute
+// unit and wavefront; GPUDynInst derives from this. Holds non-owning
+// pointers set at construction.
+class GPUExecContext
+{
+  public:
+    GPUExecContext(ComputeUnit *_cu, Wavefront *_wf);
+    Wavefront* wavefront();
+    ComputeUnit* computeUnit();
+
+  protected:
+    // Compute unit this context runs on.
+    ComputeUnit *cu;
+    // Wavefront the instruction belongs to.
+    Wavefront *wf;
+};
+
+#endif // __GPU_EXEC_CONTEXT_HH__
diff --git a/src/gpu-compute/gpu_static_inst.cc b/src/gpu-compute/gpu_static_inst.cc
new file mode 100644
index 000000000..bcb8a5f3d
--- /dev/null
+++ b/src/gpu-compute/gpu_static_inst.cc
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#include "gpu-compute/gpu_static_inst.hh"
+
+// Construct the static instruction with its textual opcode; derived
+// classes overwrite o_type/executed_as/_scalarOp as appropriate.
+GPUStaticInst::GPUStaticInst(const std::string &opcode)
+    : o_type(Enums::OT_ALU), executed_as(Enums::SC_NONE), opcode(opcode),
+      _instNum(0), _ipdInstNum(-1), _scalarOp(false)
+{
+    // _ipdInstNum is now initialized to -1 ("no post-dominator recorded");
+    // it was previously left indeterminate, so ipdInstNum() returned
+    // garbage if ipdInstNum(int) was never called.
+}
diff --git a/src/gpu-compute/gpu_static_inst.hh b/src/gpu-compute/gpu_static_inst.hh
new file mode 100644
index 000000000..c1de28427
--- /dev/null
+++ b/src/gpu-compute/gpu_static_inst.hh
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#ifndef __GPU_STATIC_INST_HH__
+#define __GPU_STATIC_INST_HH__
+
+/*
+ * @file gpu_static_inst.hh
+ *
+ * Defines the base class representing static instructions for the GPU. The
+ * instructions are "static" because they contain no dynamic instruction
+ * information. GPUStaticInst corresponds to the StaticInst class for the CPU
+ * models.
+ */
+
+#include <cstdint>
+#include <string>
+
+#include "enums/OpType.hh"
+#include "enums/StorageClassType.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/misc.hh"
+
+class BaseOperand;
+class BaseRegOperand;
+class Wavefront;
+
+// Base class for decoded, immutable GPU instructions; one instance per
+// static instruction, shared by every GPUDynInst that executes it.
+class GPUStaticInst
+{
+  public:
+    GPUStaticInst(const std::string &opcode);
+
+    // Virtual destructor added: this class is used polymorphically (it is
+    // full of pure virtuals and is manipulated through base pointers), so
+    // deleting through GPUStaticInst* must reach the derived destructor.
+    virtual ~GPUStaticInst() { }
+
+    void instNum(int num) { _instNum = num; }
+
+    int instNum() { return _instNum; }
+
+    void ipdInstNum(int num) { _ipdInstNum = num; }
+
+    int ipdInstNum() const { return _ipdInstNum; }
+
+    virtual void execute(GPUDynInstPtr gpuDynInst) = 0;
+    virtual void generateDisassembly() = 0;
+    virtual const std::string &disassemble() = 0;
+    virtual int getNumOperands() = 0;
+    virtual bool isCondRegister(int operandIndex) = 0;
+    virtual bool isScalarRegister(int operandIndex) = 0;
+    virtual bool isVectorRegister(int operandIndex) = 0;
+    virtual bool isSrcOperand(int operandIndex) = 0;
+    virtual bool isDstOperand(int operandIndex) = 0;
+    virtual int getOperandSize(int operandIndex) = 0;
+    virtual int getRegisterIndex(int operandIndex) = 0;
+    virtual int numDstRegOperands() = 0;
+    virtual int numSrcRegOperands() = 0;
+
+    /*
+     * Most instructions (including all HSAIL instructions)
+     * are vector ops, so _scalarOp will be false by default.
+     * Derived instruction objects that are scalar ops must
+     * set _scalarOp to true in their constructors.
+     */
+    bool scalarOp() const { return _scalarOp; }
+
+    virtual bool isLocalMem() const
+    {
+        fatal("calling isLocalMem() on non-memory instruction.\n");
+
+        return false;
+    }
+
+    // NOTE(review): isArgLoad() is non-virtual and always false here, yet
+    // GPUDynInst::isArgLoad() forwards to it through a base pointer, so
+    // that query can never be true -- confirm whether this was meant to
+    // be virtual so derived load instructions can override it.
+    bool isArgLoad() { return false; }
+    virtual uint32_t instSize() = 0;
+
+    // only used for memory instructions
+    virtual void
+    initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+        fatal("calling initiateAcc() on a non-memory instruction.\n");
+    }
+
+    virtual uint32_t getTargetPc() { return 0; }
+
+    /**
+     * Query whether the instruction is an unconditional jump i.e., the jump
+     * is always executed because there is no condition to be evaluated.
+     *
+     * If the instruction is not of branch type, the result is always false.
+     *
+     * @return True if the instruction is an unconditional jump.
+     */
+    virtual bool unconditionalJumpInstruction() { return false; }
+
+    static uint64_t dynamic_id_count;
+
+    Enums::OpType o_type;
+    // For flat memory accesses
+    Enums::StorageClassType executed_as;
+
+  protected:
+    virtual void
+    execLdAcq(GPUDynInstPtr gpuDynInst)
+    {
+        fatal("calling execLdAcq() on a non-load instruction.\n");
+    }
+
+    virtual void
+    execSt(GPUDynInstPtr gpuDynInst)
+    {
+        // Fixed copy-pasted diagnostic: this previously reported
+        // "execLdAcq() on a non-load instruction".
+        fatal("calling execSt() on a non-store instruction.\n");
+    }
+
+    virtual void
+    execAtomic(GPUDynInstPtr gpuDynInst)
+    {
+        fatal("calling execAtomic() on a non-atomic instruction.\n");
+    }
+
+    virtual void
+    execAtomicAcq(GPUDynInstPtr gpuDynInst)
+    {
+        fatal("calling execAtomicAcq() on a non-atomic instruction.\n");
+    }
+
+    const std::string opcode;
+    std::string disassembly;
+    int _instNum;
+    /**
+     * Identifier of the immediate post-dominator instruction.
+     */
+    int _ipdInstNum;
+
+    bool _scalarOp;
+};
+
+#endif // __GPU_STATIC_INST_HH__
diff --git a/src/gpu-compute/gpu_tlb.cc b/src/gpu-compute/gpu_tlb.cc
new file mode 100644
index 000000000..de005fd04
--- /dev/null
+++ b/src/gpu-compute/gpu_tlb.cc
@@ -0,0 +1,1801 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Lisa Hsu
+ */
+
+#include "gpu-compute/gpu_tlb.hh"
+
+#include <cmath>
+#include <cstring>
+
+#include "arch/x86/faults.hh"
+#include "arch/x86/insts/microldstop.hh"
+#include "arch/x86/pagetable.hh"
+#include "arch/x86/pagetable_walker.hh"
+#include "arch/x86/regs/misc.hh"
+#include "arch/x86/x86_traits.hh"
+#include "base/bitfield.hh"
+#include "base/output.hh"
+#include "base/trace.hh"
+#include "cpu/base.hh"
+#include "cpu/thread_context.hh"
+#include "debug/GPUPrefetch.hh"
+#include "debug/GPUTLB.hh"
+#include "mem/packet_access.hh"
+#include "mem/page_table.hh"
+#include "mem/request.hh"
+#include "sim/process.hh"
+
+namespace X86ISA
+{
+
+ // Construct a set-associative (or fully-associative when size ==
+ // assoc) GPU TLB. Builds the backing entry array, the per-set free
+ // lists, and the CPU-side/memory-side ports from the Python params.
+ GpuTLB::GpuTLB(const Params *p)
+ : MemObject(p), configAddress(0), size(p->size),
+ cleanupEvent(this, false, Event::Maximum_Pri), exitEvent(this)
+ {
+ assoc = p->assoc;
+ assert(assoc <= size);
+ numSets = size/assoc;
+ allocationPolicy = p->allocationPolicy;
+ hasMemSidePort = false;
+ accessDistance = p->accessDistance;
+ clock = p->clk_domain->clockPeriod();
+
+ // Flat array of entries; the per-set lists below hold pointers
+ // into it. NOTE(review): memset over non-trivial entries assumes
+ // GpuTlbEntry is zero-initializable — confirm against its header.
+ tlb = new GpuTlbEntry[size];
+ std::memset(tlb, 0, sizeof(GpuTlbEntry) * size);
+
+ freeList.resize(numSets);
+ entryList.resize(numSets);
+
+ // Initially every way of every set is free.
+ for (int set = 0; set < numSets; ++set) {
+ for (int way = 0; way < assoc; ++way) {
+ int x = set*assoc + way;
+ freeList[set].push_back(&tlb[x]);
+ }
+ }
+
+ FA = (size == assoc);
+
+ /**
+ * @warning: the set-associative version assumes you have a
+ * fixed page size of 4KB.
+ * If the page size is greather than 4KB (as defined in the
+ * TheISA::PageBytes), then there are various issues w/ the current
+ * implementation (you'd have the same 8KB page being replicated in
+ * different sets etc)
+ */
+ setMask = numSets - 1;
+
+ #if 0
+ // GpuTLB doesn't yet support full system
+ walker = p->walker;
+ walker->setTLB(this);
+ #endif
+
+ maxCoalescedReqs = p->maxOutstandingReqs;
+
+ // Do not allow maxCoalescedReqs to be more than the TLB associativity
+ if (maxCoalescedReqs > assoc) {
+ maxCoalescedReqs = assoc;
+ cprintf("Forcing maxCoalescedReqs to %d (TLB assoc.) \n", assoc);
+ }
+
+ outstandingReqs = 0;
+ hitLatency = p->hitLatency;
+ missLatency1 = p->missLatency1;
+ missLatency2 = p->missLatency2;
+
+ // create the slave ports based on the number of connected ports
+ for (size_t i = 0; i < p->port_slave_connection_count; ++i) {
+ cpuSidePort.push_back(new CpuSidePort(csprintf("%s-port%d",
+ name(), i), this, i));
+ }
+
+ // create the master ports based on the number of connected ports
+ for (size_t i = 0; i < p->port_master_connection_count; ++i) {
+ memSidePort.push_back(new MemSidePort(csprintf("%s-port%d",
+ name(), i), this, i));
+ }
+ }
+
+ // fixme: this is never called?
+ // Destructor: requires that no translation-return events are still
+ // pending, then releases the backing entry array.
+ GpuTLB::~GpuTLB()
+ {
+ // make sure all the hash-maps are empty
+ assert(translationReturnEvent.empty());
+
+ // delete the TLB
+ delete[] tlb;
+ }
+
+ // Return the idx-th CPU-side (slave) port. Panics on an unknown port
+ // name or an out-of-range index. (Fixed panic messages that were
+ // copy-pasted from TLBCoalescer and named the wrong class.)
+ BaseSlavePort&
+ GpuTLB::getSlavePort(const std::string &if_name, PortID idx)
+ {
+ if (if_name == "slave") {
+ if (idx >= static_cast<PortID>(cpuSidePort.size())) {
+ panic("GpuTLB::getSlavePort: unknown index %d\n", idx);
+ }
+
+ return *cpuSidePort[idx];
+ } else {
+ panic("GpuTLB::getSlavePort: unknown port %s\n", if_name);
+ }
+ }
+
+ // Return the idx-th memory-side (master) port and remember that a
+ // lower TLB level exists (hasMemSidePort gates the miss path in
+ // translationReturn). Panics on an unknown name or index. (Fixed
+ // panic messages that were copy-pasted from TLBCoalescer and named
+ // the wrong class.)
+ BaseMasterPort&
+ GpuTLB::getMasterPort(const std::string &if_name, PortID idx)
+ {
+ if (if_name == "master") {
+ if (idx >= static_cast<PortID>(memSidePort.size())) {
+ panic("GpuTLB::getMasterPort: unknown index %d\n", idx);
+ }
+
+ hasMemSidePort = true;
+
+ return *memSidePort[idx];
+ } else {
+ panic("GpuTLB::getMasterPort: unknown port %s\n", if_name);
+ }
+ }
+
+ // Insert a translation for virtual page vpn into the TLB, evicting
+ // the LRU entry of the target set if the set is full. Returns the
+ // resident entry. Evicted translations are simply overwritten (no
+ // writeback is needed for a TLB).
+ GpuTlbEntry*
+ GpuTLB::insert(Addr vpn, GpuTlbEntry &entry)
+ {
+ GpuTlbEntry *newEntry = nullptr;
+
+ /**
+ * vpn holds the virtual page address
+ * The least significant bits are simply masked
+ */
+ int set = (vpn >> TheISA::PageShift) & setMask;
+
+ if (!freeList[set].empty()) {
+ // A free way exists in this set; take it.
+ newEntry = freeList[set].front();
+ freeList[set].pop_front();
+ } else {
+ // Set full: evict the LRU victim, which lives at the back of
+ // the recency-ordered entry list.
+ newEntry = entryList[set].back();
+ entryList[set].pop_back();
+ }
+
+ *newEntry = entry;
+ newEntry->vaddr = vpn;
+ // New entries are MRU: push to the front of the recency list.
+ entryList[set].push_front(newEntry);
+
+ return newEntry;
+ }
+
+ // Find the entry covering virtual address va in its set and return an
+ // iterator to it (entryList[set].end() on a miss). When update_lru is
+ // set, a hit moves the entry to the front of the recency list (MRU)
+ // and the returned iterator is re-fetched so it stays valid.
+ GpuTLB::EntryList::iterator
+ GpuTLB::lookupIt(Addr va, bool update_lru)
+ {
+ int set = (va >> TheISA::PageShift) & setMask;
+
+ if (FA) {
+ // Fully associative: everything maps to set 0.
+ assert(!set);
+ }
+
+ auto entry = entryList[set].begin();
+ for (; entry != entryList[set].end(); ++entry) {
+ int page_size = (*entry)->size();
+
+ // Range check rather than tag compare, so large pages whose
+ // base is below va still match.
+ if ((*entry)->vaddr <= va && (*entry)->vaddr + page_size > va) {
+ DPRINTF(GPUTLB, "Matched vaddr %#x to entry starting at %#x "
+ "with size %#x.\n", va, (*entry)->vaddr, page_size);
+
+ if (update_lru) {
+ // Re-insert at the front, drop the old node, and point
+ // the iterator at the (now MRU) copy we just pushed.
+ entryList[set].push_front(*entry);
+ entryList[set].erase(entry);
+ entry = entryList[set].begin();
+ }
+
+ break;
+ }
+ }
+
+ return entry;
+ }
+
+ // Pointer-returning wrapper around lookupIt(): nullptr on a miss,
+ // otherwise the matching entry (optionally promoted to MRU).
+ GpuTlbEntry*
+ GpuTLB::lookup(Addr va, bool update_lru)
+ {
+ int target_set = (va >> TheISA::PageShift) & setMask;
+ auto it = lookupIt(va, update_lru);
+
+ return (it == entryList[target_set].end()) ? nullptr : *it;
+ }
+
+ // Flush the whole TLB: every resident entry in every set is returned
+ // to that set's free list.
+ void
+ GpuTLB::invalidateAll()
+ {
+ DPRINTF(GPUTLB, "Invalidating all entries.\n");
+
+ for (int set = 0; set < numSets; ++set) {
+ while (!entryList[set].empty()) {
+ freeList[set].push_back(entryList[set].front());
+ entryList[set].pop_front();
+ }
+ }
+ }
+
+ // Record the PCI config-space address latched via port 0xCF8
+ // (see translateInt's IO handling).
+ void
+ GpuTLB::setConfigAddress(uint32_t addr)
+ {
+ configAddress = addr;
+ }
+
+ // Flush every entry not marked global (x86 CR3-reload semantics);
+ // global entries survive.
+ void
+ GpuTLB::invalidateNonGlobal()
+ {
+ DPRINTF(GPUTLB, "Invalidating all non global entries.\n");
+
+ for (int i = 0; i < numSets; ++i) {
+ for (auto entryIt = entryList[i].begin();
+ entryIt != entryList[i].end();) {
+ if (!(*entryIt)->global) {
+ freeList[i].push_back(*entryIt);
+ // erase(entryIt++): advance before the node is removed
+ // so the loop iterator stays valid.
+ entryList[i].erase(entryIt++);
+ } else {
+ ++entryIt;
+ }
+ }
+ }
+ }
+
+ // Invalidate the single entry covering va, if resident. The asn
+ // (address-space number) argument is ignored — this TLB keeps no
+ // per-ASN state.
+ void
+ GpuTLB::demapPage(Addr va, uint64_t asn)
+ {
+
+ int set = (va >> TheISA::PageShift) & setMask;
+ auto entry = lookupIt(va, false);
+
+ if (entry != entryList[set].end()) {
+ freeList[set].push_back(*entry);
+ entryList[set].erase(entry);
+ }
+ }
+
+ // Translate an access to x86's internal (non-memory) address space:
+ // decodes the vaddr prefix into CPUID space (unimplemented), an MSR
+ // number (mapped to a MiscReg-indexed physical address), or an IO
+ // port (including PCI config 0xCF8/0xCFC). Returns NoFault on
+ // success or GeneralProtection for an unknown MSR.
+ Fault
+ GpuTLB::translateInt(RequestPtr req, ThreadContext *tc)
+ {
+ DPRINTF(GPUTLB, "Addresses references internal memory.\n");
+ Addr vaddr = req->getVaddr();
+ Addr prefix = (vaddr >> 3) & IntAddrPrefixMask;
+
+ if (prefix == IntAddrPrefixCPUID) {
+ panic("CPUID memory space not yet implemented!\n");
+ } else if (prefix == IntAddrPrefixMSR) {
+ // MSR access: the low bits (after dropping the 8-byte stride)
+ // are the MSR number; map it onto the corresponding MiscReg.
+ vaddr = vaddr >> 3;
+ req->setFlags(Request::MMAPPED_IPR);
+ Addr regNum = 0;
+
+ switch (vaddr & ~IntAddrPrefixMask) {
+ case 0x10:
+ regNum = MISCREG_TSC;
+ break;
+ case 0x1B:
+ regNum = MISCREG_APIC_BASE;
+ break;
+ case 0xFE:
+ regNum = MISCREG_MTRRCAP;
+ break;
+ case 0x174:
+ regNum = MISCREG_SYSENTER_CS;
+ break;
+ case 0x175:
+ regNum = MISCREG_SYSENTER_ESP;
+ break;
+ case 0x176:
+ regNum = MISCREG_SYSENTER_EIP;
+ break;
+ case 0x179:
+ regNum = MISCREG_MCG_CAP;
+ break;
+ case 0x17A:
+ regNum = MISCREG_MCG_STATUS;
+ break;
+ case 0x17B:
+ regNum = MISCREG_MCG_CTL;
+ break;
+ case 0x1D9:
+ regNum = MISCREG_DEBUG_CTL_MSR;
+ break;
+ case 0x1DB:
+ regNum = MISCREG_LAST_BRANCH_FROM_IP;
+ break;
+ case 0x1DC:
+ regNum = MISCREG_LAST_BRANCH_TO_IP;
+ break;
+ case 0x1DD:
+ regNum = MISCREG_LAST_EXCEPTION_FROM_IP;
+ break;
+ case 0x1DE:
+ regNum = MISCREG_LAST_EXCEPTION_TO_IP;
+ break;
+ case 0x200:
+ regNum = MISCREG_MTRR_PHYS_BASE_0;
+ break;
+ case 0x201:
+ regNum = MISCREG_MTRR_PHYS_MASK_0;
+ break;
+ case 0x202:
+ regNum = MISCREG_MTRR_PHYS_BASE_1;
+ break;
+ case 0x203:
+ regNum = MISCREG_MTRR_PHYS_MASK_1;
+ break;
+ case 0x204:
+ regNum = MISCREG_MTRR_PHYS_BASE_2;
+ break;
+ case 0x205:
+ regNum = MISCREG_MTRR_PHYS_MASK_2;
+ break;
+ case 0x206:
+ regNum = MISCREG_MTRR_PHYS_BASE_3;
+ break;
+ case 0x207:
+ regNum = MISCREG_MTRR_PHYS_MASK_3;
+ break;
+ case 0x208:
+ regNum = MISCREG_MTRR_PHYS_BASE_4;
+ break;
+ case 0x209:
+ regNum = MISCREG_MTRR_PHYS_MASK_4;
+ break;
+ case 0x20A:
+ regNum = MISCREG_MTRR_PHYS_BASE_5;
+ break;
+ case 0x20B:
+ regNum = MISCREG_MTRR_PHYS_MASK_5;
+ break;
+ case 0x20C:
+ regNum = MISCREG_MTRR_PHYS_BASE_6;
+ break;
+ case 0x20D:
+ regNum = MISCREG_MTRR_PHYS_MASK_6;
+ break;
+ case 0x20E:
+ regNum = MISCREG_MTRR_PHYS_BASE_7;
+ break;
+ case 0x20F:
+ regNum = MISCREG_MTRR_PHYS_MASK_7;
+ break;
+ case 0x250:
+ regNum = MISCREG_MTRR_FIX_64K_00000;
+ break;
+ case 0x258:
+ regNum = MISCREG_MTRR_FIX_16K_80000;
+ break;
+ case 0x259:
+ regNum = MISCREG_MTRR_FIX_16K_A0000;
+ break;
+ case 0x268:
+ regNum = MISCREG_MTRR_FIX_4K_C0000;
+ break;
+ case 0x269:
+ regNum = MISCREG_MTRR_FIX_4K_C8000;
+ break;
+ case 0x26A:
+ regNum = MISCREG_MTRR_FIX_4K_D0000;
+ break;
+ case 0x26B:
+ regNum = MISCREG_MTRR_FIX_4K_D8000;
+ break;
+ case 0x26C:
+ regNum = MISCREG_MTRR_FIX_4K_E0000;
+ break;
+ case 0x26D:
+ regNum = MISCREG_MTRR_FIX_4K_E8000;
+ break;
+ case 0x26E:
+ regNum = MISCREG_MTRR_FIX_4K_F0000;
+ break;
+ case 0x26F:
+ regNum = MISCREG_MTRR_FIX_4K_F8000;
+ break;
+ case 0x277:
+ regNum = MISCREG_PAT;
+ break;
+ case 0x2FF:
+ regNum = MISCREG_DEF_TYPE;
+ break;
+ case 0x400:
+ regNum = MISCREG_MC0_CTL;
+ break;
+ case 0x404:
+ regNum = MISCREG_MC1_CTL;
+ break;
+ case 0x408:
+ regNum = MISCREG_MC2_CTL;
+ break;
+ case 0x40C:
+ regNum = MISCREG_MC3_CTL;
+ break;
+ case 0x410:
+ regNum = MISCREG_MC4_CTL;
+ break;
+ case 0x414:
+ regNum = MISCREG_MC5_CTL;
+ break;
+ case 0x418:
+ regNum = MISCREG_MC6_CTL;
+ break;
+ case 0x41C:
+ regNum = MISCREG_MC7_CTL;
+ break;
+ case 0x401:
+ regNum = MISCREG_MC0_STATUS;
+ break;
+ case 0x405:
+ regNum = MISCREG_MC1_STATUS;
+ break;
+ case 0x409:
+ regNum = MISCREG_MC2_STATUS;
+ break;
+ case 0x40D:
+ regNum = MISCREG_MC3_STATUS;
+ break;
+ case 0x411:
+ regNum = MISCREG_MC4_STATUS;
+ break;
+ case 0x415:
+ regNum = MISCREG_MC5_STATUS;
+ break;
+ case 0x419:
+ regNum = MISCREG_MC6_STATUS;
+ break;
+ case 0x41D:
+ regNum = MISCREG_MC7_STATUS;
+ break;
+ case 0x402:
+ regNum = MISCREG_MC0_ADDR;
+ break;
+ case 0x406:
+ regNum = MISCREG_MC1_ADDR;
+ break;
+ case 0x40A:
+ regNum = MISCREG_MC2_ADDR;
+ break;
+ case 0x40E:
+ regNum = MISCREG_MC3_ADDR;
+ break;
+ case 0x412:
+ regNum = MISCREG_MC4_ADDR;
+ break;
+ case 0x416:
+ regNum = MISCREG_MC5_ADDR;
+ break;
+ case 0x41A:
+ regNum = MISCREG_MC6_ADDR;
+ break;
+ case 0x41E:
+ regNum = MISCREG_MC7_ADDR;
+ break;
+ case 0x403:
+ regNum = MISCREG_MC0_MISC;
+ break;
+ case 0x407:
+ regNum = MISCREG_MC1_MISC;
+ break;
+ case 0x40B:
+ regNum = MISCREG_MC2_MISC;
+ break;
+ case 0x40F:
+ regNum = MISCREG_MC3_MISC;
+ break;
+ case 0x413:
+ regNum = MISCREG_MC4_MISC;
+ break;
+ case 0x417:
+ regNum = MISCREG_MC5_MISC;
+ break;
+ case 0x41B:
+ regNum = MISCREG_MC6_MISC;
+ break;
+ case 0x41F:
+ regNum = MISCREG_MC7_MISC;
+ break;
+ case 0xC0000080:
+ regNum = MISCREG_EFER;
+ break;
+ case 0xC0000081:
+ regNum = MISCREG_STAR;
+ break;
+ case 0xC0000082:
+ regNum = MISCREG_LSTAR;
+ break;
+ case 0xC0000083:
+ regNum = MISCREG_CSTAR;
+ break;
+ case 0xC0000084:
+ regNum = MISCREG_SF_MASK;
+ break;
+ case 0xC0000100:
+ regNum = MISCREG_FS_BASE;
+ break;
+ case 0xC0000101:
+ regNum = MISCREG_GS_BASE;
+ break;
+ case 0xC0000102:
+ regNum = MISCREG_KERNEL_GS_BASE;
+ break;
+ case 0xC0000103:
+ regNum = MISCREG_TSC_AUX;
+ break;
+ case 0xC0010000:
+ regNum = MISCREG_PERF_EVT_SEL0;
+ break;
+ case 0xC0010001:
+ regNum = MISCREG_PERF_EVT_SEL1;
+ break;
+ case 0xC0010002:
+ regNum = MISCREG_PERF_EVT_SEL2;
+ break;
+ case 0xC0010003:
+ regNum = MISCREG_PERF_EVT_SEL3;
+ break;
+ case 0xC0010004:
+ regNum = MISCREG_PERF_EVT_CTR0;
+ break;
+ case 0xC0010005:
+ regNum = MISCREG_PERF_EVT_CTR1;
+ break;
+ case 0xC0010006:
+ regNum = MISCREG_PERF_EVT_CTR2;
+ break;
+ case 0xC0010007:
+ regNum = MISCREG_PERF_EVT_CTR3;
+ break;
+ case 0xC0010010:
+ regNum = MISCREG_SYSCFG;
+ break;
+ case 0xC0010016:
+ regNum = MISCREG_IORR_BASE0;
+ break;
+ case 0xC0010017:
+ regNum = MISCREG_IORR_BASE1;
+ break;
+ case 0xC0010018:
+ regNum = MISCREG_IORR_MASK0;
+ break;
+ case 0xC0010019:
+ regNum = MISCREG_IORR_MASK1;
+ break;
+ case 0xC001001A:
+ regNum = MISCREG_TOP_MEM;
+ break;
+ case 0xC001001D:
+ regNum = MISCREG_TOP_MEM2;
+ break;
+ case 0xC0010114:
+ regNum = MISCREG_VM_CR;
+ break;
+ case 0xC0010115:
+ regNum = MISCREG_IGNNE;
+ break;
+ case 0xC0010116:
+ regNum = MISCREG_SMM_CTL;
+ break;
+ case 0xC0010117:
+ regNum = MISCREG_VM_HSAVE_PA;
+ break;
+ default:
+ // Unrecognized MSR: raise #GP as real hardware would.
+ return std::make_shared<GeneralProtection>(0);
+ }
+ //The index is multiplied by the size of a MiscReg so that
+ //any memory dependence calculations will not see these as
+ //overlapping.
+ req->setPaddr(regNum * sizeof(MiscReg));
+ return NoFault;
+ } else if (prefix == IntAddrPrefixIO) {
+ // TODO If CPL > IOPL or in virtual mode, check the I/O permission
+ // bitmap in the TSS.
+
+ Addr IOPort = vaddr & ~IntAddrPrefixMask;
+ // Make sure the address fits in the expected 16 bit IO address
+ // space.
+ assert(!(IOPort & ~0xFFFF));
+
+ // 0xCF8 (dword) is the PCI config-address port; 0xCFC-0xCFF is
+ // the config-data window, routed via the latched configAddress.
+ if (IOPort == 0xCF8 && req->getSize() == 4) {
+ req->setFlags(Request::MMAPPED_IPR);
+ req->setPaddr(MISCREG_PCI_CONFIG_ADDRESS * sizeof(MiscReg));
+ } else if ((IOPort & ~mask(2)) == 0xCFC) {
+ req->setFlags(Request::UNCACHEABLE);
+
+ Addr configAddress =
+ tc->readMiscRegNoEffect(MISCREG_PCI_CONFIG_ADDRESS);
+
+ if (bits(configAddress, 31, 31)) {
+ req->setPaddr(PhysAddrPrefixPciConfig |
+ mbits(configAddress, 30, 2) |
+ (IOPort & mask(2)));
+ } else {
+ req->setPaddr(PhysAddrPrefixIO | IOPort);
+ }
+ } else {
+ req->setFlags(Request::UNCACHEABLE);
+ req->setPaddr(PhysAddrPrefixIO | IOPort);
+ }
+ return NoFault;
+ } else {
+ panic("Access to unrecognized internal address space %#x.\n",
+ prefix);
+ }
+ }
+
+ /**
+ * TLB_lookup will only perform a TLB lookup returning true on a TLB hit
+ * and false on a TLB miss.
+ * Many of the checks about different modes have been converted to
+ * assertions, since these parts of the code are not really used.
+ * On a hit it will update the LRU stack.
+ */
+ bool
+ GpuTLB::tlbLookup(RequestPtr req, ThreadContext *tc, bool update_stats)
+ {
+ bool tlb_hit = false;
+ #ifndef NDEBUG
+ // flags/seg exist only to feed the assert below; they compile
+ // away (with the assert) in NDEBUG builds.
+ uint32_t flags = req->getFlags();
+ int seg = flags & SegmentFlagMask;
+ #endif
+
+ assert(seg != SEGMENT_REG_MS);
+ Addr vaddr = req->getVaddr();
+ DPRINTF(GPUTLB, "TLB Lookup for vaddr %#x.\n", vaddr);
+ HandyM5Reg m5Reg = tc->readMiscRegNoEffect(MISCREG_M5_REG);
+
+ if (m5Reg.prot) {
+ DPRINTF(GPUTLB, "In protected mode.\n");
+ // make sure we are in 64-bit mode
+ assert(m5Reg.mode == LongMode);
+
+ // If paging is enabled, do the translation.
+ if (m5Reg.paging) {
+ DPRINTF(GPUTLB, "Paging enabled.\n");
+ //update LRU stack on a hit
+ GpuTlbEntry *entry = lookup(vaddr, true);
+
+ if (entry)
+ tlb_hit = true;
+
+ if (!update_stats) {
+ // functional tlb access for memory initialization
+ // i.e., memory seeding or instr. seeding -> don't update
+ // TLB and stats
+ return tlb_hit;
+ }
+
+ localNumTLBAccesses++;
+
+ if (!entry) {
+ localNumTLBMisses++;
+ } else {
+ localNumTLBHits++;
+ }
+ }
+ }
+
+ return tlb_hit;
+ }
+
+ // Core translation routine shared by the atomic and timing paths.
+ // Handles: internal (MSR/IO) address spaces, segment limit checks in
+ // non-long protected mode, the TLB lookup with functional page-table
+ // fill on a miss, paging protection checks, and the local-APIC
+ // remap in full-system mode. Sets latency when timing is true;
+ // delayedResponse is always left false on this path.
+ Fault
+ GpuTLB::translate(RequestPtr req, ThreadContext *tc,
+ Translation *translation, Mode mode,
+ bool &delayedResponse, bool timing, int &latency)
+ {
+ uint32_t flags = req->getFlags();
+ int seg = flags & SegmentFlagMask;
+ bool storeCheck = flags & (StoreCheck << FlagShift);
+
+ // If this is true, we're dealing with a request
+ // to a non-memory address space.
+ if (seg == SEGMENT_REG_MS) {
+ return translateInt(req, tc);
+ }
+
+ delayedResponse = false;
+ Addr vaddr = req->getVaddr();
+ DPRINTF(GPUTLB, "Translating vaddr %#x.\n", vaddr);
+
+ HandyM5Reg m5Reg = tc->readMiscRegNoEffect(MISCREG_M5_REG);
+
+ // If protected mode has been enabled...
+ if (m5Reg.prot) {
+ DPRINTF(GPUTLB, "In protected mode.\n");
+ // If we're not in 64-bit mode, do protection/limit checks
+ if (m5Reg.mode != LongMode) {
+ DPRINTF(GPUTLB, "Not in long mode. Checking segment "
+ "protection.\n");
+
+ // Check for a null segment selector.
+ if (!(seg == SEGMENT_REG_TSG || seg == SYS_SEGMENT_REG_IDTR ||
+ seg == SEGMENT_REG_HS || seg == SEGMENT_REG_LS)
+ && !tc->readMiscRegNoEffect(MISCREG_SEG_SEL(seg))) {
+ return std::make_shared<GeneralProtection>(0);
+ }
+
+ bool expandDown = false;
+ SegAttr attr = tc->readMiscRegNoEffect(MISCREG_SEG_ATTR(seg));
+
+ if (seg >= SEGMENT_REG_ES && seg <= SEGMENT_REG_HS) {
+ if (!attr.writable && (mode == BaseTLB::Write ||
+ storeCheck))
+ return std::make_shared<GeneralProtection>(0);
+
+ if (!attr.readable && mode == BaseTLB::Read)
+ return std::make_shared<GeneralProtection>(0);
+
+ expandDown = attr.expandDown;
+
+ }
+
+ Addr base = tc->readMiscRegNoEffect(MISCREG_SEG_BASE(seg));
+ Addr limit = tc->readMiscRegNoEffect(MISCREG_SEG_LIMIT(seg));
+ // This assumes we're not in 64 bit mode. If we were, the
+ // default address size is 64 bits, overridable to 32.
+ int size = 32;
+ bool sizeOverride = (flags & (AddrSizeFlagBit << FlagShift));
+ SegAttr csAttr = tc->readMiscRegNoEffect(MISCREG_CS_ATTR);
+
+ if ((csAttr.defaultSize && sizeOverride) ||
+ (!csAttr.defaultSize && !sizeOverride)) {
+ size = 16;
+ }
+
+ Addr offset = bits(vaddr - base, size - 1, 0);
+ Addr endOffset = offset + req->getSize() - 1;
+
+ if (expandDown) {
+ DPRINTF(GPUTLB, "Checking an expand down segment.\n");
+ warn_once("Expand down segments are untested.\n");
+
+ if (offset <= limit || endOffset <= limit)
+ return std::make_shared<GeneralProtection>(0);
+ } else {
+ if (offset > limit || endOffset > limit)
+ return std::make_shared<GeneralProtection>(0);
+ }
+ }
+
+ // If paging is enabled, do the translation.
+ if (m5Reg.paging) {
+ DPRINTF(GPUTLB, "Paging enabled.\n");
+ // The vaddr already has the segment base applied.
+ GpuTlbEntry *entry = lookup(vaddr);
+ localNumTLBAccesses++;
+
+ if (!entry) {
+ localNumTLBMisses++;
+ if (timing) {
+ latency = missLatency1;
+ }
+
+ if (FullSystem) {
+ fatal("GpuTLB doesn't support full-system mode\n");
+ } else {
+ // SE mode: service the miss functionally straight
+ // from the process page table.
+ DPRINTF(GPUTLB, "Handling a TLB miss for address %#x "
+ "at pc %#x.\n", vaddr, tc->instAddr());
+
+ Process *p = tc->getProcessPtr();
+ GpuTlbEntry newEntry;
+ bool success = p->pTable->lookup(vaddr, newEntry);
+
+ if (!success && mode != BaseTLB::Execute) {
+ // penalize a "page fault" more
+ if (timing) {
+ latency += missLatency2;
+ }
+
+ // Stack may auto-grow: retry once after fixup.
+ if (p->fixupStackFault(vaddr))
+ success = p->pTable->lookup(vaddr, newEntry);
+ }
+
+ if (!success) {
+ return std::make_shared<PageFault>(vaddr, true,
+ mode, true,
+ false);
+ } else {
+ newEntry.valid = success;
+ Addr alignedVaddr = p->pTable->pageAlign(vaddr);
+
+ DPRINTF(GPUTLB, "Mapping %#x to %#x\n",
+ alignedVaddr, newEntry.pageStart());
+
+ entry = insert(alignedVaddr, newEntry);
+ }
+
+ DPRINTF(GPUTLB, "Miss was serviced.\n");
+ }
+ } else {
+ localNumTLBHits++;
+
+ if (timing) {
+ latency = hitLatency;
+ }
+ }
+
+ // Do paging protection checks.
+ bool inUser = (m5Reg.cpl == 3 &&
+ !(flags & (CPL0FlagBit << FlagShift)));
+
+ CR0 cr0 = tc->readMiscRegNoEffect(MISCREG_CR0);
+ bool badWrite = (!entry->writable && (inUser || cr0.wp));
+
+ if ((inUser && !entry->user) || (mode == BaseTLB::Write &&
+ badWrite)) {
+ // The page must have been present to get into the TLB in
+ // the first place. We'll assume the reserved bits are
+ // fine even though we're not checking them.
+ return std::make_shared<PageFault>(vaddr, true, mode,
+ inUser, false);
+ }
+
+ if (storeCheck && badWrite) {
+ // This would fault if this were a write, so return a page
+ // fault that reflects that happening.
+ return std::make_shared<PageFault>(vaddr, true,
+ BaseTLB::Write,
+ inUser, false);
+ }
+
+
+ DPRINTF(GPUTLB, "Entry found with paddr %#x, doing protection "
+ "checks.\n", entry->paddr);
+
+ // Splice the in-page offset onto the entry's physical frame.
+ int page_size = entry->size();
+ Addr paddr = entry->paddr | (vaddr & (page_size - 1));
+ DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, paddr);
+ req->setPaddr(paddr);
+
+ if (entry->uncacheable)
+ req->setFlags(Request::UNCACHEABLE);
+ } else {
+ //Use the address which already has segmentation applied.
+ DPRINTF(GPUTLB, "Paging disabled.\n");
+ DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, vaddr);
+ req->setPaddr(vaddr);
+ }
+ } else {
+ // Real mode
+ DPRINTF(GPUTLB, "In real mode.\n");
+ DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, vaddr);
+ req->setPaddr(vaddr);
+ }
+
+ // Check for an access to the local APIC
+ if (FullSystem) {
+ LocalApicBase localApicBase =
+ tc->readMiscRegNoEffect(MISCREG_APIC_BASE);
+
+ Addr baseAddr = localApicBase.base * PageBytes;
+ Addr paddr = req->getPaddr();
+
+ if (baseAddr <= paddr && baseAddr + PageBytes > paddr) {
+ // Force the access to be uncacheable.
+ req->setFlags(Request::UNCACHEABLE);
+ req->setPaddr(x86LocalAPICAddress(tc->contextId(),
+ paddr - baseAddr));
+ }
+ }
+
+ return NoFault;
+ };
+
+ // Atomic (blocking) translation: no Translation callback object and
+ // the non-timing path through translate(). The delayed-response flag
+ // is required by translate()'s signature but unused here.
+ Fault
+ GpuTLB::translateAtomic(RequestPtr req, ThreadContext *tc, Mode mode,
+ int &latency)
+ {
+ bool delayed_response;
+
+ return translate(req, tc, nullptr, mode, delayed_response, false,
+ latency);
+ }
+
+ // Timing translation: runs the shared translate() path and, unless a
+ // delayed response was requested, completes the caller's Translation
+ // object immediately.
+ void
+ GpuTLB::translateTiming(RequestPtr req, ThreadContext *tc,
+ Translation *translation, Mode mode, int &latency)
+ {
+ assert(translation);
+
+ bool delayed_response;
+ Fault fault = translate(req, tc, translation, mode, delayed_response,
+ true, latency);
+
+ if (!delayed_response)
+ translation->finish(fault, req, tc, mode);
+ }
+
+ // Accessor for the page-table walker. NOTE(review): walker is only
+ // assigned inside the #if 0 full-system block in the constructor, so
+ // in SE mode this returns an uninitialized pointer — confirm callers.
+ Walker*
+ GpuTLB::getWalker()
+ {
+ return walker;
+ }
+
+
+ // Checkpointing is intentionally a no-op: TLB contents are a pure
+ // cache of the page table and can be rebuilt after restore.
+ void
+ GpuTLB::serialize(CheckpointOut &cp) const
+ {
+ }
+
+ // See serialize(): nothing to restore.
+ void
+ GpuTLB::unserialize(CheckpointIn &cp)
+ {
+ }
+
+ // Register all statistics for this TLB level. "local_*" stats count
+ // coalesced requests handled at this level; "global_*" stats weight
+ // each access by the number of original (pre-coalescing) requests.
+ // Formula stats (miss rates, avg latency) are derived from them.
+ void
+ GpuTLB::regStats()
+ {
+ localNumTLBAccesses
+ .name(name() + ".local_TLB_accesses")
+ .desc("Number of TLB accesses")
+ ;
+
+ localNumTLBHits
+ .name(name() + ".local_TLB_hits")
+ .desc("Number of TLB hits")
+ ;
+
+ localNumTLBMisses
+ .name(name() + ".local_TLB_misses")
+ .desc("Number of TLB misses")
+ ;
+
+ localTLBMissRate
+ .name(name() + ".local_TLB_miss_rate")
+ .desc("TLB miss rate")
+ ;
+
+ accessCycles
+ .name(name() + ".access_cycles")
+ .desc("Cycles spent accessing this TLB level")
+ ;
+
+ pageTableCycles
+ .name(name() + ".page_table_cycles")
+ .desc("Cycles spent accessing the page table")
+ ;
+
+ localTLBMissRate = 100 * localNumTLBMisses / localNumTLBAccesses;
+
+ numUniquePages
+ .name(name() + ".unique_pages")
+ .desc("Number of unique pages touched")
+ ;
+
+ localCycles
+ .name(name() + ".local_cycles")
+ .desc("Number of cycles spent in queue for all incoming reqs")
+ ;
+
+ localLatency
+ .name(name() + ".local_latency")
+ .desc("Avg. latency over incoming coalesced reqs")
+ ;
+
+ localLatency = localCycles / localNumTLBAccesses;
+
+ globalNumTLBAccesses
+ .name(name() + ".global_TLB_accesses")
+ .desc("Number of TLB accesses")
+ ;
+
+ globalNumTLBHits
+ .name(name() + ".global_TLB_hits")
+ .desc("Number of TLB hits")
+ ;
+
+ globalNumTLBMisses
+ .name(name() + ".global_TLB_misses")
+ .desc("Number of TLB misses")
+ ;
+
+ globalTLBMissRate
+ .name(name() + ".global_TLB_miss_rate")
+ .desc("TLB miss rate")
+ ;
+
+ globalTLBMissRate = 100 * globalNumTLBMisses / globalNumTLBAccesses;
+
+ avgReuseDistance
+ .name(name() + ".avg_reuse_distance")
+ .desc("avg. reuse distance over all pages (in ticks)")
+ ;
+
+ }
+
+ /**
+ * Do the TLB lookup for this coalesced request and schedule
+ * another event <TLB access latency> cycles later.
+ */
+
+ void
+ GpuTLB::issueTLBLookup(PacketPtr pkt)
+ {
+ assert(pkt);
+ assert(pkt->senderState);
+
+ Addr virt_page_addr = roundDown(pkt->req->getVaddr(),
+ TheISA::PageBytes);
+
+ TranslationState *sender_state =
+ safe_cast<TranslationState*>(pkt->senderState);
+
+ // Prefetch probes must not perturb stats.
+ bool update_stats = !sender_state->prefetch;
+ ThreadContext * tmp_tc = sender_state->tc;
+
+ DPRINTF(GPUTLB, "Translation req. for virt. page addr %#x\n",
+ virt_page_addr);
+
+ int req_cnt = sender_state->reqCnt.back();
+
+ if (update_stats) {
+ // Cycle stats are accumulated as (end - start): subtract the
+ // start tick now; translationReturn() adds the end tick.
+ accessCycles -= (curTick() * req_cnt);
+ localCycles -= curTick();
+ updatePageFootprint(virt_page_addr);
+ globalNumTLBAccesses += req_cnt;
+ }
+
+ tlbOutcome lookup_outcome = TLB_MISS;
+ RequestPtr tmp_req = pkt->req;
+
+ // Access the TLB and figure out if it's a hit or a miss.
+ bool success = tlbLookup(tmp_req, tmp_tc, update_stats);
+
+ if (success) {
+ lookup_outcome = TLB_HIT;
+ // Put the entry in SenderState
+ GpuTlbEntry *entry = lookup(tmp_req->getVaddr(), false);
+ assert(entry);
+
+ sender_state->tlbEntry =
+ new GpuTlbEntry(0, entry->vaddr, entry->paddr, entry->valid);
+
+ if (update_stats) {
+ // the reqCnt has an entry per level, so its size tells us
+ // which level we are in
+ sender_state->hitLevel = sender_state->reqCnt.size();
+ globalNumTLBHits += req_cnt;
+ }
+ } else {
+ if (update_stats)
+ globalNumTLBMisses += req_cnt;
+ }
+
+ /*
+ * We now know the TLB lookup outcome (if it's a hit or a miss), as well
+ * as the TLB access latency.
+ *
+ * We create and schedule a new TLBEvent which will help us take the
+ * appropriate actions (e.g., update TLB on a hit, send request to lower
+ * level TLB on a miss, or start a page walk if this was the last-level
+ * TLB)
+ */
+ TLBEvent *tlb_event =
+ new TLBEvent(this, virt_page_addr, lookup_outcome, pkt);
+
+ // Only one in-flight translation per virtual page is allowed;
+ // coalescing upstream should guarantee this.
+ if (translationReturnEvent.count(virt_page_addr)) {
+ panic("Virtual Page Address %#x already has a return event\n",
+ virt_page_addr);
+ }
+
+ translationReturnEvent[virt_page_addr] = tlb_event;
+ assert(tlb_event);
+
+ DPRINTF(GPUTLB, "schedule translationReturnEvent @ curTick %d\n",
+ curTick() + this->ticks(hitLatency));
+
+ schedule(tlb_event, curTick() + this->ticks(hitLatency));
+ }
+
+ // Event capturing one pending translation: the owning TLB, the
+ // virtual page it is for, the lookup outcome so far, and the packet
+ // being translated. Fires at CPU-tick priority.
+ GpuTLB::TLBEvent::TLBEvent(GpuTLB* _tlb, Addr _addr, tlbOutcome tlb_outcome,
+ PacketPtr _pkt)
+ : Event(CPU_Tick_Pri), tlb(_tlb), virtPageAddr(_addr),
+ outcome(tlb_outcome), pkt(_pkt)
+ {
+ }
+
+ /**
+ * Do paging protection checks. The timing path cannot deliver an x86
+ * fault, so a protection violation is a fatal modeling error: panic
+ * unconditionally. (Previously this used assert(false), which
+ * compiles out under NDEBUG and would silently ignore the violation
+ * in fast builds.)
+ */
+ void
+ GpuTLB::pagingProtectionChecks(ThreadContext *tc, PacketPtr pkt,
+ GpuTlbEntry * tlb_entry, Mode mode)
+ {
+ HandyM5Reg m5Reg = tc->readMiscRegNoEffect(MISCREG_M5_REG);
+ uint32_t flags = pkt->req->getFlags();
+ bool storeCheck = flags & (StoreCheck << FlagShift);
+
+ // Do paging protection checks.
+ bool inUser = (m5Reg.cpl == 3 && !(flags & (CPL0FlagBit << FlagShift)));
+ CR0 cr0 = tc->readMiscRegNoEffect(MISCREG_CR0);
+
+ bool badWrite = (!tlb_entry->writable && (inUser || cr0.wp));
+
+ if ((inUser && !tlb_entry->user) ||
+ (mode == BaseTLB::Write && badWrite)) {
+ // The page must have been present to get into the TLB in
+ // the first place. We'll assume the reserved bits are
+ // fine even though we're not checking them.
+ panic("Page fault detected on paging protection checks for "
+ "vaddr %#x\n", pkt->req->getVaddr());
+ }
+
+ if (storeCheck && badWrite) {
+ // This would fault if this were a write, so panic to flag
+ // the (unsupported) condition instead of silently passing.
+ panic("Store check page fault detected for vaddr %#x\n",
+ pkt->req->getVaddr());
+ }
+ }
+
+ /**
+ * handleTranslationReturn is called on a TLB hit,
+ * when a TLB miss returns or when a page fault returns.
+ * The latter calls handelHit with TLB miss as tlbOutcome.
+ */
+ void
+ GpuTLB::handleTranslationReturn(Addr virt_page_addr, tlbOutcome tlb_outcome,
+ PacketPtr pkt)
+ {
+
+ assert(pkt);
+ Addr vaddr = pkt->req->getVaddr();
+
+ TranslationState *sender_state =
+ safe_cast<TranslationState*>(pkt->senderState);
+
+ ThreadContext *tc = sender_state->tc;
+ Mode mode = sender_state->tlbMode;
+
+ GpuTlbEntry *local_entry, *new_entry;
+
+ if (tlb_outcome == TLB_HIT) {
+ DPRINTF(GPUTLB, "Translation Done - TLB Hit for addr %#x\n", vaddr);
+ local_entry = sender_state->tlbEntry;
+ } else {
+ DPRINTF(GPUTLB, "Translation Done - TLB Miss for addr %#x\n",
+ vaddr);
+
+ // We are returning either from a page walk or from a hit at a lower
+ // TLB level. The senderState should be "carrying" a pointer to the
+ // correct TLBEntry.
+ new_entry = sender_state->tlbEntry;
+ assert(new_entry);
+ local_entry = new_entry;
+
+ // Fill this level only when the allocation policy says so
+ // (allocate-on-miss vs. bypass).
+ if (allocationPolicy) {
+ DPRINTF(GPUTLB, "allocating entry w/ addr %#x\n",
+ virt_page_addr);
+
+ local_entry = insert(virt_page_addr, *new_entry);
+ }
+
+ assert(local_entry);
+ }
+
+ /**
+ * At this point the packet carries an up-to-date tlbEntry pointer
+ * in its senderState.
+ * Next step is to do the paging protection checks.
+ */
+ DPRINTF(GPUTLB, "Entry found with vaddr %#x, doing protection checks "
+ "while paddr was %#x.\n", local_entry->vaddr,
+ local_entry->paddr);
+
+ pagingProtectionChecks(tc, pkt, local_entry, mode);
+ // Splice the in-page offset onto the physical frame address.
+ int page_size = local_entry->size();
+ Addr paddr = local_entry->paddr | (vaddr & (page_size - 1));
+ DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, paddr);
+
+ // Since this packet will be sent through the cpu side slave port,
+ // it must be converted to a response pkt if it is not one already
+ if (pkt->isRequest()) {
+ pkt->makeTimingResponse();
+ }
+
+ pkt->req->setPaddr(paddr);
+
+ if (local_entry->uncacheable) {
+ pkt->req->setFlags(Request::UNCACHEABLE);
+ }
+
+ //send packet back to coalescer
+ cpuSidePort[0]->sendTimingResp(pkt);
+ //schedule cleanup event
+ cleanupQueue.push(virt_page_addr);
+
+ // schedule this only once per cycle.
+ // The check is required because we might have multiple translations
+ // returning the same cycle
+ // this is a maximum priority event and must be on the same cycle
+ // as the cleanup event in TLBCoalescer to avoid a race with
+ // IssueProbeEvent caused by TLBCoalescer::MemSidePort::recvReqRetry
+ if (!cleanupEvent.scheduled())
+ schedule(cleanupEvent, curTick());
+ }
+
+    /**
+     * Here we take the appropriate actions based on the result of the
+     * TLB lookup: service a hit, forward a miss to the lower-level TLB,
+     * or do a (functional) page walk if this is the last-level TLB.
+     */
+    void
+    GpuTLB::translationReturn(Addr virtPageAddr, tlbOutcome outcome,
+                              PacketPtr pkt)
+    {
+        DPRINTF(GPUTLB, "Triggered TLBEvent for addr %#x\n", virtPageAddr);
+
+        assert(translationReturnEvent[virtPageAddr]);
+        assert(pkt);
+
+        TranslationState *tmp_sender_state =
+            safe_cast<TranslationState*>(pkt->senderState);
+
+        // number of uncoalesced requests this packet represents at this
+        // TLB level; global cycle stats are scaled by it so that cost is
+        // charged once per original request
+        int req_cnt = tmp_sender_state->reqCnt.back();
+        // prefetch translations do not contribute to the stats
+        bool update_stats = !tmp_sender_state->prefetch;
+
+
+        if (outcome == TLB_HIT) {
+            handleTranslationReturn(virtPageAddr, TLB_HIT, pkt);
+
+            if (update_stats) {
+                accessCycles += (req_cnt * curTick());
+                localCycles += curTick();
+            }
+
+        } else if (outcome == TLB_MISS) {
+
+            DPRINTF(GPUTLB, "This is a TLB miss\n");
+            if (update_stats) {
+                accessCycles += (req_cnt*curTick());
+                localCycles += curTick();
+            }
+
+            if (hasMemSidePort) {
+                // the one cycle added here represents the delay from when
+                // we get the reply back till when we propagate it to the
+                // coalescer above.
+                if (update_stats) {
+                    accessCycles += (req_cnt * 1);
+                    localCycles += 1;
+                }
+
+                /**
+                 * There is a TLB below. Send the coalesced request.
+                 * We actually send the very first packet of all the
+                 * pending packets for this virtual page address.
+                 */
+                if (!memSidePort[0]->sendTimingReq(pkt)) {
+                    DPRINTF(GPUTLB, "Failed sending translation request to "
+                            "lower level TLB for addr %#x\n", virtPageAddr);
+
+                    // queue the packet; it is re-sent when the lower-level
+                    // TLB issues a retry
+                    memSidePort[0]->retries.push_back(pkt);
+                } else {
+                    DPRINTF(GPUTLB, "Sent translation request to lower level "
+                            "TLB for addr %#x\n", virtPageAddr);
+                }
+            } else {
+                //this is the last level TLB. Start a page walk
+                DPRINTF(GPUTLB, "Last level TLB - start a page walk for "
+                        "addr %#x\n", virtPageAddr);
+
+                // the matching += in the PAGE_WALK branch below turns this
+                // -=/+= pair into (req_cnt * walk duration)
+                if (update_stats)
+                    pageTableCycles -= (req_cnt*curTick());
+
+                TLBEvent *tlb_event = translationReturnEvent[virtPageAddr];
+                assert(tlb_event);
+                tlb_event->updateOutcome(PAGE_WALK);
+                schedule(tlb_event, curTick() + ticks(missLatency2));
+            }
+        } else if (outcome == PAGE_WALK) {
+            if (update_stats)
+                pageTableCycles += (req_cnt*curTick());
+
+            // Need to access the page table and update the TLB
+            DPRINTF(GPUTLB, "Doing a page walk for address %#x\n",
+                    virtPageAddr);
+
+            TranslationState *sender_state =
+                safe_cast<TranslationState*>(pkt->senderState);
+
+            Process *p = sender_state->tc->getProcessPtr();
+            TlbEntry newEntry;
+            Addr vaddr = pkt->req->getVaddr();
+    #ifndef NDEBUG
+            Addr alignedVaddr = p->pTable->pageAlign(vaddr);
+            assert(alignedVaddr == virtPageAddr);
+    #endif
+            bool success;
+            success = p->pTable->lookup(vaddr, newEntry);
+            if (!success && sender_state->tlbMode != BaseTLB::Execute) {
+                // a failed data access may be a stack access below the
+                // stack pointer; try growing the stack and look up again
+                if (p->fixupStackFault(vaddr)) {
+                    success = p->pTable->lookup(vaddr, newEntry);
+                }
+            }
+
+            // NOTE(review): alignedVaddr is only declared under !NDEBUG;
+            // this relies on DPRINTF compiling its args out in builds that
+            // define NDEBUG -- confirm for all build flavors
+            DPRINTF(GPUTLB, "Mapping %#x to %#x\n", alignedVaddr,
+                    newEntry.pageStart());
+
+            // pass the (possibly invalid) translation back up; "success"
+            // becomes the entry's valid bit
+            sender_state->tlbEntry =
+                new GpuTlbEntry(0, newEntry.vaddr, newEntry.paddr, success);
+
+            handleTranslationReturn(virtPageAddr, TLB_MISS, pkt);
+        } else if (outcome == MISS_RETURN) {
+            /** we add an extra cycle in the return path of the translation
+             * requests in between the various TLB levels.
+             */
+            handleTranslationReturn(virtPageAddr, TLB_MISS, pkt);
+        } else {
+            assert(false);
+        }
+    }
+
+    void
+    GpuTLB::TLBEvent::process()
+    {
+        // delegate to the owning TLB with the state captured in the event
+        tlb->translationReturn(virtPageAddr, outcome, pkt);
+    }
+
+    // human-readable event name used by the event queue/tracing
+    const char*
+    GpuTLB::TLBEvent::description() const
+    {
+        return "trigger translationDoneEvent";
+    }
+
+    // updateOutcome updates the tlbOutcome this event will act on when it
+    // fires (e.g., a TLB_MISS event is re-armed as PAGE_WALK)
+    void
+    GpuTLB::TLBEvent::updateOutcome(tlbOutcome _outcome)
+    {
+        outcome = _outcome;
+    }
+
+    // virtual page address this event was scheduled for
+    Addr
+    GpuTLB::TLBEvent::getTLBEventVaddr()
+    {
+        return virtPageAddr;
+    }
+
+    /*
+     * recvTiming receives a coalesced timing request from a TLBCoalescer
+     * and hands it to issueTLBLookup(). The packet is rejected (returns
+     * false) only when the TLB has already reached its maximum number of
+     * outstanding coalesced requests.
+     */
+    bool
+    GpuTLB::CpuSidePort::recvTimingReq(PacketPtr pkt)
+    {
+        // reject and force a retry if we are out of request slots
+        if (tlb->outstandingReqs >= tlb->maxCoalescedReqs) {
+            DPRINTF(GPUTLB, "Reached maxCoalescedReqs number %d\n",
+                    tlb->outstandingReqs);
+            return false;
+        }
+
+        tlb->issueTLBLookup(pkt);
+        // one more translation request is now in flight
+        tlb->outstandingReqs++;
+        return true;
+    }
+
+    /**
+     * handleFuncTranslationReturn is called on a TLB hit,
+     * when a TLB miss returns or when a page fault returns.
+     * It updates LRU, inserts the TLB entry on a miss
+     * depending on the allocation policy and does the required
+     * protection checks. It does NOT create a new packet to
+     * update the packet's addr; this is done in hsail-gpu code.
+     */
+    void
+    GpuTLB::handleFuncTranslationReturn(PacketPtr pkt, tlbOutcome tlb_outcome)
+    {
+        TranslationState *sender_state =
+            safe_cast<TranslationState*>(pkt->senderState);
+
+        ThreadContext *tc = sender_state->tc;
+        Mode mode = sender_state->tlbMode;
+        Addr vaddr = pkt->req->getVaddr();
+
+        GpuTlbEntry *local_entry, *new_entry;
+
+        if (tlb_outcome == TLB_HIT) {
+            DPRINTF(GPUTLB, "Functional Translation Done - TLB hit for addr "
+                    "%#x\n", vaddr);
+
+            // on a hit the senderState already points at this TLB's entry
+            local_entry = sender_state->tlbEntry;
+        } else {
+            DPRINTF(GPUTLB, "Functional Translation Done - TLB miss for addr "
+                    "%#x\n", vaddr);
+
+            // We are returning either from a page walk or from a hit at a lower
+            // TLB level. The senderState should be "carrying" a pointer to the
+            // correct TLBEntry.
+            new_entry = sender_state->tlbEntry;
+            assert(new_entry);
+            local_entry = new_entry;
+
+            if (allocationPolicy) {
+                Addr virt_page_addr = roundDown(vaddr, TheISA::PageBytes);
+
+                DPRINTF(GPUTLB, "allocating entry w/ addr %#x\n",
+                        virt_page_addr);
+
+                // allocate-on-miss: keep the entry returned by insert()
+                // so we read size/paddr from the resident copy
+                local_entry = insert(virt_page_addr, *new_entry);
+            }
+
+            assert(local_entry);
+        }
+
+        DPRINTF(GPUTLB, "Entry found with vaddr %#x, doing protection checks "
+                "while paddr was %#x.\n", local_entry->vaddr,
+                local_entry->paddr);
+
+        // Do paging checks if it's a normal functional access. If it's for a
+        // prefetch, then sometimes you can try to prefetch something that won't
+        // pass protection. We don't actually want to fault because there is no
+        // demand access to deem this a violation. Just put it in the TLB and
+        // it will fault if indeed a future demand access touches it in
+        // violation.
+        if (!sender_state->prefetch && sender_state->tlbEntry->valid)
+            pagingProtectionChecks(tc, pkt, local_entry, mode);
+
+        // compose the physical address from the frame and the page offset
+        int page_size = local_entry->size();
+        Addr paddr = local_entry->paddr | (vaddr & (page_size - 1));
+        DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, paddr);
+
+        pkt->req->setPaddr(paddr);
+
+        if (local_entry->uncacheable)
+            pkt->req->setFlags(Request::UNCACHEABLE);
+    }
+
+    // This is used for atomic translations. Need to
+    // make it all happen during the same cycle.
+    void
+    GpuTLB::CpuSidePort::recvFunctional(PacketPtr pkt)
+    {
+        TranslationState *sender_state =
+            safe_cast<TranslationState*>(pkt->senderState);
+
+        ThreadContext *tc = sender_state->tc;
+        // prefetch translations do not contribute to the stats
+        bool update_stats = !sender_state->prefetch;
+
+        Addr virt_page_addr = roundDown(pkt->req->getVaddr(),
+                                        TheISA::PageBytes);
+
+        if (update_stats)
+            tlb->updatePageFootprint(virt_page_addr);
+
+        // do the TLB lookup; local stats are updated only for demand
+        // (non-prefetch) accesses
+        bool success = tlb->tlbLookup(pkt->req, tc, update_stats);
+        tlbOutcome tlb_outcome = success ? TLB_HIT : TLB_MISS;
+
+        // functional mode means no coalescing
+        // global metrics are the same as the local metrics
+        if (update_stats) {
+            tlb->globalNumTLBAccesses++;
+
+            if (success) {
+                // record which TLB level serviced this translation
+                sender_state->hitLevel = sender_state->reqCnt.size();
+                tlb->globalNumTLBHits++;
+            }
+        }
+
+        if (!success) {
+            if (update_stats)
+                tlb->globalNumTLBMisses++;
+            if (tlb->hasMemSidePort) {
+                // there is a TLB below -> propagate down the TLB hierarchy
+                tlb->memSidePort[0]->sendFunctional(pkt);
+                // If no valid translation from a prefetch, then just return
+                if (sender_state->prefetch && !pkt->req->hasPaddr())
+                    return;
+            } else {
+                // Need to access the page table and update the TLB
+                DPRINTF(GPUTLB, "Doing a page walk for address %#x\n",
+                        virt_page_addr);
+
+                Process *p = tc->getProcessPtr();
+                TlbEntry newEntry;
+
+                Addr vaddr = pkt->req->getVaddr();
+            #ifndef NDEBUG
+                Addr alignedVaddr = p->pTable->pageAlign(vaddr);
+                assert(alignedVaddr == virt_page_addr);
+            #endif
+
+                bool success = p->pTable->lookup(vaddr, newEntry);
+                if (!success && sender_state->tlbMode != BaseTLB::Execute) {
+                    // may be a stack access below the stack pointer; try
+                    // growing the stack and look up again
+                    if (p->fixupStackFault(vaddr))
+                        success = p->pTable->lookup(vaddr, newEntry);
+                }
+
+                if (!sender_state->prefetch) {
+                    // no PageFaults are permitted after
+                    // the second page table lookup
+                    assert(success);
+
+                    // NOTE(review): alignedVaddr is only declared under
+                    // !NDEBUG; relies on DPRINTF args being compiled out
+                    // in NDEBUG builds -- confirm for all build flavors
+                    DPRINTF(GPUTLB, "Mapping %#x to %#x\n", alignedVaddr,
+                            newEntry.pageStart());
+
+                    sender_state->tlbEntry = new GpuTlbEntry(0, newEntry.vaddr,
+                                                             newEntry.paddr,
+                                                             success);
+                } else {
+                    // If this was a prefetch, then do the normal thing if it
+                    // was a successful translation. Otherwise, send an empty
+                    // TLB entry back so that it can be figured out as empty and
+                    // handled accordingly.
+                    if (success) {
+                        DPRINTF(GPUTLB, "Mapping %#x to %#x\n", alignedVaddr,
+                                newEntry.pageStart());
+
+                        sender_state->tlbEntry = new GpuTlbEntry(0,
+                                                                 newEntry.vaddr,
+                                                                 newEntry.paddr,
+                                                                 success);
+                    } else {
+                        DPRINTF(GPUPrefetch, "Prefetch failed %#x\n",
+                                alignedVaddr);
+
+                        // default-constructed entry: valid bit unset
+                        sender_state->tlbEntry = new GpuTlbEntry();
+
+                        return;
+                    }
+                }
+            }
+        } else {
+            DPRINTF(GPUPrefetch, "Functional Hit for vaddr %#x\n",
+                    tlb->lookup(pkt->req->getVaddr()));
+
+            // re-lookup to fetch the entry; LRU is only touched for
+            // demand accesses (update_stats)
+            GpuTlbEntry *entry = tlb->lookup(pkt->req->getVaddr(),
+                                             update_stats);
+
+            assert(entry);
+
+            // hand a copy of the entry back to the requester
+            sender_state->tlbEntry =
+                new GpuTlbEntry(0, entry->vaddr, entry->paddr, entry->valid);
+        }
+        // This is the function that would populate pkt->req with the paddr of
+        // the translation. But if no translation happens (i.e Prefetch fails)
+        // then the early returns in the above code will keep this function
+        // from executing.
+        tlb->handleFuncTranslationReturn(pkt, tlb_outcome);
+    }
+
+    void
+    GpuTLB::CpuSidePort::recvReqRetry()
+    {
+        // The CPUSidePort never sends anything but replies. No retries
+        // expected.
+        assert(false);
+    }
+
+    AddrRangeList
+    GpuTLB::CpuSidePort::getAddrRanges() const
+    {
+        // TLB ports are not memory-mapped; the master currently never
+        // checks these ranges, so report none.
+        return AddrRangeList();
+    }
+
+    /**
+     * MemSidePort receives the packet back.
+     * We need to call the handleTranslationReturn
+     * and propagate up the hierarchy.
+     */
+    bool
+    GpuTLB::MemSidePort::recvTimingResp(PacketPtr pkt)
+    {
+        Addr virt_page_addr = roundDown(pkt->req->getVaddr(),
+                                        TheISA::PageBytes);
+
+        DPRINTF(GPUTLB, "MemSidePort recvTiming for virt_page_addr %#x\n",
+                virt_page_addr);
+
+        // the pending TLBEvent for this page must still exist and match
+        TLBEvent *tlb_event = tlb->translationReturnEvent[virt_page_addr];
+        assert(tlb_event);
+        assert(virt_page_addr == tlb_event->getTLBEventVaddr());
+
+        // re-arm the event as MISS_RETURN and charge one cycle for the
+        // hop between TLB levels on the return path
+        tlb_event->updateOutcome(MISS_RETURN);
+        tlb->schedule(tlb_event, curTick()+tlb->ticks(1));
+
+        return true;
+    }
+
+    void
+    GpuTLB::MemSidePort::recvReqRetry()
+    {
+        // No retries should reach the TLB. The retries
+        // should only reach the TLBCoalescer.
+        assert(false);
+    }
+
+    /**
+     * Free the TLBEvents of all completed translations queued for
+     * cleanup, release their outstanding-request slots, and then ask the
+     * coalescer(s) above to retry any requests they had to hold back.
+     */
+    void
+    GpuTLB::cleanup()
+    {
+        while (!cleanupQueue.empty()) {
+            Addr addr_to_clean = cleanupQueue.front();
+            cleanupQueue.pop();
+
+            // reclaim the TLBEvent for this finished translation
+            delete translationReturnEvent[addr_to_clean];
+            translationReturnEvent.erase(addr_to_clean);
+
+            // one fewer coalesced request in flight
+            --outstandingReqs;
+        }
+
+        // slots were freed above, so prompt the higher-level coalescer
+        // to retry its pending requests
+        for (auto *port : cpuSidePort) {
+            port->sendRetryReq();
+        }
+    }
+
+    /**
+     * Record one access to the given virtual page in the footprint table:
+     * counts unique pages, accumulates reuse distance (ticks between
+     * consecutive accesses to the same page), and, when accessDistance
+     * profiling is on, logs the TLB access counter at which this page was
+     * touched.
+     */
+    void
+    GpuTLB::updatePageFootprint(Addr virt_page_addr)
+    {
+
+        std::pair<AccessPatternTable::iterator, bool> ret;
+
+        // candidate record used only if this page was not seen before
+        AccessInfo tmp_access_info;
+        tmp_access_info.lastTimeAccessed = 0;
+        tmp_access_info.accessesPerPage = 0;
+        tmp_access_info.totalReuseDistance = 0;
+        tmp_access_info.sumDistance = 0;
+        tmp_access_info.meanDistance = 0;
+
+        // insert() leaves an existing record untouched; ret.second tells
+        // us whether this is the first access to the page
+        ret = TLBFootprint.insert(AccessPatternTable::value_type(virt_page_addr,
+                                  tmp_access_info));
+
+        bool first_page_access = ret.second;
+
+        if (first_page_access) {
+            numUniquePages++;
+        } else {
+            // NOTE(review): curTick() is a 64-bit Tick; storing the
+            // difference in an int (and the unsigned fields of AccessInfo)
+            // can truncate for long simulations -- confirm acceptable
+            int accessed_before;
+            accessed_before = curTick() - ret.first->second.lastTimeAccessed;
+            ret.first->second.totalReuseDistance += accessed_before;
+        }
+
+        ret.first->second.accessesPerPage++;
+        ret.first->second.lastTimeAccessed = curTick();
+
+        if (accessDistance) {
+            // remember the value of the local access counter at the time
+            // of this access; differences are computed in exitCallback()
+            ret.first->second.localTLBAccesses
+                .push_back(localNumTLBAccesses.value());
+        }
+    }
+
+    /**
+     * End-of-simulation hook: computes the average reuse distance over
+     * all pages seen by this TLB and, if accessDistance is enabled,
+     * writes per-page max/mean/stddev access-distance statistics to a
+     * CSV file in the simulation output directory.
+     */
+    void
+    GpuTLB::exitCallback()
+    {
+        std::ostream *page_stat_file = nullptr;
+
+        if (accessDistance) {
+
+            // print per page statistics to a separate file (.csv format)
+            // simout is the gem5 output directory (default is m5out or the one
+            // specified with -d
+            // NOTE(review): stream obtained from simout is never closed
+            // here -- presumably owned by the output directory; verify
+            page_stat_file = simout.create(name().c_str());
+
+            // print header
+            *page_stat_file << "page,max_access_distance,mean_access_distance, "
+                            << "stddev_distance" << std::endl;
+        }
+
+        // update avg. reuse distance footprint
+        AccessPatternTable::iterator iter, iter_begin, iter_end;
+        unsigned int sum_avg_reuse_distance_per_page = 0;
+
+        // iterate through all pages seen by this TLB
+        for (iter = TLBFootprint.begin(); iter != TLBFootprint.end(); iter++) {
+            sum_avg_reuse_distance_per_page += iter->second.totalReuseDistance /
+                                               iter->second.accessesPerPage;
+
+            if (accessDistance) {
+                // convert the logged absolute access-counter samples into
+                // distances: entry i becomes a[i] - a[i-1] - 1, i.e. the
+                // number of other TLB accesses between consecutive
+                // accesses to this page (entry 0 becomes 0)
+                unsigned int tmp = iter->second.localTLBAccesses[0];
+                unsigned int prev = tmp;
+
+                for (int i = 0; i < iter->second.localTLBAccesses.size(); ++i) {
+                    if (i) {
+                        tmp = prev + 1;
+                    }
+
+                    prev = iter->second.localTLBAccesses[i];
+                    // update the localTLBAccesses value
+                    // with the actual difference
+                    iter->second.localTLBAccesses[i] -= tmp;
+                    // compute the sum of AccessDistance per page
+                    // used later for mean
+                    iter->second.sumDistance +=
+                        iter->second.localTLBAccesses[i];
+                }
+
+                iter->second.meanDistance =
+                    iter->second.sumDistance / iter->second.accessesPerPage;
+
+                // compute std_dev and max (we need a second round because we
+                // need to know the mean value
+                unsigned int max_distance = 0;
+                unsigned int stddev_distance = 0;
+
+                for (int i = 0; i < iter->second.localTLBAccesses.size(); ++i) {
+                    unsigned int tmp_access_distance =
+                        iter->second.localTLBAccesses[i];
+
+                    if (tmp_access_distance > max_distance) {
+                        max_distance = tmp_access_distance;
+                    }
+
+                    unsigned int diff =
+                        tmp_access_distance - iter->second.meanDistance;
+                    stddev_distance += pow(diff, 2);
+
+                }
+
+                stddev_distance =
+                    sqrt(stddev_distance/iter->second.accessesPerPage);
+
+                if (page_stat_file) {
+                    *page_stat_file << std::hex << iter->first << ",";
+                    *page_stat_file << std::dec << max_distance << ",";
+                    *page_stat_file << std::dec << iter->second.meanDistance
+                                    << ",";
+                    *page_stat_file << std::dec << stddev_distance;
+                    *page_stat_file << std::endl;
+                }
+
+                // erase the localTLBAccesses array
+                iter->second.localTLBAccesses.clear();
+            }
+        }
+
+        if (!TLBFootprint.empty()) {
+            avgReuseDistance =
+                sum_avg_reuse_distance_per_page / TLBFootprint.size();
+        }
+
+        //clear the TLBFootprint map
+        TLBFootprint.clear();
+    }
+} // namespace X86ISA
+
+// Factory hook invoked by the python configuration system to
+// instantiate a GpuTLB from its SimObject parameters.
+X86ISA::GpuTLB*
+X86GPUTLBParams::create()
+{
+    return new X86ISA::GpuTLB(this);
+}
+
diff --git a/src/gpu-compute/gpu_tlb.hh b/src/gpu-compute/gpu_tlb.hh
new file mode 100644
index 000000000..3549c598b
--- /dev/null
+++ b/src/gpu-compute/gpu_tlb.hh
@@ -0,0 +1,465 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Lisa Hsu
+ */
+
+#ifndef __GPU_TLB_HH__
+#define __GPU_TLB_HH__
+
+#include <fstream>
+#include <list>
+#include <queue>
+#include <string>
+#include <vector>
+
+#include "arch/generic/tlb.hh"
+#include "arch/x86/pagetable.hh"
+#include "arch/x86/pagetable_walker.hh"
+#include "arch/x86/regs/segment.hh"
+#include "base/callback.hh"
+#include "base/misc.hh"
+#include "base/statistics.hh"
+#include "gpu-compute/compute_unit.hh"
+#include "mem/mem_object.hh"
+#include "mem/port.hh"
+#include "mem/request.hh"
+#include "params/X86GPUTLB.hh"
+#include "sim/sim_object.hh"
+
+class BaseTLB;
+class Packet;
+class ThreadContext;
+
+namespace X86ISA
+{
+    /**
+     * A TlbEntry extended with a valid bit, so that unsuccessful
+     * translations (e.g., failed prefetches) can be carried through the
+     * TLB hierarchy and recognized by the receiver.
+     */
+    class GpuTlbEntry : public TlbEntry
+    {
+      public:
+        GpuTlbEntry(Addr asn, Addr _vaddr, Addr _paddr, bool _valid)
+          : TlbEntry(asn, _vaddr, _paddr, false, false), valid(_valid) { }
+
+        // default entry is invalid (valid bit left default-initialized)
+        GpuTlbEntry() : TlbEntry() { }
+
+        // true if this entry holds a successful translation
+        bool valid;
+    };
+
+    /**
+     * GpuTLB models one level of a (possibly multi-level) x86 TLB
+     * hierarchy for the GPU. It receives coalesced translation requests
+     * on its cpu-side ports, services them from its set-associative
+     * entry array, and either forwards misses to a lower-level TLB via
+     * its mem-side port or performs a functional page-table walk if it
+     * is the last level.
+     */
+    class GpuTLB : public MemObject
+    {
+      protected:
+        friend class Walker;
+
+        typedef std::list<GpuTlbEntry*> EntryList;
+
+        uint32_t configAddress;
+
+        // TLB clock: will inherit clock from shader's clock period in terms
+        // of number of ticks of curTime (aka global simulation clock)
+        // The assignment of TLB clock from shader clock is done in the python
+        // config files.
+        int clock;
+
+      public:
+        // clock related functions ; maps to-and-from Simulation ticks and
+        // object clocks.
+        Tick frequency() const { return SimClock::Frequency / clock; }
+
+        Tick
+        ticks(int numCycles) const
+        {
+            return (Tick)clock * numCycles;
+        }
+
+        Tick curCycle() const { return curTick() / clock; }
+        Tick tickToCycles(Tick val) const { return val / clock;}
+
+        typedef X86GPUTLBParams Params;
+        GpuTLB(const Params *p);
+        ~GpuTLB();
+
+        typedef enum BaseTLB::Mode Mode;
+
+        class Translation
+        {
+          public:
+            virtual ~Translation() { }
+
+            /**
+             * Signal that the translation has been delayed due to a hw page
+             * table walk.
+             */
+            virtual void markDelayed() = 0;
+
+            /**
+             * The memory for this object may be dynamically allocated, and it
+             * may be responsible for cleaning itself up which will happen in
+             * this function. Once it's called the object is no longer valid.
+             */
+            virtual void finish(Fault fault, RequestPtr req, ThreadContext *tc,
+                    Mode mode) = 0;
+        };
+
+        void dumpAll();
+        GpuTlbEntry *lookup(Addr va, bool update_lru=true);
+        void setConfigAddress(uint32_t addr);
+
+      protected:
+        EntryList::iterator lookupIt(Addr va, bool update_lru=true);
+        Walker *walker;
+
+      public:
+        Walker *getWalker();
+        void invalidateAll();
+        void invalidateNonGlobal();
+        void demapPage(Addr va, uint64_t asn);
+
+      protected:
+        int size;
+        int assoc;
+        int numSets;
+
+        /**
+         *  true if this is a fully-associative TLB
+         */
+        bool FA;
+        Addr setMask;
+
+        /**
+         * Allocation Policy: true if we always allocate on a hit, false
+         * otherwise. Default is true.
+         */
+        bool allocationPolicy;
+
+        /**
+         * if true, then this is not the last level TLB
+         */
+        bool hasMemSidePort;
+
+        /**
+         * Print out accessDistance stats. One stat file
+         * per TLB.
+         */
+        bool accessDistance;
+
+        // backing storage for all TLB entries (size elements)
+        GpuTlbEntry *tlb;
+
+        /*
+         * It's a per-set list. As long as we have not reached
+         * the full capacity of the given set, grab an entry from
+         * the freeList.
+         */
+        std::vector<EntryList> freeList;
+
+        /**
+         * An entryList per set is the equivalent of an LRU stack;
+         * it's used to guide replacement decisions. The head of the list
+         * contains the MRU TLB entry of the given set. If the freeList
+         * for this set is empty, the last element of the list
+         * is evicted (i.e., dropped on the floor).
+         */
+        std::vector<EntryList> entryList;
+
+        Fault translateInt(RequestPtr req, ThreadContext *tc);
+
+        Fault translate(RequestPtr req, ThreadContext *tc,
+                Translation *translation, Mode mode, bool &delayedResponse,
+                bool timing, int &latency);
+
+      public:
+        // latencies for a TLB hit, miss and page fault
+        int hitLatency;
+        int missLatency1;
+        int missLatency2;
+
+        // local_stats are as seen from the TLB
+        // without taking into account coalescing
+        Stats::Scalar localNumTLBAccesses;
+        Stats::Scalar localNumTLBHits;
+        Stats::Scalar localNumTLBMisses;
+        Stats::Formula localTLBMissRate;
+
+        // global_stats are as seen from the
+        // CU's perspective taking into account
+        // all coalesced requests.
+        Stats::Scalar globalNumTLBAccesses;
+        Stats::Scalar globalNumTLBHits;
+        Stats::Scalar globalNumTLBMisses;
+        Stats::Formula globalTLBMissRate;
+
+        // from the CU perspective (global)
+        Stats::Scalar accessCycles;
+        // from the CU perspective (global)
+        Stats::Scalar pageTableCycles;
+        Stats::Scalar numUniquePages;
+        // from the perspective of this TLB
+        Stats::Scalar localCycles;
+        // from the perspective of this TLB
+        Stats::Formula localLatency;
+        // I take the avg. per page and then
+        // the avg. over all pages.
+        Stats::Scalar avgReuseDistance;
+
+        void regStats();
+        void updatePageFootprint(Addr virt_page_addr);
+        void printAccessPattern();
+
+
+        Fault translateAtomic(RequestPtr req, ThreadContext *tc, Mode mode,
+                              int &latency);
+
+        void translateTiming(RequestPtr req, ThreadContext *tc,
+                             Translation *translation, Mode mode,
+                             int &latency);
+
+        Tick doMmuRegRead(ThreadContext *tc, Packet *pkt);
+        Tick doMmuRegWrite(ThreadContext *tc, Packet *pkt);
+
+        GpuTlbEntry *insert(Addr vpn, GpuTlbEntry &entry);
+
+        // Checkpointing
+        virtual void serialize(CheckpointOut& cp) const;
+        virtual void unserialize(CheckpointIn& cp);
+        void issueTranslation();
+        enum tlbOutcome {TLB_HIT, TLB_MISS, PAGE_WALK, MISS_RETURN};
+        bool tlbLookup(RequestPtr req, ThreadContext *tc, bool update_stats);
+
+        void handleTranslationReturn(Addr addr, tlbOutcome outcome,
+                                     PacketPtr pkt);
+
+        void handleFuncTranslationReturn(PacketPtr pkt, tlbOutcome outcome);
+
+        void pagingProtectionChecks(ThreadContext *tc, PacketPtr pkt,
+                                    GpuTlbEntry *tlb_entry, Mode mode);
+
+        void updatePhysAddresses(Addr virt_page_addr, GpuTlbEntry *tlb_entry,
+                                 Addr phys_page_addr);
+
+        void issueTLBLookup(PacketPtr pkt);
+
+        // CpuSidePort is the TLB Port closer to the CPU/CU side
+        class CpuSidePort : public SlavePort
+        {
+          public:
+            CpuSidePort(const std::string &_name, GpuTLB * gpu_TLB,
+                        PortID _index)
+                : SlavePort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { }
+
+          protected:
+            GpuTLB *tlb;
+            int index;
+
+            virtual bool recvTimingReq(PacketPtr pkt);
+            virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
+            virtual void recvFunctional(PacketPtr pkt);
+            virtual void recvRangeChange() { }
+            virtual void recvReqRetry();
+            virtual void recvRespRetry() { assert(false); }
+            virtual AddrRangeList getAddrRanges() const;
+        };
+
+        /**
+         * MemSidePort is the TLB Port closer to the memory side
+         * If this is a last level TLB then this port will not be connected.
+         *
+         * Future action item: if we ever do real page walks, then this port
+         * should be connected to a RubyPort.
+         */
+        class MemSidePort : public MasterPort
+        {
+          public:
+            MemSidePort(const std::string &_name, GpuTLB * gpu_TLB,
+                        PortID _index)
+                : MasterPort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { }
+
+            // packets that failed sendTimingReq; re-sent on recvReqRetry
+            // of the lower-level TLB
+            std::deque<PacketPtr> retries;
+
+          protected:
+            GpuTLB *tlb;
+            int index;
+
+            virtual bool recvTimingResp(PacketPtr pkt);
+            virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
+            virtual void recvFunctional(PacketPtr pkt) { }
+            virtual void recvRangeChange() { }
+            virtual void recvReqRetry();
+        };
+
+        // TLB ports on the cpu Side
+        std::vector<CpuSidePort*> cpuSidePort;
+        // TLB ports on the memory side
+        std::vector<MemSidePort*> memSidePort;
+
+        BaseMasterPort &getMasterPort(const std::string &if_name,
+                                      PortID idx=InvalidPortID);
+
+        BaseSlavePort &getSlavePort(const std::string &if_name,
+                                    PortID idx=InvalidPortID);
+
+        /**
+         * TLB TranslationState: this currently is a somewhat bastardization of
+         * the usage of SenderState, whereby the receiver of a packet is not
+         * usually supposed to need to look at the contents of the senderState,
+         * you're really only supposed to look at what you pushed on, pop it
+         * off, and send it back.
+         *
+         * However, since there is state that we want to pass to the TLBs using
+         * the send/recv Timing/Functional/etc. APIs, which don't allow for new
+         * arguments, we need a common TLB senderState to pass between TLBs,
+         * both "forwards" and "backwards."
+         *
+         * So, basically, the rule is that any packet received by a TLB port
+         * (cpuside OR memside) must be safely castable to a TranslationState.
+         */
+
+        struct TranslationState : public Packet::SenderState
+        {
+            // TLB mode, read or write
+            Mode tlbMode;
+            // Thread context associated with this req
+            ThreadContext *tc;
+
+            /*
+            * TLB entry to be populated and passed back and filled in
+            * previous TLBs.  Equivalent to the data cache concept of
+            * "data return."
+            */
+            GpuTlbEntry *tlbEntry;
+            // Is this a TLB prefetch request?
+            bool prefetch;
+            // When was the req for this translation issued
+            uint64_t issueTime;
+            // Remember where this came from
+            std::vector<SlavePort*>ports;
+
+            // keep track of #uncoalesced reqs per packet per TLB level;
+            // reqCnt per level >= reqCnt higher level
+            std::vector<int> reqCnt;
+            // TLB level this packet hit in; 0 if it hit in the page table
+            int hitLevel;
+            Packet::SenderState *saved;
+
+            TranslationState(Mode tlb_mode, ThreadContext *_tc,
+                             bool _prefetch=false,
+                             Packet::SenderState *_saved=nullptr)
+                : tlbMode(tlb_mode), tc(_tc), tlbEntry(nullptr),
+                  prefetch(_prefetch), issueTime(0),
+                  hitLevel(0),saved(_saved) { }
+        };
+
+        // maximum number of permitted coalesced requests per cycle
+        int maxCoalescedReqs;
+
+        // Current number of outstanding coalesced requests.
+        // Should be <= maxCoalescedReqs
+        int outstandingReqs;
+
+        /**
+         * A TLBEvent is scheduled after the TLB lookup and helps us take the
+         * appropriate actions:
+         *   (e.g., update TLB on a hit,
+         *   send request to lower level TLB on a miss,
+         *   or start a page walk if this was the last-level TLB).
+         */
+        void translationReturn(Addr virtPageAddr, tlbOutcome outcome,
+                               PacketPtr pkt);
+
+        class TLBEvent : public Event
+        {
+            private:
+                GpuTLB *tlb;
+                Addr virtPageAddr;
+                /**
+                 * outcome can be TLB_HIT, TLB_MISS, or PAGE_WALK
+                 */
+                tlbOutcome outcome;
+                PacketPtr pkt;
+
+            public:
+                TLBEvent(GpuTLB *_tlb, Addr _addr, tlbOutcome outcome,
+                        PacketPtr _pkt);
+
+                void process();
+                const char *description() const;
+
+                // updateOutcome updates the tlbOutcome of a TLBEvent
+                void updateOutcome(tlbOutcome _outcome);
+                Addr getTLBEventVaddr();
+        };
+
+        // one in-flight TLBEvent per pending virtual page address
+        std::unordered_map<Addr, TLBEvent*> translationReturnEvent;
+
+        // this FIFO queue keeps track of the virt. page addresses
+        // that are pending cleanup
+        std::queue<Addr> cleanupQueue;
+
+        // the cleanupEvent is scheduled after a TLBEvent triggers in order to
+        // free memory and do the required clean-up
+        void cleanup();
+
+        EventWrapper<GpuTLB, &GpuTLB::cleanup> cleanupEvent;
+
+        /**
+         * This hash map will use the virtual page address as a key
+         * and will keep track of total number of accesses per page
+         */
+
+        struct AccessInfo
+        {
+            unsigned int lastTimeAccessed; // last access to this page
+            unsigned int accessesPerPage;
+            // need to divide it by accessesPerPage at the end
+            unsigned int totalReuseDistance;
+
+            /**
+             * The field below will help us compute the access distance,
+             * that is the number of (coalesced) TLB accesses that
+             * happened in between each access to this page
+             *
+             * localTLBAccesses[x] is the value of localTLBNumAccesses
+             * when the page <Addr> was accessed for the <x>th time
+             */
+            std::vector<unsigned int> localTLBAccesses;
+            unsigned int sumDistance;
+            unsigned int meanDistance;
+        };
+
+        typedef std::unordered_map<Addr, AccessInfo> AccessPatternTable;
+        AccessPatternTable TLBFootprint;
+
+        // Called at the end of simulation to dump page access stats.
+        void exitCallback();
+
+        EventWrapper<GpuTLB, &GpuTLB::exitCallback> exitEvent;
+    };
+}
+
+#endif // __GPU_TLB_HH__
diff --git a/src/gpu-compute/hsa_code.hh b/src/gpu-compute/hsa_code.hh
new file mode 100644
index 000000000..9f358e23c
--- /dev/null
+++ b/src/gpu-compute/hsa_code.hh
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#ifndef __HSA_CODE_HH__
+#define __HSA_CODE_HH__
+
+#include <string>
+#include <vector>
+
+#include "arch/gpu_types.hh"
+#include "config/the_gpu_isa.hh"
+
+class HsaKernelInfo;
+
+/* @class HsaCode
+ * base code object for the set of HSA kernels associated
+ * with a single application. this class provides the common
+ * methods for creating, accessing, and storing information
+ * about kernel and variable symbols, symbol name, memory
+ * segment sizes, and instruction count, etc.
+ */
+
+class HsaCode
+{
+  public:
+    HsaCode(const std::string &name) : readonly_data(nullptr), funcarg_size(0),
+                                       _name(name)
+    {
+    }
+
+    // HSA memory segments a kernel may reference
+    enum class MemorySegment {
+        NONE,
+        FLAT,
+        GLOBAL,
+        READONLY,
+        KERNARG,
+        GROUP,
+        PRIVATE,
+        SPILL,
+        ARG,
+        EXTSPACE0
+    };
+
+    const std::string& name() const { return _name; }
+    int numInsts() const { return _insts.size(); }
+    std::vector<TheGpuISA::RawMachInst>* insts() { return &_insts; }
+
+    // store the pointer to this code object's read-only data segment;
+    // ownership is not taken here
+    void
+    setReadonlyData(uint8_t *_readonly_data)
+    {
+        readonly_data = _readonly_data;
+    }
+
+    // size in bytes of the given memory segment for this code object
+    virtual int getSize(MemorySegment segment) const = 0;
+    // fill in the kernel-info struct handed to the emulated HSA driver
+    virtual void generateHsaKernelInfo(HsaKernelInfo *hsaKernelInfo) const = 0;
+
+    uint8_t *readonly_data;
+    int funcarg_size;
+
+  protected:
+    // An array that stores instruction indices (0 through kernel size)
+    // for a kernel passed to code object constructor as an argument.
+    std::vector<TheGpuISA::RawMachInst> _insts;
+
+  private:
+    const std::string _name;
+};
+
+#endif // __HSA_CODE_HH__
diff --git a/src/gpu-compute/hsa_kernel_info.hh b/src/gpu-compute/hsa_kernel_info.hh
new file mode 100644
index 000000000..396913dac
--- /dev/null
+++ b/src/gpu-compute/hsa_kernel_info.hh
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#ifndef __HSA_KERNEL_INFO_HH__
+#define __HSA_KERNEL_INFO_HH__
+
+// This file defines the public interface between the HSA emulated
+// driver and application programs.
+
+#include <cstdint>
+
// ioctl-style request codes the application uses to query the emulated
// HSA driver (see HsaDriverSizes / HsaKernelInfo below for the payloads)
static const int HSA_GET_SIZES = 0x4801;
static const int HSA_GET_KINFO = 0x4802;
static const int HSA_GET_STRINGS = 0x4803;
static const int HSA_GET_CODE = 0x4804;
static const int HSA_GET_READONLY_DATA = 0x4805;
static const int HSA_GET_CU_CNT = 0x4806;
static const int HSA_GET_VSZ = 0x4807;
+
+// Return value (via buffer ptr) for HSA_GET_SIZES
struct HsaDriverSizes
{
    // number of kernels in the loaded code object
    uint32_t num_kernels;
    // bytes in the kernel-name string table
    uint32_t string_table_size;
    // bytes in the code array
    uint32_t code_size;
    // bytes of read-only data
    uint32_t readonly_size;
};
+
+// HSA_GET_KINFO returns an array of num_kernels of these structs
struct HsaKernelInfo
{
    // byte offset into string table
    uint32_t name_offs;
    // byte offset into code array
    uint32_t code_offs;
    // bytes of statically-allocated group (LDS) memory
    uint32_t static_lds_size;
    // bytes of private (work-item local) memory
    uint32_t private_mem_size;
    // bytes of spill memory
    uint32_t spill_mem_size;
    // Number of s registers
    uint32_t sRegCount;
    // Number of d registers
    uint32_t dRegCount;
    // Number of c registers
    uint32_t cRegCount;
};
+
+#endif // __HSA_KERNEL_INFO_HH__
diff --git a/src/gpu-compute/hsa_object.cc b/src/gpu-compute/hsa_object.cc
new file mode 100644
index 000000000..91dfb160e
--- /dev/null
+++ b/src/gpu-compute/hsa_object.cc
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#include "gpu-compute/hsa_object.hh"
+
+#include <fstream>
+
+#include "gpu-compute/brig_object.hh"
+
// stash the object's file name; readonlyData is populated later by the
// format-specific loader
HsaObject::HsaObject(const std::string &fname)
    : readonlyData(nullptr), filename(fname)
{
}
+
+HsaObject*
+HsaObject::createHsaObject(const std::string &fname)
+{
+ HsaObject *hsaObj = nullptr;
+ uint8_t *file_data = nullptr;
+ int file_length = 0;
+
+ std::ifstream code_file(fname, std::ifstream::ate | std::ifstream::in |
+ std::ifstream::binary);
+
+ assert(code_file.is_open());
+ assert(code_file.good());
+
+ file_length = code_file.tellg();
+ code_file.seekg(0, code_file.beg);
+ file_data = new uint8_t[file_length];
+ code_file.read((char*)file_data, file_length);
+ code_file.close();
+
+ for (const auto &tryFile : tryFileFuncs) {
+ if ((hsaObj = tryFile(fname, file_length, file_data))) {
+ return hsaObj;
+ }
+ }
+
+ delete[] file_data;
+ fatal("Unknown HSA object type for file: %s.\n", fname);
+
+ return nullptr;
+}
diff --git a/src/gpu-compute/hsa_object.hh b/src/gpu-compute/hsa_object.hh
new file mode 100644
index 000000000..1f08f5d80
--- /dev/null
+++ b/src/gpu-compute/hsa_object.hh
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#ifndef __HSA_OBJECT_HH__
+#define __HSA_OBJECT_HH__
+
+#include <functional>
+#include <string>
+#include <vector>
+
+class HsaCode;
+
+/* @class HsaObject
+ * base loader object for HSA kernels. this class provides
+ * the base method definitions for loading, storing, and
+ * accessing HSA kernel objects into the simulator.
+ */
+
+class HsaObject
+{
+ public:
+ HsaObject(const std::string &fileName);
+
+ static HsaObject* createHsaObject(const std::string &fname);
+ static std::vector<std::function<HsaObject*(const std::string&, int,
+ uint8_t*)>> tryFileFuncs;
+
+ virtual HsaCode* getKernel(const std::string &name) const = 0;
+ virtual HsaCode* getKernel(int i) const = 0;
+ virtual HsaCode* getFunction(const std::string &name) const = 0;
+ virtual int numKernels() const = 0;
+
+ const std::string& name() const { return filename; }
+
+ uint8_t *readonlyData;
+
+
+ protected:
+ const std::string filename;
+};
+
+#endif // __HSA_OBJECT_HH__
diff --git a/src/gpu-compute/hsail_code.cc b/src/gpu-compute/hsail_code.cc
new file mode 100644
index 000000000..b0ddf0161
--- /dev/null
+++ b/src/gpu-compute/hsail_code.cc
@@ -0,0 +1,453 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#include "gpu-compute/hsail_code.hh"
+
+#include "arch/gpu_types.hh"
+#include "arch/hsail/Brig.h"
+#include "arch/hsail/operand.hh"
+#include "config/the_gpu_isa.hh"
+#include "debug/BRIG.hh"
+#include "debug/HSAILObject.hh"
+#include "gpu-compute/brig_object.hh"
+#include "gpu-compute/gpu_static_inst.hh"
+#include "gpu-compute/kernel_cfg.hh"
+
+using namespace Brig;
+
+int getBrigDataTypeBytes(BrigType16_t t);
+
// placeholder construction: segment sizes are unknown (-1) until init()
// walks the BRIG directives for this kernel
HsailCode::HsailCode(const std::string &name_str)
    : HsaCode(name_str), private_size(-1), readonly_size(-1)
{
}
+
+void
+HsailCode::init(const BrigDirectiveExecutable *code_dir, const BrigObject *obj,
+ StorageMap *objStorageMap)
+{
+ storageMap = objStorageMap;
+
+ // set pointer so that decoding process can find this kernel context when
+ // needed
+ obj->currentCode = this;
+
+ if (code_dir->base.kind != BRIG_KIND_DIRECTIVE_FUNCTION &&
+ code_dir->base.kind != BRIG_KIND_DIRECTIVE_KERNEL) {
+ fatal("unexpected directive kind %d inside kernel/function init\n",
+ code_dir->base.kind);
+ }
+
+ DPRINTF(HSAILObject, "Initializing code, first code block entry is: %d\n",
+ code_dir->firstCodeBlockEntry);
+
+ // clear these static vars so we can properly track the max index
+ // for this kernel
+ SRegOperand::maxRegIdx = 0;
+ DRegOperand::maxRegIdx = 0;
+ CRegOperand::maxRegIdx = 0;
+ setPrivateSize(0);
+
+ const BrigBase *entryPtr = brigNext((BrigBase*)code_dir);
+ const BrigBase *endPtr =
+ obj->getCodeSectionEntry(code_dir->nextModuleEntry);
+
+ int inst_idx = 0;
+ std::vector<GPUStaticInst*> instructions;
+ int funcarg_size_scope = 0;
+
+ // walk through instructions in code section and directives in
+ // directive section in parallel, processing directives that apply
+ // when we reach the relevant code point.
+ while (entryPtr < endPtr) {
+ switch (entryPtr->kind) {
+ case BRIG_KIND_DIRECTIVE_VARIABLE:
+ {
+ const BrigDirectiveVariable *sym =
+ (const BrigDirectiveVariable*)entryPtr;
+
+ DPRINTF(HSAILObject,"Initializing code, directive is "
+ "kind_variable, symbol is: %s\n",
+ obj->getString(sym->name));
+
+ StorageElement *se = storageMap->addSymbol(sym, obj);
+
+ if (sym->segment == BRIG_SEGMENT_PRIVATE) {
+ setPrivateSize(se->size);
+ } else { // spill
+ funcarg_size_scope += se->size;
+ }
+ }
+ break;
+
+ case BRIG_KIND_DIRECTIVE_LABEL:
+ {
+ const BrigDirectiveLabel *lbl =
+ (const BrigDirectiveLabel*)entryPtr;
+
+ DPRINTF(HSAILObject,"Initializing code, directive is "
+ "kind_label, label is: %s \n",
+ obj->getString(lbl->name));
+
+ labelMap.addLabel(lbl, inst_idx, obj);
+ }
+ break;
+
+ case BRIG_KIND_DIRECTIVE_PRAGMA:
+ {
+ DPRINTF(HSAILObject, "Initializing code, directive "
+ "is kind_pragma\n");
+ }
+ break;
+
+ case BRIG_KIND_DIRECTIVE_COMMENT:
+ {
+ DPRINTF(HSAILObject, "Initializing code, directive is "
+ "kind_comment\n");
+ }
+ break;
+
+ case BRIG_KIND_DIRECTIVE_ARG_BLOCK_START:
+ {
+ DPRINTF(HSAILObject, "Initializing code, directive is "
+ "kind_arg_block_start\n");
+
+ storageMap->resetOffset(BRIG_SEGMENT_ARG);
+ funcarg_size_scope = 0;
+ }
+ break;
+
+ case BRIG_KIND_DIRECTIVE_ARG_BLOCK_END:
+ {
+ DPRINTF(HSAILObject, "Initializing code, directive is "
+ "kind_arg_block_end\n");
+
+ funcarg_size = funcarg_size < funcarg_size_scope ?
+ funcarg_size_scope : funcarg_size;
+ }
+ break;
+
+ case BRIG_KIND_DIRECTIVE_END:
+ DPRINTF(HSAILObject, "Initializing code, dircetive is "
+ "kind_end\n");
+
+ break;
+
+ default:
+ if (entryPtr->kind >= BRIG_KIND_INST_BEGIN &&
+ entryPtr->kind <= BRIG_KIND_INST_END) {
+
+ BrigInstBase *instPtr = (BrigInstBase*)entryPtr;
+ TheGpuISA::MachInst machInst = { instPtr, obj };
+ GPUStaticInst *iptr = decoder.decode(machInst);
+
+ if (iptr) {
+ DPRINTF(HSAILObject, "Initializing code, processing inst "
+ "#%d idx %d: OPCODE=%d\n",
+ inst_idx, _insts.size(), instPtr->opcode);
+
+ TheGpuISA::RawMachInst inst_num = decoder.saveInst(iptr);
+ iptr->instNum(inst_idx);
+ _insts.push_back(inst_num);
+ instructions.push_back(iptr);
+ }
+ ++inst_idx;
+ } else if (entryPtr->kind >= BRIG_KIND_OPERAND_BEGIN &&
+ entryPtr->kind < BRIG_KIND_OPERAND_END) {
+ warn("unexpected operand entry in code segment\n");
+ } else {
+ // there are surely some more cases we will need to handle,
+ // but we'll deal with them as we find them.
+ fatal("unexpected directive kind %d inside kernel scope\n",
+ entryPtr->kind);
+ }
+ }
+
+ entryPtr = brigNext(entryPtr);
+ }
+
+ // compute Control Flow Graph for current kernel
+ ControlFlowInfo::assignImmediatePostDominators(instructions);
+
+ max_sreg = SRegOperand::maxRegIdx;
+ max_dreg = DRegOperand::maxRegIdx;
+ max_creg = CRegOperand::maxRegIdx;
+
+ obj->currentCode = nullptr;
+}
+
// full construction: immediately walk the BRIG directives for this
// kernel/function via init()
HsailCode::HsailCode(const std::string &name_str,
                     const BrigDirectiveExecutable *code_dir,
                     const BrigObject *obj, StorageMap *objStorageMap)
    : HsaCode(name_str), private_size(-1), readonly_size(-1)
{
    init(code_dir, obj, objStorageMap);
}
+
+void
+LabelMap::addLabel(const Brig::BrigDirectiveLabel *lblDir, int inst_index,
+ const BrigObject *obj)
+{
+ std::string lbl_name = obj->getString(lblDir->name);
+ Label &lbl = map[lbl_name];
+
+ if (lbl.defined()) {
+ fatal("Attempt to redefine existing label %s\n", lbl_name);
+ }
+
+ lbl.define(lbl_name, inst_index);
+ DPRINTF(HSAILObject, "label %s = %d\n", lbl_name, inst_index);
+}
+
+Label*
+LabelMap::refLabel(const Brig::BrigDirectiveLabel *lblDir,
+ const BrigObject *obj)
+{
+ std::string name = obj->getString(lblDir->name);
+ Label &lbl = map[name];
+ lbl.checkName(name);
+
+ return &lbl;
+}
+
// Return the size in bytes of a scalar BRIG data type. BRIG_TYPE_B1
// (1-bit) deliberately falls through to the fatal default — it has no
// byte size here.
int
getBrigDataTypeBytes(BrigType16_t t)
{
    switch (t) {
      case BRIG_TYPE_S8:
      case BRIG_TYPE_U8:
      case BRIG_TYPE_B8:
        return 1;

      case BRIG_TYPE_S16:
      case BRIG_TYPE_U16:
      case BRIG_TYPE_B16:
      case BRIG_TYPE_F16:
        return 2;

      case BRIG_TYPE_S32:
      case BRIG_TYPE_U32:
      case BRIG_TYPE_B32:
      case BRIG_TYPE_F32:
        return 4;

      case BRIG_TYPE_S64:
      case BRIG_TYPE_U64:
      case BRIG_TYPE_B64:
      case BRIG_TYPE_F64:
        return 8;

      case BRIG_TYPE_B1:
        // fallthrough: unsupported

      default:
        fatal("unhandled symbol data type %d", t);
        return 0;
    }
}
+
// Allocate storage for a BRIG variable symbol within this segment:
// compute its size (element size * element count for arrays), align the
// running offset to the element size, and index the new element by name,
// address range, and directive pointer.
StorageElement*
StorageSpace::addSymbol(const BrigDirectiveVariable *sym,
                        const BrigObject *obj)
{
    const char *sym_name = obj->getString(sym->name);
    uint64_t size = 0;
    uint64_t offset = 0;

    if (sym->type & BRIG_TYPE_ARRAY) {
        size = getBrigDataTypeBytes(sym->type & ~BRIG_TYPE_ARRAY);
        // dim is split into hi/lo 32-bit halves in the BRIG encoding
        size *= (((uint64_t)sym->dim.hi) << 32 | (uint64_t)sym->dim.lo);

        offset = roundUp(nextOffset, getBrigDataTypeBytes(sym->type &
                         ~BRIG_TYPE_ARRAY));
    } else {
        size = getBrigDataTypeBytes(sym->type);
        offset = roundUp(nextOffset, getBrigDataTypeBytes(sym->type));
    }

    nextOffset = offset + size;

    DPRINTF(HSAILObject, "Adding %s SYMBOL %s size %d offset 0x%x, init: %d\n",
            segmentNames[segment], sym_name, size, offset, sym->init);

    // NOTE(review): assumes size > 0; a zero-sized symbol would produce an
    // inverted AddrRange(offset, offset - 1) — confirm this cannot occur
    StorageElement* se = new StorageElement(sym_name, offset, size, sym);
    elements.push_back(se);
    elements_by_addr.insert(AddrRange(offset, offset + size - 1), se);
    elements_by_brigptr[sym] = se;

    return se;
}
+
+StorageElement*
+StorageSpace::findSymbol(std::string name)
+{
+ for (auto it : elements) {
+ if (it->name == name) {
+ return it;
+ }
+ }
+
+ return nullptr;
+}
+
+StorageElement*
+StorageSpace::findSymbol(uint64_t addr)
+{
+ assert(elements_by_addr.size() > 0);
+
+ auto se = elements_by_addr.find(addr);
+
+ if (se == elements_by_addr.end()) {
+ return nullptr;
+ } else {
+ return se->second;
+ }
+}
+
+StorageElement*
+StorageSpace::findSymbol(const BrigDirectiveVariable *brigptr)
+{
+ assert(elements_by_brigptr.size() > 0);
+
+ auto se = elements_by_brigptr.find(brigptr);
+
+ if (se == elements_by_brigptr.end()) {
+ return nullptr;
+ } else {
+ return se->second;
+ }
+}
+
// Eagerly create one StorageSpace per BRIG segment; outerScope links a
// nested (e.g. function-local) map to its enclosing scope's map.
StorageMap::StorageMap(StorageMap *outerScope)
    : outerScopeMap(outerScope)
{
    for (int i = 0; i < NumSegments; ++i)
        space[i] = new StorageSpace((BrigSegment)i);
}
+
+StorageElement*
+StorageMap::addSymbol(const BrigDirectiveVariable *sym, const BrigObject *obj)
+{
+ BrigSegment8_t segment = sym->segment;
+
+ assert(segment >= Brig::BRIG_SEGMENT_FLAT);
+ assert(segment < NumSegments);
+
+ return space[segment]->addSymbol(sym, obj);
+}
+
+int
+StorageMap::getSize(Brig::BrigSegment segment)
+{
+ assert(segment > Brig::BRIG_SEGMENT_GLOBAL);
+ assert(segment < NumSegments);
+
+ if (segment != Brig::BRIG_SEGMENT_GROUP &&
+ segment != Brig::BRIG_SEGMENT_READONLY) {
+ return space[segment]->getSize();
+ } else {
+ int ret = space[segment]->getSize();
+
+ if (outerScopeMap) {
+ ret += outerScopeMap->getSize(segment);
+ }
+
+ return ret;
+ }
+}
+
// Restart allocation at offset 0 for the given segment; used when a new
// arg block scope opens (see HsailCode::init).
void
StorageMap::resetOffset(Brig::BrigSegment segment)
{
    space[segment]->resetOffset();
}
+
+StorageElement*
+StorageMap::findSymbol(BrigSegment segment, std::string name)
+{
+ StorageElement *se = space[segment]->findSymbol(name);
+
+ if (se)
+ return se;
+
+ if (outerScopeMap)
+ return outerScopeMap->findSymbol(segment, name);
+
+ return nullptr;
+}
+
+StorageElement*
+StorageMap::findSymbol(Brig::BrigSegment segment, uint64_t addr)
+{
+ StorageSpace *sp = space[segment];
+
+ if (!sp) {
+ // there is no memory in segment?
+ return nullptr;
+ }
+
+ StorageElement *se = sp->findSymbol(addr);
+
+ if (se)
+ return se;
+
+ if (outerScopeMap)
+ return outerScopeMap->findSymbol(segment, addr);
+
+ return nullptr;
+
+}
+
+StorageElement*
+StorageMap::findSymbol(Brig::BrigSegment segment,
+ const BrigDirectiveVariable *brigptr)
+{
+ StorageSpace *sp = space[segment];
+
+ if (!sp) {
+ // there is no memory in segment?
+ return nullptr;
+ }
+
+ StorageElement *se = sp->findSymbol(brigptr);
+
+ if (se)
+ return se;
+
+ if (outerScopeMap)
+ return outerScopeMap->findSymbol(segment, brigptr);
+
+ return nullptr;
+
+}
diff --git a/src/gpu-compute/hsail_code.hh b/src/gpu-compute/hsail_code.hh
new file mode 100644
index 000000000..d9fbcc577
--- /dev/null
+++ b/src/gpu-compute/hsail_code.hh
@@ -0,0 +1,447 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#ifndef __HSAIL_CODE_HH__
+#define __HSAIL_CODE_HH__
+
+#include <cassert>
+#include <list>
+#include <map>
+#include <string>
+#include <vector>
+
+#include "arch/gpu_decoder.hh"
+#include "arch/hsail/Brig.h"
+#include "base/addr_range_map.hh"
+#include "base/intmath.hh"
+#include "config/the_gpu_isa.hh"
+#include "gpu-compute/hsa_code.hh"
+#include "gpu-compute/hsa_kernel_info.hh"
+#include "gpu-compute/misc.hh"
+
+class BrigObject;
+class GPUStaticInst;
+
// Count the set bits among the low sz bits of src.
inline int
popcount(uint64_t src, int sz)
{
    int cnt = 0;

    for (int i = 0; i < sz; ++i, src >>= 1)
        cnt += src & 1;

    return cnt;
}
+
// Index of the lowest set bit among the low sz bits of src; returns sz
// when no bit in that range is set.
inline int
firstbit(uint64_t src, int sz)
{
    int pos = 0;

    while (pos < sz && !(src & 1)) {
        src >>= 1;
        ++pos;
    }

    return pos;
}
+
// Index of the highest set bit among the low sz bits of src; -1 when no
// bit in that range is set.
inline int
lastbit(uint64_t src, int sz)
{
    int highest = -1;

    for (int pos = 0; pos < sz; ++pos, src >>= 1) {
        if (src & 1)
            highest = pos;
    }

    return highest;
}
+
// Within the low (sz - 1) bits of src, return the index of the last bit
// that differs from the sign bit (bit sz - 1), or -1 if none differs.
inline int
signbit(uint64_t src, int sz)
{
    int i0 = -1;

    // BUGFIX: probe the sign bit with a 64-bit constant. The original
    // '1 << (sz - 1)' shifts a 32-bit int, which is undefined behavior
    // (and wrong) for sz > 31 — e.g. sz == 64.
    if (src & (1ULL << (sz - 1))) {
        // negative value: find the last clear bit below the sign bit
        for (int i = 0; i < sz - 1; ++i) {
            if (!(src & 1))
                i0 = i;
            src >>= 1;
        }
    } else {
        // non-negative value: find the last set bit below the sign bit
        for (int i = 0; i < sz - 1; ++i) {
            if (src & 1)
                i0 = i;
            src >>= 1;
        }
    }

    return i0;
}
+
// Reverse the order of the low sz bits of src; bits above sz are dropped.
inline uint64_t
bitrev(uint64_t src, int sz)
{
    uint64_t out = 0;

    // peel bits off the low end of src and push them into the low end of
    // out, which reverses their order
    for (int i = 0; i < sz; ++i, src >>= 1)
        out = (out << 1) | (src & 1);

    return out;
}
+
// mul_hi(32-bit): bits [63:32] of the full 64-bit product.
inline uint64_t
mul_hi(uint32_t a, uint32_t b)
{
    return ((uint64_t)a * (uint64_t)b) >> 32;
}

inline uint64_t
mul_hi(int32_t a, int32_t b)
{
    return ((int64_t)a * (int64_t)b) >> 32;
}

// NOTE(review): the 64-bit overloads also shift by only 32, so they do
// NOT return the high 64 bits of the 128-bit product (and the signed
// multiply can overflow). This mirrors the behavior callers currently
// get — confirm before changing.
inline uint64_t
mul_hi(uint64_t a, uint64_t b)
{
    return ((uint64_t)a * (uint64_t)b) >> 32;
}

inline uint64_t
mul_hi(int64_t a, int64_t b)
{
    return ((int64_t)a * (int64_t)b) >> 32;
}

// placeholder: a "high half" is not meaningful for doubles
inline uint64_t
mul_hi(double a, double b)
{
    return 0;
}
+
// A named jump target. A Label may be referenced before its instruction
// index is known (value stays -1 until define() runs); checkName()
// records the name on first use and verifies it on every later use.
class Label
{
  public:
    std::string name;
    int value;

    Label() : value(-1)
    {
    }

    // defined once the instruction index has been assigned
    bool defined() { return value != -1; }

    void
    checkName(std::string &_name)
    {
        if (!name.empty()) {
            // every reference must agree with the recorded name
            assert(name == _name);
            return;
        }
        name = _name;
    }

    void
    define(std::string &_name, int _value)
    {
        assert(!defined());
        assert(_value != -1);
        value = _value;
        checkName(_name);
    }

    int
    get()
    {
        assert(defined());
        return value;
    }
};
+
// Name -> Label table for one kernel; labels are created on first
// reference or definition (see hsail_code.cc for the implementations).
class LabelMap
{
    std::map<std::string, Label> map;

  public:
    LabelMap() { }

    // bind lbl's name to inst_index; fatal on redefinition
    void addLabel(const Brig::BrigDirectiveLabel *lbl, int inst_index,
                  const BrigObject *obj);

    // fetch (creating if needed) the Label for lbl; may be undefined yet
    Label *refLabel(const Brig::BrigDirectiveLabel *lbl,
                    const BrigObject *obj);
};
+
// NOTE(review): storage spaces are indexed 0..NumSegments-1, which
// excludes BRIG_SEGMENT_AMD_GCN itself even though HsailCode::getSize
// can map EXTSPACE0 to it — confirm that is intended.
const int NumSegments = Brig::BRIG_SEGMENT_AMD_GCN;

// human-readable segment names, indexed by BrigSegment (defined elsewhere)
extern const char *segmentNames[];
+
// One allocated variable symbol: its name, byte offset within its
// segment, byte size, and the BRIG directive it came from.
class StorageElement
{
  public:
    std::string name;
    uint64_t offset;

    uint64_t size;
    const Brig::BrigDirectiveVariable *brigSymbol;
    StorageElement(const char *_name, uint64_t _offset, int _size,
                   const Brig::BrigDirectiveVariable *sym)
        : name(_name), offset(_offset), size(_size), brigSymbol(sym)
    {
    }
};
+
// Storage allocator and symbol index for one BRIG segment. Symbols are
// laid out sequentially (nextOffset) and indexed three ways: by name,
// by address range, and by originating BRIG directive.
class StorageSpace
{
    typedef std::map<const Brig::BrigDirectiveVariable*, StorageElement*>
        DirVarToSE_map;

    std::list<StorageElement*> elements;
    AddrRangeMap<StorageElement*> elements_by_addr;
    DirVarToSE_map elements_by_brigptr;

    // next free byte offset in this segment
    uint64_t nextOffset;
    Brig::BrigSegment segment;

  public:
    StorageSpace(Brig::BrigSegment _class)
        : nextOffset(0), segment(_class)
    {
    }

    StorageElement *addSymbol(const Brig::BrigDirectiveVariable *sym,
                              const BrigObject *obj);

    StorageElement* findSymbol(std::string name);
    StorageElement* findSymbol(uint64_t addr);
    StorageElement* findSymbol(const Brig::BrigDirectiveVariable *brigptr);

    // total bytes allocated so far in this segment
    int getSize() { return nextOffset; }
    void resetOffset() { nextOffset = 0; }
};
+
// One StorageSpace per segment, optionally chained to an enclosing
// scope's map so lookups and (group/readonly) sizes cascade outward.
class StorageMap
{
    StorageMap *outerScopeMap;
    StorageSpace *space[NumSegments];

  public:
    StorageMap(StorageMap *outerScope = nullptr);

    StorageElement *addSymbol(const Brig::BrigDirectiveVariable *sym,
                              const BrigObject *obj);

    StorageElement* findSymbol(Brig::BrigSegment segment, std::string name);
    StorageElement* findSymbol(Brig::BrigSegment segment, uint64_t addr);

    StorageElement* findSymbol(Brig::BrigSegment segment,
                               const Brig::BrigDirectiveVariable *brigptr);

    // overloaded version to avoid casting
    StorageElement*
    findSymbol(Brig::BrigSegment8_t segment, std::string name)
    {
        return findSymbol((Brig::BrigSegment)segment, name);
    }

    int getSize(Brig::BrigSegment segment);
    void resetOffset(Brig::BrigSegment segment);
};
+
// ISA-agnostic scalar base types (bit, unsigned, signed, float of each
// width), used when a BRIG type tag is inconvenient.
typedef enum
{
    BT_DEFAULT,
    BT_B8,
    BT_U8,
    BT_U16,
    BT_U32,
    BT_U64,
    BT_S8,
    BT_S16,
    BT_S32,
    BT_S64,
    BT_F16,
    BT_F32,
    BT_F64,
    BT_NULL
} base_type_e;
+
+/* @class HsailCode
+ * the HsailCode class is used to store information
+ * about HSA kernels stored in the BRIG format. it holds
+ * all information about a kernel, function, or variable
+ * symbol and provides methods for accessing that
+ * information.
+ */
+
class HsailCode final : public HsaCode
{
  public:
    TheGpuISA::Decoder decoder;

    // symbol storage for this kernel's scope (owned by the BrigObject)
    StorageMap *storageMap;
    LabelMap labelMap;
    uint32_t kernarg_start;
    uint32_t kernarg_end;
    // -1 until set by init()/setPrivateSize()
    int32_t private_size;

    int32_t readonly_size;

    // We track the maximum register index used for each register
    // class when we load the code so we can size the register files
    // appropriately (i.e., one more than the max index).
    uint32_t max_creg; // maximum c-register index
    uint32_t max_sreg; // maximum s-register index
    uint32_t max_dreg; // maximum d-register index

    HsailCode(const std::string &name_str,
              const Brig::BrigDirectiveExecutable *code_dir,
              const BrigObject *obj,
              StorageMap *objStorageMap);

    // this version is used to create a placeholder when
    // we encounter a kernel-related directive before the
    // kernel itself
    HsailCode(const std::string &name_str);

    // walk the BRIG directives/instructions for this kernel (see .cc)
    void init(const Brig::BrigDirectiveExecutable *code_dir,
              const BrigObject *obj, StorageMap *objStorageMap);

    // populate the driver-visible kernel descriptor from the register
    // maxima and segment sizes gathered by init()
    void
    generateHsaKernelInfo(HsaKernelInfo *hsaKernelInfo) const
    {
        hsaKernelInfo->sRegCount = max_sreg + 1;
        hsaKernelInfo->dRegCount = max_dreg + 1;
        hsaKernelInfo->cRegCount = max_creg + 1;

        hsaKernelInfo->static_lds_size = getSize(Brig::BRIG_SEGMENT_GROUP);

        // private/spill sizes are rounded up to an 8-byte multiple
        hsaKernelInfo->private_mem_size =
            roundUp(getSize(Brig::BRIG_SEGMENT_PRIVATE), 8);

        hsaKernelInfo->spill_mem_size =
            roundUp(getSize(Brig::BRIG_SEGMENT_SPILL), 8);
    }

    // map the ISA-agnostic MemorySegment onto the corresponding BRIG
    // segment and return that segment's size
    int
    getSize(MemorySegment segment) const
    {
        Brig::BrigSegment brigSeg;

        switch (segment) {
          case MemorySegment::NONE:
            brigSeg = Brig::BRIG_SEGMENT_NONE;
            break;
          case MemorySegment::FLAT:
            brigSeg = Brig::BRIG_SEGMENT_FLAT;
            break;
          case MemorySegment::GLOBAL:
            brigSeg = Brig::BRIG_SEGMENT_GLOBAL;
            break;
          case MemorySegment::READONLY:
            brigSeg = Brig::BRIG_SEGMENT_READONLY;
            break;
          case MemorySegment::KERNARG:
            brigSeg = Brig::BRIG_SEGMENT_KERNARG;
            break;
          case MemorySegment::GROUP:
            brigSeg = Brig::BRIG_SEGMENT_GROUP;
            break;
          case MemorySegment::PRIVATE:
            brigSeg = Brig::BRIG_SEGMENT_PRIVATE;
            break;
          case MemorySegment::SPILL:
            brigSeg = Brig::BRIG_SEGMENT_SPILL;
            break;
          case MemorySegment::ARG:
            brigSeg = Brig::BRIG_SEGMENT_ARG;
            break;
          case MemorySegment::EXTSPACE0:
            brigSeg = Brig::BRIG_SEGMENT_AMD_GCN;
            break;
          default:
            fatal("Unknown BrigSegment type.\n");
        }

        return getSize(brigSeg);
    }

  private:
    // private storage is tracked directly (private_size), everything else
    // is delegated to the storage map
    int
    getSize(Brig::BrigSegment segment) const
    {
        if (segment == Brig::BRIG_SEGMENT_PRIVATE) {
            // with the code generated by new HSA compiler the assertion
            // does not hold anymore..
            //assert(private_size != -1);
            return private_size;
        } else {
            return storageMap->getSize(segment);
        }
    }

  public:
    StorageElement*
    findSymbol(Brig::BrigSegment segment, uint64_t addr)
    {
        return storageMap->findSymbol(segment, addr);
    }

    void
    setPrivateSize(int32_t _private_size)
    {
        private_size = _private_size;
    }

    Label*
    refLabel(const Brig::BrigDirectiveLabel *lbl, const BrigObject *obj)
    {
        return labelMap.refLabel(lbl, obj);
    }
};
+
+#endif // __HSAIL_CODE_HH__
diff --git a/src/gpu-compute/kernel_cfg.cc b/src/gpu-compute/kernel_cfg.cc
new file mode 100644
index 000000000..7e0e10912
--- /dev/null
+++ b/src/gpu-compute/kernel_cfg.cc
@@ -0,0 +1,296 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#include "gpu-compute/kernel_cfg.hh"
+
+#include <algorithm>
+#include <cassert>
+#include <cstdio>
+#include <cstring>
+#include <iostream>
+#include <iterator>
+#include <map>
+#include <string>
+
+#include "gpu-compute/gpu_static_inst.hh"
+
+/**
+ * Build a CFG over the kernel's instruction list and annotate each
+ * branch with its immediate post-dominator.  The ControlFlowInfo object
+ * is temporary; only the annotations stored on the instructions survive.
+ */
+void
+ControlFlowInfo::assignImmediatePostDominators(
+    const std::vector<GPUStaticInst*>& instructions)
+{
+    ControlFlowInfo cfg(instructions);
+    cfg.findImmediatePostDominators();
+}
+
+
+// Construct the CFG: split the instruction stream into basic blocks,
+// then wire up their successor edges.
+ControlFlowInfo::ControlFlowInfo(const std::vector<GPUStaticInst*>& insts) :
+    instructions(insts)
+{
+    createBasicBlocks();
+    connectBasicBlocks();
+}
+
+/**
+ * Return the basic block whose instruction range covers inst_num, or
+ * nullptr if no block does (e.g. a number one past the last
+ * instruction).
+ */
+BasicBlock*
+ControlFlowInfo::basicBlock(int inst_num) const {
+    for (auto& block: basicBlocks) {
+        // The synthetic exit block has a null firstInstruction (see
+        // createBasicBlocks()); skip it so an out-of-range inst_num
+        // cannot dereference a null pointer here.
+        if (block->isExit()) {
+            continue;
+        }
+        int first_block_id = block->firstInstruction->instNum();
+        if (inst_num >= first_block_id &&
+            inst_num < first_block_id + block->size) {
+            return block.get();
+        }
+    }
+    return nullptr;
+}
+
+
+/**
+ * Return the last real instruction of a block, or nullptr for the
+ * synthetic exit block, which holds no instructions.
+ */
+GPUStaticInst*
+ControlFlowInfo::lastInstruction(const BasicBlock* block) const
+{
+    if (block->isExit()) {
+        return nullptr;
+    }
+
+    const auto last_idx =
+        block->firstInstruction->instNum() + block->size - 1;
+    return instructions.at(last_idx);
+}
+
+/**
+ * Return the block containing the immediate post-dominator instruction
+ * of this block's last instruction (nullptr for the exit block).  Only
+ * meaningful after findImmediatePostDominators() has run.
+ */
+BasicBlock*
+ControlFlowInfo::postDominator(const BasicBlock* block) const
+{
+    if (block->isExit()) {
+        return nullptr;
+    }
+    return basicBlock(lastInstruction(block)->ipdInstNum());
+}
+
+/**
+ * Partition the instruction stream into basic blocks.  A leader is the
+ * first instruction, any branch target, or the instruction following a
+ * branch; each leader starts a block.  A final empty "exit" block is
+ * appended as the sink for return instructions.
+ */
+void
+ControlFlowInfo::createBasicBlocks()
+{
+    assert(!instructions.empty());
+    std::set<int> leaders;
+    // first instruction is a leader
+    leaders.insert(0);
+    // Scan from instruction 0 (not 1) so a branch in the very first
+    // slot still contributes its target and fall-through as leaders.
+    for (int i = 0; i < instructions.size(); i++) {
+        GPUStaticInst* instruction = instructions[i];
+        if (instruction->o_type == Enums::OT_BRANCH) {
+            const int target_pc = instruction->getTargetPc();
+            leaders.insert(target_pc);
+            leaders.insert(i + 1);
+        }
+    }
+
+    size_t block_size = 0;
+    for (int i = 0; i < instructions.size(); i++) {
+        if (leaders.find(i) != leaders.end()) {
+            uint32_t id = basicBlocks.size();
+            if (id > 0) {
+                // close out the previous block
+                basicBlocks.back()->size = block_size;
+            }
+            block_size = 0;
+            basicBlocks.emplace_back(new BasicBlock(id, instructions[i]));
+        }
+        block_size++;
+    }
+    basicBlocks.back()->size = block_size;
+    // exit basic block (empty; firstInstruction stays nullptr)
+    basicBlocks.emplace_back(new BasicBlock(basicBlocks.size(), nullptr));
+}
+
+/**
+ * Add successor edges between basic blocks: the exit block for blocks
+ * ending in a return, branch targets for blocks ending in a branch, and
+ * the fall-through block except after unconditional jumps.
+ */
+void
+ControlFlowInfo::connectBasicBlocks()
+{
+    BasicBlock* exit_bb = basicBlocks.back().get();
+    for (auto& bb : basicBlocks) {
+        // the exit block is always last and has no outgoing edges
+        if (bb->isExit()) {
+            break;
+        }
+        GPUStaticInst* last = lastInstruction(bb.get());
+        if (last->o_type == Enums::OT_RET) {
+            bb->successorIds.insert(exit_bb->id);
+            // A return terminates only this block; keep connecting the
+            // remaining blocks.  (The previous 'break' left every block
+            // after a mid-kernel return without successors, corrupting
+            // the post-dominator computation.)
+            continue;
+        }
+        if (last->o_type == Enums::OT_BRANCH) {
+            const uint32_t target_pc = last->getTargetPc();
+            BasicBlock* target_bb = basicBlock(target_pc);
+            bb->successorIds.insert(target_bb->id);
+        }
+
+        // Unconditional jump instructions have a unique successor
+        if (!last->unconditionalJumpInstruction()) {
+            BasicBlock* next_bb = basicBlock(last->instNum() + 1);
+            bb->successorIds.insert(next_bb->id);
+        }
+    }
+}
+
+
+// In-place set intersection: erase from 'a' every element not in 'b'.
+static void
+intersect(std::set<uint32_t>& a, const std::set<uint32_t>& b)
+{
+    for (auto it = a.begin(); it != a.end(); ) {
+        if (b.count(*it)) {
+            ++it;
+        } else {
+            // erase() returns the iterator past the removed element
+            it = a.erase(it);
+        }
+    }
+}
+
+
+/**
+ * Iterative dataflow computation of the post-dominator sets:
+ *   PD(exit) = {exit}
+ *   PD(b)    = {b} union (intersection over successors s of PD(s))
+ * Non-exit blocks start with the full set and are refined, in reverse
+ * id order, until a fixed point is reached.
+ */
+void
+ControlFlowInfo::findPostDominators()
+{
+    // the only postdominator of the exit block is itself
+    basicBlocks.back()->postDominatorIds.insert(basicBlocks.back()->id);
+    //copy all basic blocks to all postdominator lists except for exit block
+    for (auto& block : basicBlocks) {
+        if (!block->isExit()) {
+            for (uint32_t i = 0; i < basicBlocks.size(); i++) {
+                block->postDominatorIds.insert(i);
+            }
+        }
+    }
+
+    bool change = true;
+    while (change) {
+        change = false;
+        // skip the exit block (last entry); visit the rest backwards
+        for (int h = basicBlocks.size() - 2; h >= 0; --h) {
+            size_t num_postdominators =
+                basicBlocks[h]->postDominatorIds.size();
+            for (int s : basicBlocks[h]->successorIds) {
+                intersect(basicBlocks[h]->postDominatorIds,
+                    basicBlocks[s]->postDominatorIds);
+            }
+            basicBlocks[h]->postDominatorIds.insert(h);
+            // converged when no set shrank in a full pass
+            change |= (num_postdominators
+                != basicBlocks[h]->postDominatorIds.size());
+        }
+    }
+}
+
+
+// In-place set difference: remove from 'a' every member of 'b' except
+// 'exception', which is always retained.
+static void
+setDifference(std::set<uint32_t>&a,
+    const std::set<uint32_t>& b, uint32_t exception)
+{
+    for (uint32_t elem : b) {
+        if (elem == exception) {
+            continue;
+        }
+        a.erase(elem);
+    }
+}
+
+/**
+ * For the last instruction of each block, record the instruction number
+ * of its immediate post-dominator: the unique strict post-dominator
+ * that does not post-dominate any other strict post-dominator of the
+ * block.
+ */
+void
+ControlFlowInfo::findImmediatePostDominators()
+{
+    assert(basicBlocks.size() > 1); // Entry and exit blocks must be present
+
+    findPostDominators();
+
+    for (auto& basicBlock : basicBlocks) {
+        if (basicBlock->isExit()) {
+            continue;
+        }
+        // start from all strict post-dominators and strip every one
+        // that post-dominates another candidate; exactly one (the
+        // immediate post-dominator) must remain
+        std::set<uint32_t> candidates = basicBlock->postDominatorIds;
+        candidates.erase(basicBlock->id);
+        for (uint32_t postDominatorId : basicBlock->postDominatorIds) {
+            if (postDominatorId != basicBlock->id) {
+                setDifference(candidates,
+                    basicBlocks[postDominatorId]->postDominatorIds,
+                    postDominatorId);
+            }
+        }
+        assert(candidates.size() == 1);
+        GPUStaticInst* last_instruction = lastInstruction(basicBlock.get());
+        BasicBlock* ipd_block = basicBlocks[*(candidates.begin())].get();
+        if (!ipd_block->isExit()) {
+            GPUStaticInst* ipd_first_inst = ipd_block->firstInstruction;
+            last_instruction->ipdInstNum(ipd_first_inst->instNum());
+        } else {
+            // reconvergence at the exit: point one past this block's
+            // last instruction
+            last_instruction->ipdInstNum(last_instruction->instNum() + 1);
+        }
+    }
+}
+
+/** Debug dump: the full post-dominator set of every block. */
+void
+ControlFlowInfo::printPostDominators() const
+{
+    for (const auto& block : basicBlocks) {
+        std::cout << "PD(" << block->id << ") = {";
+        for (uint32_t pd_id : block->postDominatorIds) {
+            std::cout << pd_id << ", ";
+        }
+        std::cout << "}" << std::endl;
+    }
+}
+
+/** Debug dump: one "IPD(block) = block" entry per non-exit block. */
+void
+ControlFlowInfo::printImmediatePostDominators() const
+{
+    for (const auto& block : basicBlocks) {
+        if (block->isExit()) {
+            continue;
+        }
+        std::cout << "IPD(" << block->id << ") = ";
+        std::cout << postDominator(block.get())->id << ", ";
+    }
+    std::cout << std::endl;
+}
+/** Debug dump: each instruction with its block id and disassembly. */
+void
+ControlFlowInfo::printBasicBlocks() const
+{
+    for (GPUStaticInst* inst : instructions) {
+        int inst_num = inst->instNum();
+        std::cout << inst_num << " [" << basicBlock(inst_num)->id
+            << "]: " << inst->disassemble();
+        if (inst->o_type == Enums::OT_BRANCH) {
+            std::cout << ", PC = " << inst->getTargetPc();
+        }
+        std::cout << std::endl;
+    }
+}
+
+/** Debug dump: the CFG edges in Graphviz dot format. */
+void
+ControlFlowInfo::printBasicBlockDot() const
+{
+    printf("digraph {\n");
+    for (const auto& bb : basicBlocks) {
+        printf("\t");
+        for (uint32_t succ : bb->successorIds) {
+            printf("%d -> %d; ", bb->id, succ);
+        }
+        printf("\n");
+    }
+    printf("}\n");
+}
diff --git a/src/gpu-compute/kernel_cfg.hh b/src/gpu-compute/kernel_cfg.hh
new file mode 100644
index 000000000..74ea861d8
--- /dev/null
+++ b/src/gpu-compute/kernel_cfg.hh
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#ifndef __KERNEL_CFG_HH__
+#define __KERNEL_CFG_HH__
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <set>
+#include <vector>
+
+
+class GPUStaticInst;
+class HsailCode;
+
+/**
+ * A maximal straight-line run of kernel instructions, plus the single
+ * synthetic empty "exit" block appended at the end of the CFG.
+ */
+struct BasicBlock
+{
+    BasicBlock(uint32_t num, GPUStaticInst* begin) :
+        id(num), size(0), firstInstruction(begin)
+    {
+    }
+
+    // the entry block is created first, so it always has id 0
+    bool
+    isEntry() const
+    {
+        return !id;
+    }
+
+    // only the synthetic exit block stays empty after CFG construction
+    bool
+    isExit() const
+    {
+        return !size;
+    }
+
+    /**
+     * Unique identifier for the block within a given kernel.
+     */
+    const uint32_t id;
+
+    /**
+     * Number of instructions contained in the block
+     */
+    size_t size;
+
+    /**
+     * Pointer to first instruction of the block (nullptr for the exit
+     * block).
+     */
+    GPUStaticInst* firstInstruction;
+
+    /**
+     * Identifiers of the blocks that follow (are reachable from) this block.
+     */
+    std::set<uint32_t> successorIds;
+
+    /**
+     * Post-dominators: blocks through which every path from this block
+     * to the exit must pass.
+     */
+    std::set<uint32_t> postDominatorIds;
+};
+
+/**
+ * Builds a control-flow graph over a kernel's instruction list and
+ * computes post-dominators, so each branch can be annotated with its
+ * immediate post-dominator (reconvergence point).  Instances are only
+ * created internally by the static assignImmediatePostDominators()
+ * entry point.
+ */
+class ControlFlowInfo
+{
+public:
+
+    /**
+     * Compute immediate post-dominator instruction for kernel instructions.
+     */
+    static void assignImmediatePostDominators(
+        const std::vector<GPUStaticInst*>& instructions);
+
+private:
+    ControlFlowInfo(const std::vector<GPUStaticInst*>& instructions);
+
+    // last real instruction of a block (nullptr for the exit block)
+    GPUStaticInst* lastInstruction(const BasicBlock* block) const;
+
+    // block whose instruction range covers inst_num, or nullptr
+    BasicBlock* basicBlock(int inst_num) const;
+
+    // block containing the immediate post-dominator of 'block'
+    BasicBlock* postDominator(const BasicBlock* block) const;
+
+    void createBasicBlocks();
+
+    void connectBasicBlocks();
+
+    void findPostDominators();
+
+    void findImmediatePostDominators();
+
+    // debug printing helpers
+    void printBasicBlocks() const;
+
+    void printBasicBlockDot() const;
+
+    void printPostDominators() const;
+
+    void printImmediatePostDominators() const;
+
+    std::vector<std::unique_ptr<BasicBlock>> basicBlocks;
+    std::vector<GPUStaticInst*> instructions;
+};
+
+#endif // __KERNEL_CFG_HH__
diff --git a/src/gpu-compute/lds_state.cc b/src/gpu-compute/lds_state.cc
new file mode 100644
index 000000000..91ee8009a
--- /dev/null
+++ b/src/gpu-compute/lds_state.cc
@@ -0,0 +1,341 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos, Joe Gross
+ */
+
+#include "gpu-compute/lds_state.hh"
+
+#include <array>
+#include <cstdio>
+#include <cstdlib>
+
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/shader.hh"
+
+/**
+ * the default constructor that works with SWIG
+ *
+ * Validates the configuration: the bank count must be a positive power
+ * of two, and the LDS size a positive, even number of bytes.
+ */
+LdsState::LdsState(const Params *params) :
+    MemObject(params),
+    tickEvent(this),
+    cuPort(name() + ".port", this),
+    maximumSize(params->size),
+    range(params->range),
+    bankConflictPenalty(params->bankConflictPenalty),
+    banks(params->banks)
+{
+    fatal_if(params->banks <= 0,
+             "Number of LDS banks should be positive number");
+    fatal_if((params->banks & (params->banks - 1)) != 0,
+             "Number of LDS banks should be a power of 2");
+    fatal_if(params->size <= 0,
+             "cannot allocate an LDS with a size less than 1");
+    fatal_if(params->size % 2,
+             "the LDS should be an even number");
+}
+
+/**
+ * Needed by the SWIG compiler: factory that builds the LdsState
+ * SimObject from its generated params.
+ */
+LdsState *
+LdsStateParams::create()
+{
+    return new LdsState(this);
+}
+
+/**
+ * set the parent and name based on the parent
+ *
+ * NOTE(review): the second fatal_if only rejects re-setting the SAME
+ * compute unit; assigning a different CU after the first would silently
+ * overwrite 'parent' — confirm whether fatal_if(parent, ...) was meant.
+ */
+void
+LdsState::setParent(ComputeUnit *x_parent)
+{
+    // check that this gets assigned to the same thing each time
+    fatal_if(!x_parent, "x_parent should not be nullptr");
+    fatal_if(x_parent == parent,
+             "should not be setting the parent twice");
+
+    parent = x_parent;
+    _name = x_parent->name() + ".LdsState";
+}
+
+/**
+ * derive the gpu mem packet from the packet and then count the bank
+ * conflicts
+ *
+ * Walks the sender-state chain back to the original CU-side LDSPort
+ * state, recovers the dynamic instruction it carries, and delegates to
+ * the GPUDynInstPtr overload.
+ */
+unsigned
+LdsState::countBankConflicts(PacketPtr packet, unsigned *bankAccesses)
+{
+    Packet::SenderState *baseSenderState = packet->senderState;
+    // follow the chain to the bottom-most (original) sender state
+    while (baseSenderState->predecessor) {
+        baseSenderState = baseSenderState->predecessor;
+    }
+    const ComputeUnit::LDSPort::SenderState *senderState =
+        dynamic_cast<ComputeUnit::LDSPort::SenderState *>(baseSenderState);
+
+    fatal_if(!senderState,
+             "did not get the right sort of sender state");
+
+    GPUDynInstPtr gpuDynInst = senderState->getMemInst();
+
+    return countBankConflicts(gpuDynInst, bankAccesses);
+}
+
+/**
+ * Count the total number of bank conflicts for the local memory packet.
+ * Also accumulates into *numBankAccesses the number of physical bank
+ * accesses performed.  Lanes are processed in groups of up to 'banks'
+ * at a time; within a group, the most-referenced bank determines how
+ * many serialized passes (conflicts) the group costs.
+ */
+unsigned
+LdsState::countBankConflicts(GPUDynInstPtr gpuDynInst,
+                             unsigned *numBankAccesses)
+{
+    int bank_conflicts = 0;
+    std::vector<int> bank;
+    // the number of LDS banks being touched by the memory instruction
+    int numBanks = std::min(parent->wfSize(), banks);
+    // if the wavefront size is larger than the number of LDS banks, we
+    // need to iterate over all work items to calculate the total
+    // number of bank conflicts
+    int groups = (parent->wfSize() > numBanks) ?
+        (parent->wfSize() / numBanks) : 1;
+    for (int i = 0; i < groups; i++) {
+        // Address Array holding all the work item addresses of an instruction
+        std::vector<Addr> addr_array;
+        addr_array.resize(numBanks, 0);
+        bank.clear();
+        bank.resize(banks, 0);
+        int max_bank = 0;
+
+        // populate the address array for all active work items;
+        // inactive lanes are marked with the max Addr sentinel
+        for (int j = 0; j < numBanks; j++) {
+            if (gpuDynInst->exec_mask[(i*numBanks)+j]) {
+                addr_array[j] = gpuDynInst->addr[(i*numBanks)+j];
+            } else {
+                addr_array[j] = std::numeric_limits<Addr>::max();
+            }
+        }
+
+        if (gpuDynInst->m_op == Enums::MO_LD ||
+            gpuDynInst->m_op == Enums::MO_ST) {
+            // mask identical addresses (they coalesce, not conflict)
+            for (int j = 0; j < numBanks; ++j) {
+                for (int j0 = 0; j0 < j; j0++) {
+                    if (addr_array[j] != std::numeric_limits<Addr>::max()
+                                    && addr_array[j] == addr_array[j0]) {
+                        addr_array[j] = std::numeric_limits<Addr>::max();
+                    }
+                }
+            }
+        }
+        // calculate bank conflicts
+        for (int j = 0; j < numBanks; ++j) {
+            if (addr_array[j] != std::numeric_limits<Addr>::max()) {
+                int bankId = addr_array[j] % banks;
+                bank[bankId]++;
+                max_bank = std::max(max_bank, bank[bankId]);
+                // Count the number of LDS banks accessed.
+                // Since we have masked identical addresses all remaining
+                // accesses will need to be serialized if they access
+                // the same bank (bank conflict).
+                (*numBankAccesses)++;
+            }
+        }
+        bank_conflicts += max_bank;
+    }
+    panic_if(bank_conflicts > parent->wfSize(),
+             "Max bank conflicts should match num of work items per instr");
+    return bank_conflicts;
+}
+
+/**
+ * receive the packet from the CU; returns whether the LDS accepted it
+ */
+bool
+LdsState::CuSidePort::recvTimingReq(PacketPtr packet)
+{
+    return ownerLds->processPacket(packet);
+}
+
+/**
+ * Extract the GPU dynamic instruction attached to a packet's sender
+ * state.
+ * NOTE(review): the dynamic_cast result is not checked here (unlike in
+ * countBankConflicts); a packet carrying a different sender-state type
+ * would crash on the call below — confirm whether a fatal_if is wanted.
+ */
+GPUDynInstPtr
+LdsState::getDynInstr(PacketPtr packet)
+{
+    ComputeUnit::LDSPort::SenderState *ss =
+        dynamic_cast<ComputeUnit::LDSPort::SenderState *>(
+                     packet->senderState);
+    return ss->getMemInst();
+}
+
+/**
+ * process an incoming packet, add it to the return queue
+ *
+ * Computes the bank-conflict and bus-occupancy delay for this access,
+ * updates the parent CU's LDS statistics, and queues the packet to be
+ * returned after that delay (measured from the return time of the last
+ * packet already queued).
+ */
+bool
+LdsState::processPacket(PacketPtr packet)
+{
+    unsigned bankAccesses = 0;
+    // the number of conflicts this packet will have when accessing the LDS
+    unsigned bankConflicts = countBankConflicts(packet, &bankAccesses);
+    // count the total number of physical LDS bank accessed
+    parent->ldsBankAccesses += bankAccesses;
+    // count the LDS bank conflicts. A number set to 1 indicates one
+    // access per bank maximum so there are no bank conflicts
+    parent->ldsBankConflictDist.sample(bankConflicts-1);
+
+    GPUDynInstPtr dynInst = getDynInstr(packet);
+    // account for the LDS bank conflict overhead
+    int busLength = (dynInst->m_op == Enums::MO_LD) ? parent->loadBusLength() :
+                    (dynInst->m_op == Enums::MO_ST) ? parent->storeBusLength() :
+                    parent->loadBusLength();
+    // delay for accessing the LDS
+    Tick processingTime =
+        parent->shader->ticks(bankConflicts * bankConflictPenalty) +
+        parent->shader->ticks(busLength);
+    // choose (delay + last packet in queue) or (now + delay) as the time to
+    // return this
+    Tick doneAt = earliestReturnTime() + processingTime;
+    // then store it for processing
+    return returnQueuePush(std::make_pair(doneAt, packet));
+}
+
+/**
+ * add this to the queue of packets to be returned
+ *
+ * Always accepts the packet (no flow control yet), and kicks process()
+ * if no wakeup is currently scheduled so the tick event gets armed.
+ */
+bool
+LdsState::returnQueuePush(std::pair<Tick, PacketPtr> thePair)
+{
+    // TODO add time limits (e.g. one packet per cycle) and queue size limits
+    // and implement flow control
+    returnQueue.push(thePair);
+
+    // if there is no set wakeup time, look through the queue
+    if (!tickEvent.scheduled()) {
+        process();
+    }
+
+    return true;
+}
+
+/**
+ * receive a packet in functional mode (unsupported for the LDS)
+ */
+void
+LdsState::CuSidePort::recvFunctional(PacketPtr pkt)
+{
+    fatal("not implemented");
+}
+
+/**
+ * receive a retry for a response: clear the pending-retry flag and
+ * re-run process() so queued responses are resent
+ */
+void
+LdsState::CuSidePort::recvRespRetry()
+{
+    // TODO verify that this is the right way to do this
+    assert(ownerLds->isRetryResp());
+    ownerLds->setRetryResp(false);
+    ownerLds->process();
+}
+
+/**
+ * receive a retry (request path; unsupported for the LDS)
+ */
+void
+LdsState::CuSidePort::recvRetry()
+{
+    fatal("not implemented");
+}
+
+/**
+ * look for packets to return at this time
+ *
+ * Drains every queued packet whose completion time has arrived: runs the
+ * instruction's access, converts the packet to a timing response, and
+ * sends it back through the CU-side port.  Then (re)schedules the tick
+ * event for the earliest remaining packet, if any.
+ */
+bool
+LdsState::process()
+{
+    Tick now = clockEdge();
+
+    // send back completed packets
+    while (!returnQueue.empty() && returnQueue.front().first <= now) {
+        PacketPtr packet = returnQueue.front().second;
+
+        ComputeUnit::LDSPort::SenderState *ss =
+            dynamic_cast<ComputeUnit::LDSPort::SenderState *>(
+                            packet->senderState);
+
+        GPUDynInstPtr gpuDynInst = ss->getMemInst();
+
+        gpuDynInst->initiateAcc(gpuDynInst);
+
+        packet->makeTimingResponse();
+
+        returnQueue.pop();
+
+        bool success = cuPort.sendTimingResp(packet);
+
+        if (!success) {
+            // TODO: flow control — a NACK'd response is currently fatal
+            retryResp = true;
+            panic("have not handled timing responses being NACK'd when sent"
+                            "back");
+        }
+    }
+
+    // determine the next wakeup time
+    if (!returnQueue.empty()) {
+
+        Tick next = returnQueue.front().first;
+
+        if (tickEvent.scheduled()) {
+
+            if (next < tickEvent.when()) {
+
+                tickEvent.deschedule();
+                tickEvent.schedule(next);
+            }
+        } else {
+            tickEvent.schedule(next);
+        }
+    }
+
+    return true;
+}
+
+/**
+ * wake up at this time and drain the LDS return queue
+ */
+void
+LdsState::TickEvent::process()
+{
+    ldsState->process();
+}
+
+/**
+ * Stat registration hook; the LDS defines no stats of its own — the
+ * bank access/conflict stats it updates live on the parent ComputeUnit
+ * (see processPacket()).
+ */
+void
+LdsState::regStats()
+{
+}
diff --git a/src/gpu-compute/lds_state.hh b/src/gpu-compute/lds_state.hh
new file mode 100644
index 000000000..89f08a1d3
--- /dev/null
+++ b/src/gpu-compute/lds_state.hh
@@ -0,0 +1,512 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos, Joe Gross
+ */
+
+#ifndef __LDS_STATE_HH__
+#define __LDS_STATE_HH__
+
+#include <array>
+#include <queue>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "enums/MemOpType.hh"
+#include "enums/MemType.hh"
+#include "gpu-compute/misc.hh"
+#include "mem/mem_object.hh"
+#include "mem/port.hh"
+#include "params/LdsState.hh"
+
+class ComputeUnit;
+
+/**
+ * this represents a slice of the overall LDS, intended to be associated with an
+ * individual workgroup
+ */
+class LdsChunk
+{
+  public:
+    LdsChunk(const uint32_t x_size):
+        chunk(x_size)
+    {
+    }
+
+    LdsChunk() {}
+
+    /**
+     * a read operation
+     *
+     * Reinterprets the byte store at 'index' as a T.
+     * NOTE(review): only 'index' is bounds-checked, so a multi-byte T
+     * near the end may read past the chunk, and alignment of T within
+     * the byte array is assumed — confirm caller guarantees.
+     */
+    template<class T>
+    T
+    read(const uint32_t index)
+    {
+        fatal_if(!chunk.size(), "cannot read from an LDS chunk of size 0");
+        fatal_if(index >= chunk.size(), "out-of-bounds access to an LDS chunk");
+        T *p0 = (T *) (&(chunk.at(index)));
+        return *p0;
+    }
+
+    /**
+     * a write operation
+     *
+     * Reinterprets the byte store at 'index' as a T and stores 'value'.
+     * NOTE(review): same bounds/alignment caveats as read().
+     */
+    template<class T>
+    void
+    write(const uint32_t index, const T value)
+    {
+        fatal_if(!chunk.size(), "cannot write to an LDS chunk of size 0");
+        fatal_if(index >= chunk.size(), "out-of-bounds access to an LDS chunk");
+        T *p0 = (T *) (&(chunk.at(index)));
+        *p0 = value;
+    }
+
+    /**
+     * get the size of this chunk, in bytes
+     */
+    std::vector<uint8_t>::size_type
+    size() const
+    {
+        return chunk.size();
+    }
+
+  protected:
+    // the actual data store for this slice of the LDS
+    std::vector<uint8_t> chunk;
+};
+
+// Local Data Share (LDS) State per Wavefront (contents of the LDS region
+// allocated to the WorkGroup of this Wavefront)
+class LdsState: public MemObject
+{
+  protected:
+
+    /**
+     * an event to allow event-driven execution
+     */
+    class TickEvent: public Event
+    {
+      protected:
+
+        LdsState *ldsState = nullptr;
+
+        Tick nextTick = 0;
+
+      public:
+
+        TickEvent(LdsState *_ldsState) :
+            ldsState(_ldsState)
+        {
+        }
+
+        virtual void
+        process();
+
+        // schedule on the main event queue at the given tick
+        void
+        schedule(Tick when)
+        {
+            mainEventQueue[0]->schedule(this, when);
+        }
+
+        void
+        deschedule()
+        {
+            mainEventQueue[0]->deschedule(this);
+        }
+    };
+
+    /**
+     * CuSidePort is the LDS Port closer to the CU side
+     */
+    class CuSidePort: public SlavePort
+    {
+      public:
+        CuSidePort(const std::string &_name, LdsState *_ownerLds) :
+            SlavePort(_name, _ownerLds), ownerLds(_ownerLds)
+        {
+        }
+
+      protected:
+        LdsState *ownerLds;
+
+        virtual bool
+        recvTimingReq(PacketPtr pkt);
+
+        // atomic accesses are not modeled; report zero latency
+        virtual Tick
+        recvAtomic(PacketPtr pkt)
+        {
+          return 0;
+        }
+
+        virtual void
+        recvFunctional(PacketPtr pkt);
+
+        virtual void
+        recvRangeChange()
+        {
+        }
+
+        virtual void
+        recvRetry();
+
+        virtual void
+        recvRespRetry();
+
+        virtual AddrRangeList
+        getAddrRanges() const
+        {
+          AddrRangeList ranges;
+          ranges.push_back(ownerLds->getAddrRange());
+          return ranges;
+        }
+
+        template<typename T>
+        void
+        loadData(PacketPtr packet);
+
+        template<typename T>
+        void
+        storeData(PacketPtr packet);
+
+        template<typename T>
+        void
+        atomicOperation(PacketPtr packet);
+    };
+
+  protected:
+
+    // the lds reference counter
+    // The key is the workgroup ID and dispatch ID
+    // The value is the number of wavefronts that reference this LDS, as
+    // wavefronts are launched, the counter goes up for that workgroup and when
+    // they return it decreases, once it reaches 0 then this chunk of the LDS is
+    // returned to the available pool. However, it is deallocated on the 1->0
+    // transition, not whenever the counter is 0 as it always starts with 0 when
+    // the workgroup asks for space
+    std::unordered_map<uint32_t,
+                       std::unordered_map<uint32_t, int32_t>> refCounter;
+
+    // the map that allows workgroups to access their own chunk of the LDS
+    std::unordered_map<uint32_t,
+                       std::unordered_map<uint32_t, LdsChunk>> chunkMap;
+
+    // an event to allow the LDS to wake up at a specified time
+    TickEvent tickEvent;
+
+    // the queue of packets that are going back to the CU after a
+    // read/write/atomic op
+    // TODO need to make this have a maximum size to create flow control
+    std::queue<std::pair<Tick, PacketPtr>> returnQueue;
+
+    // whether or not there are pending responses
+    bool retryResp = false;
+
+    bool
+    process();
+
+    GPUDynInstPtr
+    getDynInstr(PacketPtr packet);
+
+    bool
+    processPacket(PacketPtr packet);
+
+    unsigned
+    countBankConflicts(PacketPtr packet, unsigned *bankAccesses);
+
+    unsigned
+    countBankConflicts(GPUDynInstPtr gpuDynInst,
+                       unsigned *numBankAccesses);
+
+  public:
+    typedef LdsStateParams Params;
+
+    LdsState(const Params *params);
+
+    // prevent copy construction
+    LdsState(const LdsState&) = delete;
+
+    ~LdsState()
+    {
+        parent = nullptr;
+    }
+
+    const Params *
+    params() const
+    {
+        return dynamic_cast<const Params *>(_params);
+    }
+
+    bool
+    isRetryResp() const
+    {
+        return retryResp;
+    }
+
+    void
+    setRetryResp(const bool value)
+    {
+        retryResp = value;
+    }
+
+    // prevent assignment
+    LdsState &
+    operator=(const LdsState &) = delete;
+
+    /**
+     * use the dynamic wave id to create or just increase the reference count
+     */
+    int
+    increaseRefCounter(const uint32_t dispatchId, const uint32_t wgId)
+    {
+        int refCount = getRefCounter(dispatchId, wgId);
+        fatal_if(refCount < 0,
+                 "reference count should not be below zero");
+        return ++refCounter[dispatchId][wgId];
+    }
+
+    /**
+     * decrease the reference count after making sure it is in the list
+     * give back this chunk if the ref counter has reached 0
+     */
+    int
+    decreaseRefCounter(const uint32_t dispatchId, const uint32_t wgId)
+    {
+      int refCount = getRefCounter(dispatchId, wgId);
+
+      fatal_if(refCount <= 0,
+               "reference count should not be below zero or at zero to"
+               "decrement");
+
+      refCounter[dispatchId][wgId]--;
+
+      if (refCounter[dispatchId][wgId] == 0) {
+        releaseSpace(dispatchId, wgId);
+        return 0;
+      } else {
+        return refCounter[dispatchId][wgId];
+      }
+    }
+
+    /**
+     * return the current reference count for this workgroup id
+     */
+    int
+    getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const
+    {
+      // first validate that the workgroup actually reserved space
+      auto dispatchIter = chunkMap.find(dispatchId);
+      fatal_if(dispatchIter == chunkMap.end(),
+               "could not locate this dispatch id [%d]", dispatchId);
+
+      auto workgroup = dispatchIter->second.find(wgId);
+      fatal_if(workgroup == dispatchIter->second.end(),
+               "could not find this workgroup id within this dispatch id"
+               " did[%d] wgid[%d]", dispatchId, wgId);
+
+      auto refCountIter = refCounter.find(dispatchId);
+      if (refCountIter == refCounter.end()) {
+        fatal("could not locate this dispatch id [%d]", dispatchId);
+      } else {
+        auto workgroup = refCountIter->second.find(wgId);
+        if (workgroup == refCountIter->second.end()) {
+          fatal("could not find this workgroup id within this dispatch id"
+                " did[%d] wgid[%d]", dispatchId, wgId);
+        } else {
+          return refCounter.at(dispatchId).at(wgId);
+        }
+      }
+
+      fatal("should not reach this point");
+      return 0;
+    }
+
+    /**
+     * assign a parent and request this amount of space be set aside
+     * for this wgid
+     */
+    LdsChunk *
+    reserveSpace(const uint32_t dispatchId, const uint32_t wgId,
+            const uint32_t size)
+    {
+        if (chunkMap.find(dispatchId) != chunkMap.end()) {
+            fatal_if(
+                chunkMap[dispatchId].find(wgId) != chunkMap[dispatchId].end(),
+                "duplicate workgroup ID asking for space in the LDS "
+                "did[%d] wgid[%d]", dispatchId, wgId);
+        }
+
+        fatal_if(bytesAllocated + size > maximumSize,
+                 "request would ask for more space than is available");
+
+        bytesAllocated += size;
+
+        chunkMap[dispatchId].emplace(wgId, LdsChunk(size));
+        // make an entry for this workgroup
+        refCounter[dispatchId][wgId] = 0;
+
+        return &chunkMap[dispatchId][wgId];
+    }
+
+    bool
+    returnQueuePush(std::pair<Tick, PacketPtr> thePair);
+
+    // completion time of the last queued packet, or now if queue is empty
+    Tick
+    earliestReturnTime() const
+    {
+        // TODO set to max(lastCommand+1, curTick())
+        return returnQueue.empty() ? curTick() : returnQueue.back().first;
+    }
+
+    void
+    setParent(ComputeUnit *x_parent);
+
+    void
+    regStats();
+
+    // accessors
+    ComputeUnit *
+    getParent() const
+    {
+        return parent;
+    }
+
+    std::string
+    getName()
+    {
+        return _name;
+    }
+
+    int
+    getBanks() const
+    {
+        return banks;
+    }
+
+    ComputeUnit *
+    getComputeUnit() const
+    {
+        return parent;
+    }
+
+    int
+    getBankConflictPenalty() const
+    {
+        return bankConflictPenalty;
+    }
+
+    /**
+     * get the allocated size for this workgroup
+     * NOTE(review): chunkMap is keyed by dispatch id at the top level,
+     * but this indexes it with a workgroup id and returns the number of
+     * chunks in that map, not a byte size — confirm intended semantics.
+     */
+    std::size_t
+    ldsSize(const uint32_t x_wgId)
+    {
+        return chunkMap[x_wgId].size();
+    }
+
+    AddrRange
+    getAddrRange() const
+    {
+        return range;
+    }
+
+    virtual BaseSlavePort &
+    getSlavePort(const std::string& if_name, PortID idx)
+    {
+        if (if_name == "cuPort") {
+            // TODO need to set name dynamically at this point?
+            return cuPort;
+        } else {
+            fatal("cannot resolve the port name " + if_name);
+        }
+    }
+
+    /**
+     * can this much space be reserved for a workgroup?
+     */
+    bool
+    canReserve(uint32_t x_size) const
+    {
+      return bytesAllocated + x_size <= maximumSize;
+    }
+
+  private:
+    /**
+     * give back the space
+     */
+    bool
+    releaseSpace(const uint32_t x_dispatchId, const uint32_t x_wgId)
+    {
+        auto dispatchIter = chunkMap.find(x_dispatchId);
+
+        if (dispatchIter == chunkMap.end()) {
+          fatal("dispatch id not found [%d]", x_dispatchId);
+        } else {
+          auto workgroupIter = dispatchIter->second.find(x_wgId);
+          if (workgroupIter == dispatchIter->second.end()) {
+            fatal("workgroup id [%d] not found in dispatch id [%d]",
+                    x_wgId, x_dispatchId);
+          }
+        }
+
+        fatal_if(bytesAllocated < chunkMap[x_dispatchId][x_wgId].size(),
+                 "releasing more space than was allocated");
+
+        bytesAllocated -= chunkMap[x_dispatchId][x_wgId].size();
+        chunkMap[x_dispatchId].erase(chunkMap[x_dispatchId].find(x_wgId));
+        return true;
+    }
+
+    // the port that connects this LDS to its owner CU
+    CuSidePort cuPort;
+
+    ComputeUnit* parent = nullptr;
+
+    std::string _name;
+
+    // the number of bytes currently reserved by all workgroups
+    int bytesAllocated = 0;
+
+    // the size of the LDS, the most bytes available
+    int maximumSize;
+
+    // Address range of this memory
+    AddrRange range;
+
+    // the penalty, in cycles, for each LDS bank conflict
+    int bankConflictPenalty = 0;
+
+    // the number of banks in the LDS underlying data store
+    int banks = 0;
+};
+
+#endif // __LDS_STATE_HH__
diff --git a/src/gpu-compute/local_memory_pipeline.cc b/src/gpu-compute/local_memory_pipeline.cc
new file mode 100644
index 000000000..7f919c5f4
--- /dev/null
+++ b/src/gpu-compute/local_memory_pipeline.cc
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#include "gpu-compute/local_memory_pipeline.hh"
+
+#include "debug/GPUPort.hh"
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/shader.hh"
+#include "gpu-compute/vector_register_file.hh"
+#include "gpu-compute/wavefront.hh"
+
+// The pipeline is constructed before its owning ComputeUnit is known;
+// init() fills in the back-pointer and the stats name later.
+LocalMemPipeline::LocalMemPipeline(const ComputeUnitParams* p) :
+    computeUnit(nullptr), lmQueueSize(p->local_mem_queue_size)
+{
+}
+
+// Bind this pipeline to its owning compute unit and derive the
+// stats/debug name from the CU's name.
+void
+LocalMemPipeline::init(ComputeUnit *cu)
+{
+    computeUnit = cu;
+    _name = computeUnit->name() + ".LocalMemPipeline";
+}
+
+// Advance the local-memory pipeline by one cycle: retire at most one
+// returned LDS response (writing load/atomic results to the VRF) and
+// issue at most one pending request to the LDS.
+void
+LocalMemPipeline::exec()
+{
+    // apply any returned shared (LDS) memory operations
+    GPUDynInstPtr m = !lmReturnedRequests.empty() ?
+        lmReturnedRequests.front() : nullptr;
+
+    bool accessVrf = true;
+    // only loads and atomics-with-return write the VRF, so only they
+    // need the destination operands ready before retiring
+    if ((m) && (m->m_op==Enums::MO_LD || MO_A(m->m_op))) {
+        Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId];
+
+        accessVrf =
+            w->computeUnit->vrf[m->simdId]->
+            vrfOperandAccessReady(m->seqNum(), w, m,
+                                  VrfAccessType::WRITE);
+    }
+
+    // retire the front response once its modeled latency has elapsed,
+    // the VRF can accept the write, the LDS->VRF bus is free, and
+    // (unless coissue_return is set) the wave's pipe slot is ready
+    if (!lmReturnedRequests.empty() && m->latency.rdy() && accessVrf &&
+        computeUnit->locMemToVrfBus.rdy() && (computeUnit->shader->coissue_return
+        || computeUnit->wfWait.at(m->pipeId).rdy())) {
+        // dispatch on the (vector width, memory element type) pair;
+        // doSmReturn is templated on <register type, memory type>
+        if (m->v_type == VT_32 && m->m_type == Enums::M_U8)
+            doSmReturn<uint32_t, uint8_t>(m);
+        else if (m->v_type == VT_32 && m->m_type == Enums::M_U16)
+            doSmReturn<uint32_t, uint16_t>(m);
+        else if (m->v_type == VT_32 && m->m_type == Enums::M_U32)
+            doSmReturn<uint32_t, uint32_t>(m);
+        else if (m->v_type == VT_32 && m->m_type == Enums::M_S8)
+            doSmReturn<int32_t, int8_t>(m);
+        else if (m->v_type == VT_32 && m->m_type == Enums::M_S16)
+            doSmReturn<int32_t, int16_t>(m);
+        else if (m->v_type == VT_32 && m->m_type == Enums::M_S32)
+            doSmReturn<int32_t, int32_t>(m);
+        else if (m->v_type == VT_32 && m->m_type == Enums::M_F16)
+            doSmReturn<float, Float16>(m);
+        else if (m->v_type == VT_32 && m->m_type == Enums::M_F32)
+            doSmReturn<float, float>(m);
+        else if (m->v_type == VT_64 && m->m_type == Enums::M_U8)
+            doSmReturn<uint64_t, uint8_t>(m);
+        else if (m->v_type == VT_64 && m->m_type == Enums::M_U16)
+            doSmReturn<uint64_t, uint16_t>(m);
+        else if (m->v_type == VT_64 && m->m_type == Enums::M_U32)
+            doSmReturn<uint64_t, uint32_t>(m);
+        else if (m->v_type == VT_64 && m->m_type == Enums::M_U64)
+            doSmReturn<uint64_t, uint64_t>(m);
+        else if (m->v_type == VT_64 && m->m_type == Enums::M_S8)
+            doSmReturn<int64_t, int8_t>(m);
+        else if (m->v_type == VT_64 && m->m_type == Enums::M_S16)
+            doSmReturn<int64_t, int16_t>(m);
+        else if (m->v_type == VT_64 && m->m_type == Enums::M_S32)
+            doSmReturn<int64_t, int32_t>(m);
+        else if (m->v_type == VT_64 && m->m_type == Enums::M_S64)
+            doSmReturn<int64_t, int64_t>(m);
+        else if (m->v_type == VT_64 && m->m_type == Enums::M_F16)
+            doSmReturn<double, Float16>(m);
+        else if (m->v_type == VT_64 && m->m_type == Enums::M_F32)
+            doSmReturn<double, float>(m);
+        else if (m->v_type == VT_64 && m->m_type == Enums::M_F64)
+            doSmReturn<double, double>(m);
+    }
+
+    // If pipeline has executed a local memory instruction
+    // execute local memory packet and issue the packets
+    // to LDS
+    if (!lmIssuedRequests.empty() && lmReturnedRequests.size() < lmQueueSize) {
+
+        GPUDynInstPtr m = lmIssuedRequests.front();
+
+        bool returnVal = computeUnit->sendToLds(m);
+        if (!returnVal) {
+            DPRINTF(GPUPort, "packet was nack'd and put in retry queue");
+        }
+        // NOTE(review): the request is popped even when sendToLds()
+        // nack'd it; presumably the CU keeps the instruction on a retry
+        // list — confirm against ComputeUnit::sendToLds
+        lmIssuedRequests.pop();
+    }
+}
+
+/**
+ * Retire one returned LDS operation. For loads and atomics-with-return,
+ * copy the returned data (elements of memory type c1) into the wave's
+ * physical VGPRs (register type c0) and model the timing of the VRF
+ * write. Then decrement the wave's outstanding-request counters and
+ * occupy the LDS->VRF bus (and, without coissue, the wave's pipe slot).
+ */
+template<typename c0, typename c1>
+void
+LocalMemPipeline::doSmReturn(GPUDynInstPtr m)
+{
+    lmReturnedRequests.pop();
+    Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId];
+
+    // Return data to registers
+    if (m->m_op == Enums::MO_LD || MO_A(m->m_op)) {
+        std::vector<uint32_t> regVec;
+        for (int k = 0; k < m->n_reg; ++k) {
+            int dst = m->dst_reg+k;
+
+            // wide vector-memory ops carry an explicit destination list
+            if (m->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST)
+                dst = m->dst_reg_vec[k];
+            // virtual->physical VGPR mapping
+            int physVgpr = w->remap(dst,sizeof(c0),1);
+            // save the physical VGPR index
+            regVec.push_back(physVgpr);
+            c1 *p1 = &((c1*)m->d_data)[k * VSZ];
+
+            for (int i = 0; i < VSZ; ++i) {
+                if (m->exec_mask[i]) {
+                    // write the value into the physical VGPR. This is a purely
+                    // functional operation. No timing is modeled.
+                    w->computeUnit->vrf[w->simdId]->write<c0>(physVgpr,
+                                                                *p1, i);
+                }
+                ++p1;
+            }
+        }
+
+        // Schedule the write operation of the load data on the VRF. This simply
+        // models the timing aspect of the VRF write operation. It does not
+        // modify the physical VGPR.
+        loadVrfBankConflictCycles +=
+            w->computeUnit->vrf[w->simdId]->exec(m->seqNum(), w,
+                                                 regVec, sizeof(c0), m->time);
+    }
+
+    // Decrement outstanding request count
+    computeUnit->shader->ScheduleAdd(&w->outstanding_reqs, m->time, -1);
+
+    // writes and read-modify-write flavors count against the LM write quota
+    if (m->m_op == Enums::MO_ST || MO_A(m->m_op) || MO_ANR(m->m_op)
+        || MO_H(m->m_op)) {
+        computeUnit->shader->ScheduleAdd(&w->outstanding_reqs_wr_lm,
+                                         m->time, -1);
+    }
+
+    // loads and atomic flavors count against the LM read quota
+    if (m->m_op == Enums::MO_LD || MO_A(m->m_op) || MO_ANR(m->m_op)) {
+        computeUnit->shader->ScheduleAdd(&w->outstanding_reqs_rd_lm,
+                                         m->time, -1);
+    }
+
+    // Mark write bus busy for appropriate amount of time
+    computeUnit->locMemToVrfBus.set(m->time);
+    if (computeUnit->shader->coissue_return == 0)
+        w->computeUnit->wfWait.at(m->pipeId).set(m->time);
+}
+
+// Register this pipeline's statistics with the stats framework.
+void
+LocalMemPipeline::regStats()
+{
+    loadVrfBankConflictCycles.name(name() + ".load_vrf_bank_conflict_cycles");
+    loadVrfBankConflictCycles.desc("total number of cycles LDS data are "
+                                   "delayed before updating the VRF");
+}
diff --git a/src/gpu-compute/local_memory_pipeline.hh b/src/gpu-compute/local_memory_pipeline.hh
new file mode 100644
index 000000000..a63d867d0
--- /dev/null
+++ b/src/gpu-compute/local_memory_pipeline.hh
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#ifndef __LOCAL_MEMORY_PIPELINE_HH__
+#define __LOCAL_MEMORY_PIPELINE_HH__
+
+#include <queue>
+#include <string>
+
+#include "gpu-compute/misc.hh"
+#include "params/ComputeUnit.hh"
+#include "sim/stats.hh"
+
+/*
+ * @file local_memory_pipeline.hh
+ *
+ * The local memory pipeline issues newly created local memory packets
+ * from pipeline to the LDS. This stage also retires previously issued
+ * loads and stores that have returned from the LDS.
+ */
+
+class ComputeUnit;
+class Wavefront;
+
+class LocalMemPipeline
+{
+  public:
+    LocalMemPipeline(const ComputeUnitParams *params);
+    // bind to the owning CU; must be called before exec()
+    void init(ComputeUnit *cu);
+    // advance the pipeline by one cycle (retire one response, issue one
+    // request)
+    void exec();
+
+    // retire one returned LDS op; c0 = register type, c1 = memory type
+    template<typename c0, typename c1> void doSmReturn(GPUDynInstPtr m);
+
+    std::queue<GPUDynInstPtr> &getLMReqFIFO() { return lmIssuedRequests; }
+    std::queue<GPUDynInstPtr> &getLMRespFIFO() { return lmReturnedRequests; }
+
+    // is there room for one more entry in the response FIFO?
+    // NOTE(review): size_t vs int comparison; assumes lmQueueSize >= 0
+    bool
+    isLMRespFIFOWrRdy() const
+    {
+        return lmReturnedRequests.size() < lmQueueSize;
+    }
+
+    // is there room for (pendReqs + 1) more entries in the request FIFO?
+    bool
+    isLMReqFIFOWrRdy(uint32_t pendReqs=0) const
+    {
+        return (lmIssuedRequests.size() + pendReqs) < lmQueueSize;
+    }
+
+    const std::string& name() const { return _name; }
+    void regStats();
+
+  private:
+    ComputeUnit *computeUnit;
+    std::string _name;
+    // maximum depth of either FIFO
+    int lmQueueSize;
+    Stats::Scalar loadVrfBankConflictCycles;
+    // Local Memory Request Fifo: all shared memory requests
+    // are issued to this FIFO from the memory pipelines
+    std::queue<GPUDynInstPtr> lmIssuedRequests;
+
+    // Local Memory Response Fifo: all responses of shared memory
+    // requests are sent to this FIFO from LDS
+    std::queue<GPUDynInstPtr> lmReturnedRequests;
+};
+
+#endif // __LOCAL_MEMORY_PIPELINE_HH__
diff --git a/src/gpu-compute/misc.hh b/src/gpu-compute/misc.hh
new file mode 100644
index 000000000..4f8032832
--- /dev/null
+++ b/src/gpu-compute/misc.hh
@@ -0,0 +1,162 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#ifndef __MISC_HH__
+#define __MISC_HH__
+
+#include <algorithm>
+#include <bitset>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+
+#include "base/misc.hh"
+
+class GPUDynInst;
+
+// wavefront size of the machine
+static const int VSZ = 64;
+
+/*
+  This check is necessary because std::bitset only provides conversion to
+  unsigned long or unsigned long long via to_ulong() or to_ullong(). There are
+  a few places in the code where to_ullong() is used, however if VSZ is larger
+  than a value the host can support then bitset will throw a runtime exception.
+
+  We should remove all use of to_ulong() or to_ullong() so we can have VSZ
+  greater than 64b, however until that is done this assert is required.
+ */
+static_assert(VSZ <= sizeof(unsigned long long) * 8,
+              "VSZ is larger than the host can support");
+
+// per-lane execution mask for one wavefront
+typedef std::bitset<VSZ> VectorMask;
+typedef std::shared_ptr<GPUDynInst> GPUDynInstPtr;
+
+/**
+ * Models a resource that becomes available at a future timestamp.
+ * *tcnt is the external "current time" counter; set() reserves the
+ * resource for i ticks, rdy() tests availability.
+ */
+class WaitClass
+{
+  public:
+    WaitClass() : nxtAvail(0), lookAheadAvail(0), tcnt(0) { }
+    void init(uint64_t *_tcnt, uint32_t _numStages=0)
+    {
+        tcnt = _tcnt;
+        numStages = _numStages;
+    }
+
+    // reserve the resource for i ticks from now; it is an error to
+    // reserve a resource that is still busy
+    void set(uint32_t i)
+    {
+        fatal_if(nxtAvail > *tcnt,
+                 "Can't allocate resource because it is busy!!!");
+        nxtAvail = *tcnt + i;
+    }
+    // record a pending future use seen numStages earlier in the pipeline
+    // NOTE(review): the unsigned expression (*tcnt) - numStages would
+    // wrap if *tcnt < numStages — presumably never happens in practice;
+    // confirm against callers
+    void preset(uint32_t delay)
+    {
+        lookAheadAvail = std::max(lookAheadAvail, delay + (*tcnt) - numStages);
+    }
+    bool rdy() const { return *tcnt >= nxtAvail; }
+    bool prerdy() const { return *tcnt >= lookAheadAvail; }
+
+  private:
+    // timestamp indicating when resource will be available
+    uint64_t nxtAvail;
+    // timestamp indicating when resource will be available including
+    // pending uses of the resource (when there is a cycle gap between
+    // rdy() and set()
+    uint64_t lookAheadAvail;
+    // current timestamp
+    uint64_t *tcnt;
+    // number of stages between checking if a resource is ready and
+    // setting the resource's utilization
+    uint32_t numStages;
+};
+
+/**
+ * Minimal IEEE-754 binary16 (half precision) value type. Conversions
+ * truncate the mantissa and flush denormal/too-small values to zero;
+ * they do not round.
+ */
+class Float16
+{
+  public:
+    uint16_t val;
+
+    Float16() { val = 0; }
+
+    Float16(const Float16 &x) : val(x.val) { }
+
+    Float16(float x)
+    {
+        // read the float's bit pattern via memcpy; the previous
+        // *(uint32_t *)&x cast violated strict aliasing (UB)
+        uint32_t ai;
+        std::memcpy(&ai, &x, sizeof(ai));
+
+        uint32_t s = (ai >> 31) & 0x1;
+        uint32_t exp = (ai >> 23) & 0xff;
+        uint32_t mant = (ai >> 0) & 0x7fffff;
+
+        if (exp == 0 || exp <= 0x70) {
+            // zero, denormal, or too small for half: flush to signed zero
+            exp = 0;
+            mant = 0;
+        } else if (exp == 0xff) {
+            // infinity / NaN
+            exp = 0x1f;
+        } else if (exp >= 0x8f) {
+            // too large for half: saturate to infinity
+            exp = 0x1f;
+            mant = 0;
+        } else {
+            // re-bias the exponent from float (127) to half (15)
+            exp = exp - 0x7f + 0x0f;
+        }
+
+        // truncate the mantissa from 23 to 10 bits (no rounding)
+        mant = mant >> 13;
+
+        val = 0;
+        val |= (s << 15);
+        val |= (exp << 10);
+        val |= (mant << 0);
+    }
+
+    operator float() const
+    {
+        uint32_t s = (val >> 15) & 0x1;
+        uint32_t exp = (val >> 10) & 0x1f;
+        uint32_t mant = (val >> 0) & 0x3ff;
+
+        if (!exp) {
+            // zero/denormal decodes as signed zero
+            exp = 0;
+            mant = 0;
+        } else {
+            if (exp == 0x1f) {
+                // infinity / NaN
+                exp = 0xff;
+            } else {
+                // re-bias the exponent from half (15) to float (127)
+                exp = exp - 0x0f + 0x7f;
+            }
+        }
+
+        uint32_t val1 = 0;
+        val1 |= (s << 31);
+        val1 |= (exp << 23);
+        val1 |= (mant << 13);
+
+        // write the bit pattern back via memcpy instead of the old
+        // *(float *)&val1 cast (strict-aliasing UB)
+        float ret;
+        std::memcpy(&ret, &val1, sizeof(ret));
+        return ret;
+    }
+};
+
+#endif // __MISC_HH__
diff --git a/src/gpu-compute/ndrange.hh b/src/gpu-compute/ndrange.hh
new file mode 100644
index 000000000..d1ad35d4b
--- /dev/null
+++ b/src/gpu-compute/ndrange.hh
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#ifndef __NDRANGE_HH__
+#define __NDRANGE_HH__
+
+#include "base/types.hh"
+#include "gpu-compute/qstruct.hh"
+
+/**
+ * Bookkeeping for one kernel dispatch: the queue entry as received,
+ * workgroup-launch progress, and completion/notification state.
+ */
+struct NDRange
+{
+    // copy of the queue entry provided at dispatch
+    HsaQueueEntry q;
+
+    // The current workgroup id (3 dimensions)
+    int wgId[3];
+    // The number of workgroups in each dimension
+    int numWg[3];
+    // The total number of workgroups
+    int numWgTotal;
+
+    // The number of completed work groups
+    int numWgCompleted;
+    // The global workgroup ID
+    uint32_t globalWgId;
+
+    // flag indicating whether all work groups have been launched
+    bool wg_disp_rem;
+    // kernel complete
+    bool execDone;
+    // NOTE(review): presumably set when the host requested a doorbell
+    // notification on completion — confirm against the dispatcher
+    bool userDoorBellSet;
+    // host location to flag when the kernel completes (may be null)
+    volatile bool *addrToNotify;
+    // host counter of dispatches still in flight
+    volatile uint32_t *numDispLeft;
+    int dispatchId;
+    int curTid; // Current thread id
+};
+
+#endif // __NDRANGE_HH__
diff --git a/src/gpu-compute/of_scheduling_policy.cc b/src/gpu-compute/of_scheduling_policy.cc
new file mode 100644
index 000000000..7f114706a
--- /dev/null
+++ b/src/gpu-compute/of_scheduling_policy.cc
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#include "gpu-compute/of_scheduling_policy.hh"
+
+#include "gpu-compute/wavefront.hh"
+
+/**
+ * Pick and remove the oldest schedulable wave, where age is given by
+ * the dynamic wave ID (lowest wfDynId wins; ties keep the first one
+ * encountered). Panics if the bound ready list is empty.
+ */
+Wavefront*
+OFSchedulingPolicy::chooseWave()
+{
+    // Check to make sure ready list has at least one schedulable wave
+    if (scheduleList->empty()) {
+        panic("Empty ready list");
+    }
+
+    // start from the first wave; track the position of the best so far.
+    // Comparing wfDynId values directly avoids the signed/unsigned
+    // mismatch of the old "-1 sentinel in an int" scheme, which only
+    // worked because -1 wrapped to UINT_MAX.
+    size_t selectedPosition = 0;
+    Wavefront *selectedWave = scheduleList->front();
+
+    for (size_t position = 1; position < scheduleList->size(); ++position) {
+        Wavefront *curWave = scheduleList->at(position);
+
+        // choose the wave with the lowest (oldest) dynamic wave ID
+        if (curWave->wfDynId < selectedWave->wfDynId) {
+            selectedWave = curWave;
+            selectedPosition = position;
+        }
+    }
+
+    scheduleList->erase(scheduleList->begin() + selectedPosition);
+
+    return selectedWave;
+}
+
+// Attach the list of schedulable waves. The policy does not own the
+// list; the caller must keep it alive while the policy is in use.
+void
+OFSchedulingPolicy::bindList(std::vector<Wavefront*> *list)
+{
+    scheduleList = list;
+}
diff --git a/src/gpu-compute/of_scheduling_policy.hh b/src/gpu-compute/of_scheduling_policy.hh
new file mode 100644
index 000000000..684e51a3a
--- /dev/null
+++ b/src/gpu-compute/of_scheduling_policy.hh
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#ifndef __OF_SCHEDULING_POLICY_HH__
+#define __OF_SCHEDULING_POLICY_HH__
+
+#include <cstddef>
+#include <vector>
+
+#include "base/misc.hh"
+
+class Wavefront;
+
+// Oldest First where age is marked by the wave id
+// Oldest First where age is marked by the wave id
+class OFSchedulingPolicy
+{
+  public:
+    OFSchedulingPolicy() : scheduleList(nullptr) { }
+
+    // pick and remove the oldest (lowest wfDynId) wave; panics when the
+    // bound list is empty
+    Wavefront* chooseWave();
+    // bind the (externally owned) candidate list; must be called before
+    // chooseWave()
+    void bindList(std::vector<Wavefront*> *list);
+
+  private:
+    // List of waves which are participating in scheduling.
+    // This scheduler selects the oldest wave from this list
+    std::vector<Wavefront*> *scheduleList;
+};
+
+#endif // __OF_SCHEDULING_POLICY_HH__
diff --git a/src/gpu-compute/pool_manager.cc b/src/gpu-compute/pool_manager.cc
new file mode 100644
index 000000000..b1bc6b1f3
--- /dev/null
+++ b/src/gpu-compute/pool_manager.cc
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos
+ */
+
+#include "gpu-compute/pool_manager.hh"
+
+// A pool manager tracks a pool of poolSize elements allocated in chunks
+// of at least minAlloc elements. An empty pool is meaningless, hence
+// the assert.
+PoolManager::PoolManager(uint32_t minAlloc, uint32_t poolSize)
+    : _minAllocation(minAlloc), _poolSize(poolSize)
+{
+    assert(poolSize > 0);
+}
diff --git a/src/gpu-compute/pool_manager.hh b/src/gpu-compute/pool_manager.hh
new file mode 100644
index 000000000..2cb53ce72
--- /dev/null
+++ b/src/gpu-compute/pool_manager.hh
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos
+ */
+
+#ifndef __POOL_MANAGER_HH__
+#define __POOL_MANAGER_HH__
+
+#include <cassert>
+#include <cstdint>
+#include <string>
+
+// Pool Manager Logic
+class PoolManager
+{
+ public:
+ PoolManager(uint32_t minAlloc, uint32_t poolSize);
+ uint32_t minAllocation() { return _minAllocation; }
+ virtual std::string printRegion() = 0;
+ virtual uint32_t regionSize(std::pair<uint32_t,uint32_t> &region) = 0;
+ virtual bool canAllocate(uint32_t numRegions, uint32_t size) = 0;
+
+ virtual uint32_t allocateRegion(const uint32_t size,
+ uint32_t *reserved) = 0;
+
+ virtual void freeRegion(uint32_t firstIdx, uint32_t lastIdx) = 0;
+ uint32_t poolSize() { return _poolSize; }
+
+ private:
+ // minimum size that can be reserved per allocation
+ uint32_t _minAllocation;
+ // pool size in number of elements
+ uint32_t _poolSize;
+};
+
+#endif // __POOL_MANAGER_HH__
diff --git a/src/gpu-compute/qstruct.hh b/src/gpu-compute/qstruct.hh
new file mode 100644
index 000000000..092303c00
--- /dev/null
+++ b/src/gpu-compute/qstruct.hh
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Brad Beckmann, Marc Orr
+ */
+
+#ifndef __Q_STRUCT_HH__
+#define __Q_STRUCT_HH__
+
+#include <bitset>
+#include <cstdint>
+
+// Maximum number of arguments
+static const int KER_NUM_ARGS = 32;
+// Kernel argument buffer size
+static const int KER_ARGS_LENGTH = 512;
+
+// forward declarations; full definitions live elsewhere in the
+// gpu-compute model
+class LdsChunk;
+struct NDRange;
+
+// Be very careful of alignment in this structure. The structure
+// must compile to the same layout in both 32-bit and 64-bit mode.
+struct HsaQueueEntry
+{
+    // Base pointer for array of instruction pointers
+    uint64_t code_ptr;
+    // Grid Size (3 dimensions)
+    uint32_t gdSize[3];
+    // Workgroup Size (3 dimensions)
+    uint32_t wgSize[3];
+    // per-kernel register counts
+    // NOTE(review): exact register-class meaning (s/d/c) is inferred
+    // from the names — confirm against the dispatcher code
+    uint16_t sRegCount;
+    uint16_t dRegCount;
+    uint16_t cRegCount;
+    // private (per-work-item) memory region: base, per-item size, total
+    uint64_t privMemStart;
+    uint32_t privMemPerItem;
+    uint32_t privMemTotal;
+    // spill memory region: base, per-item size, total
+    uint64_t spillMemStart;
+    uint32_t spillMemPerItem;
+    uint32_t spillMemTotal;
+    // read-only memory region: base and total size
+    uint64_t roMemStart;
+    uint32_t roMemTotal;
+    // Size (in bytes) of LDS
+    uint32_t ldsSize;
+    // Virtual Memory Id (unused right now)
+    uint32_t vmId;
+
+    // Pointer to dependency chain (unused now)
+    uint64_t depends;
+
+    // pointer to bool
+    uint64_t addrToNotify;
+    // pointer to uint32_t
+    uint64_t numDispLeft;
+
+    // variables to pass arguments when running in standalone mode,
+    // will be removed when run.py and sh.cpp have been updated to
+    // use args and offset arrays
+    uint64_t arg1;
+    uint64_t arg2;
+    uint64_t arg3;
+    uint64_t arg4;
+
+    // variables to pass arguments when running in cpu+gpu mode
+    uint8_t args[KER_ARGS_LENGTH];
+    uint16_t offsets[KER_NUM_ARGS];
+    uint16_t num_args;
+};
+
+// State used to start (or restart) a WF
+struct WFContext
+{
+    // 32 bit values
+    // barrier state
+    // NOTE(review): VSZ (the wavefront size) is defined elsewhere in
+    // the model; this header depends on that definition being visible
+    int bar_cnt[VSZ];
+
+    // id (which WF in the WG)
+    int cnt;
+
+    // more barrier state
+    int max_bar_cnt;
+    int old_barrier_cnt;
+    int barrier_cnt;
+
+    // More Program Counter Stuff
+    uint32_t pc;
+
+    // Program counter of the immediate post-dominator instruction
+    uint32_t rpc;
+
+    // WG wide state (I don't see how to avoid redundancy here)
+    int cu_id;
+    uint32_t wg_id;
+    uint32_t barrier_id;
+
+    // 64 bit values (these values depend on the wavefront size)
+    // masks
+    uint64_t init_mask;
+    uint64_t exec_mask;
+
+    // private memory;
+    // Addr is gem5's address type, defined elsewhere
+    Addr privBase;
+    Addr spillBase;
+
+    // non-owning pointer to this WF's LDS allocation
+    LdsChunk *ldsChunk;
+
+    /*
+     * Kernel wide state
+     * This is a hack. This state should be moved through simulated memory
+     * during a yield. Though not much is being used here, so it's probably
+     * probably not a big deal.
+     *
+     * Just to add to this comment... The ndr is derived from simulated
+     * memory when the cl-runtime allocates an HsaQueueEntry and populates it
+     * for a kernel launch. So in theory the runtime should be able to keep
+     * that state around. Then a WF can reference it upon restart to derive
+     * kernel wide state. The runtime can deallocate the state when the
+     * kernel completes.
+     */
+    NDRange *ndr;
+};
+
+// State that needs to be passed between the simulation and simulated app, a
+// pointer to this struct can be passed through the depends field in the
+// HsaQueueEntry struct
+struct HostState
+{
+    // cl_event* has original HsaQueueEntry for init
+    // stored as uint64_t so the struct layout is identical in 32-bit and
+    // 64-bit builds (it carries a host pointer value)
+    uint64_t event;
+};
+
+// Total number of HSA queues
+static const int HSAQ_NQUEUES = 8;
+
+// These values will eventually live in memory mapped registers
+// and be settable by the kernel mode driver.
+
+// Number of entries in each HSA queue
+static const int HSAQ_SIZE = 64;
+// Address of first HSA queue index
+// NOTE(review): the 'll' suffixes below are ineffective — the values are
+// stored in plain ints and all fit in 32 bits, so behavior is unchanged,
+// but the suffix suggests a wider type was once intended; confirm
+static const int HSAQ_INDX_BASE = 0x10000ll;
+// Address of first HSA queue
+static const int HSAQ_BASE = 0x11000ll;
+// Suggested start of HSA code
+static const int HSA_CODE_BASE = 0x18000ll;
+
+// These are shortcuts for deriving the address of a specific
+// HSA queue or queue index.
+// Macro parameters are parenthesized so expression arguments
+// (e.g. HSAQ(q + 1)) expand correctly (CERT PRE01-C).
+#define HSAQ(n) (HSAQ_BASE + HSAQ_SIZE * sizeof(struct fsaQueue) * (n))
+#define HSAQE(n,i) (HSAQ_BASE + (HSAQ_SIZE * (n) + (i)) * sizeof(struct fsaQueue))
+#define HSAQ_RI(n) (HSAQ_INDX_BASE + sizeof(int) * ((n) * 3 + 0))
+#define HSAQ_WI(n) (HSAQ_INDX_BASE + sizeof(int) * ((n) * 3 + 1))
+#define HSAQ_CI(n) (HSAQ_INDX_BASE + sizeof(int) * ((n) * 3 + 2))
+
+/*
+ * Example code for writing to a queue
+ *
+ * void
+ * ToQueue(int n,struct fsaQueue *val)
+ * {
+ * int wi = *(int*)HSAQ_WI(n);
+ * int ri = *(int*)HSAQ_RI(n);
+ * int ci = *(int*)HSAQ_CI(n);
+ *
+ * if (ci - ri < HSAQ_SIZE) {
+ * (*(int*)HSAQ_CI(n))++;
+ * *(HsaQueueEntry*)(HSAQE(n, (wi % HSAQ_SIZE))) = *val;
+ * (*(int*)HSAQ_WI(n))++;
+ * }
+ * }
+ */
+
+#endif // __Q_STRUCT_HH__
diff --git a/src/gpu-compute/rr_scheduling_policy.cc b/src/gpu-compute/rr_scheduling_policy.cc
new file mode 100644
index 000000000..5d3591901
--- /dev/null
+++ b/src/gpu-compute/rr_scheduling_policy.cc
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#include "gpu-compute/rr_scheduling_policy.hh"
+
+#include "gpu-compute/wavefront.hh"
+
+Wavefront*
+RRSchedulingPolicy::chooseWave()
+{
+    Wavefront *selectedWave = nullptr;
+
+    // Check to make sure ready list had at least one schedulable wave
+    if (scheduleList->size()) {
+        // For RR policy, select the wave which is at the
+        // front of the list. The selected wave is popped
+        // out from the schedule list immediately after selection
+        // to avoid starvation. It is the responsibility of the
+        // module invoking the RR scheduler to make sure scheduling
+        // eligible waves are added to the back of the schedule
+        // list
+        selectedWave = scheduleList->front();
+        // erase the front element (redundant "+ 0" offset dropped)
+        // NOTE(review): front-erase on a std::vector is O(n); a deque
+        // would make this O(1) if the ready list ever grows large
+        scheduleList->erase(scheduleList->begin());
+    } else {
+        panic("Empty ready list");
+    }
+
+    return selectedWave;
+}
+
+void
+RRSchedulingPolicy::bindList(std::vector<Wavefront*> *list)
+{
+    // remember (not copy) the caller-owned ready list this policy
+    // schedules from; the caller retains ownership of the vector
+    scheduleList = list;
+}
diff --git a/src/gpu-compute/rr_scheduling_policy.hh b/src/gpu-compute/rr_scheduling_policy.hh
new file mode 100644
index 000000000..780f294aa
--- /dev/null
+++ b/src/gpu-compute/rr_scheduling_policy.hh
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#ifndef __RR_SCHEDULING_POLICY_HH__
+#define __RR_SCHEDULING_POLICY_HH__
+
+#include <inttypes.h>
+
+#include <cstddef>
+#include <utility>
+#include <vector>
+
+#include "base/misc.hh"
+
+class Wavefront;
+
+// Round-Robin pick among the list of ready waves
+class RRSchedulingPolicy
+{
+  public:
+    RRSchedulingPolicy() : scheduleList(nullptr) { }
+
+    // pop and return the wave at the front of the bound list;
+    // panics if the list is empty
+    Wavefront* chooseWave();
+    // attach the caller-owned ready list to schedule from
+    void bindList(std::vector<Wavefront*> *list);
+
+  private:
+    // List of waves which are participating in scheduling.
+    // This scheduler selects one wave from this list based on
+    // round robin policy
+    std::vector<Wavefront*> *scheduleList;
+};
+
+#endif // __RR_SCHEDULING_POLICY_HH__
diff --git a/src/gpu-compute/schedule_stage.cc b/src/gpu-compute/schedule_stage.cc
new file mode 100644
index 000000000..068136026
--- /dev/null
+++ b/src/gpu-compute/schedule_stage.cc
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#include "gpu-compute/schedule_stage.hh"
+
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/gpu_static_inst.hh"
+#include "gpu-compute/vector_register_file.hh"
+#include "gpu-compute/wavefront.hh"
+
+ScheduleStage::ScheduleStage(const ComputeUnitParams *p)
+    : numSIMDs(p->num_SIMDs),
+      numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes)
+{
+    // one scheduler per execution resource (SIMD units + memory pipes)
+    for (int j = 0; j < numSIMDs + numMemUnits; ++j) {
+        Scheduler newScheduler(p);
+        scheduler.push_back(newScheduler);
+    }
+}
+
+ScheduleStage::~ScheduleStage()
+{
+    // the vectors hold values / non-owning pointers into the parent CU,
+    // so clearing them releases everything this stage owns
+    scheduler.clear();
+    waveStatusList.clear();
+}
+
+void
+ScheduleStage::init(ComputeUnit *cu)
+{
+    // bind this stage to its parent compute unit; called once the CU's
+    // per-resource lists exist
+    computeUnit = cu;
+    _name = computeUnit->name() + ".ScheduleStage";
+
+    // each per-resource scheduler draws from the CU's ready list
+    for (int j = 0; j < numSIMDs + numMemUnits; ++j) {
+        scheduler[j].bindList(&computeUnit->readyList[j]);
+    }
+
+    // wave status is tracked per SIMD unit only
+    for (int j = 0; j < numSIMDs; ++j) {
+        waveStatusList.push_back(&computeUnit->waveStatusList[j]);
+    }
+
+    dispatchList = &computeUnit->dispatchList;
+}
+
+void
+ScheduleStage::arbitrate()
+{
+    // Resolve VRF read-port conflicts between a wave picked for a memory
+    // pipe and a wave picked for any other resource on the same SIMD:
+    // the memory instruction wins and the other wave is dropped this cycle.
+    // iterate over all Memory pipelines
+    for (int j = numSIMDs; j < numSIMDs + numMemUnits; ++j) {
+        if (dispatchList->at(j).first) {
+            Wavefront *waveToMemPipe = dispatchList->at(j).first;
+            // iterate over all execution pipelines
+            for (int i = 0; i < numSIMDs + numMemUnits; ++i) {
+                if ((i != j) && (dispatchList->at(i).first)) {
+                    Wavefront *waveToExePipe = dispatchList->at(i).first;
+                    // if the two selected wavefronts are mapped to the same
+                    // SIMD unit then they share the VRF
+                    if (waveToMemPipe->simdId == waveToExePipe->simdId) {
+                        int simdId = waveToMemPipe->simdId;
+                        // Read VRF port arbitration:
+                        // If there are read VRF port conflicts between the
+                        // a memory and another instruction we drop the other
+                        // instruction. We don't need to check for write VRF
+                        // port conflicts because the memory instruction either
+                        // does not need to write to the VRF (store) or will
+                        // write to the VRF when the data comes back (load) in
+                        // which case the arbiter of the memory pipes will
+                        // resolve any conflicts
+                        if (computeUnit->vrf[simdId]->
+                                isReadConflict(waveToMemPipe->wfSlotId,
+                                waveToExePipe->wfSlotId)) {
+                            // FIXME: The "second" member variable is never
+                            // used in the model. I am setting it to READY
+                            // simply to follow the protocol of setting it
+                            // when the WF has an instruction ready to issue
+                            waveStatusList[simdId]->at(waveToExePipe->wfSlotId)
+                                            .second = READY;
+
+                            dispatchList->at(i).first = nullptr;
+                            dispatchList->at(i).second = EMPTY;
+                            break;
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+void
+ScheduleStage::exec()
+{
+    // Per-cycle entry point: pick one ready wave for each execution
+    // resource, mark it dispatched, then arbitrate shared-resource
+    // (VRF read-port) conflicts among the picks.
+    for (int j = 0; j < numSIMDs + numMemUnits; ++j) {
+        uint32_t readyListSize = computeUnit->readyList[j].size();
+
+        // If no wave is ready to be scheduled on the execution resource
+        // then skip scheduling for this execution resource
+        if (!readyListSize) {
+            continue;
+        }
+
+        // the scheduler pops the chosen wave off the ready list
+        Wavefront *waveToBeDispatched = scheduler[j].chooseWave();
+        dispatchList->at(j).first = waveToBeDispatched;
+        waveToBeDispatched->updateResources();
+        dispatchList->at(j).second = FILLED;
+
+        // a dispatched wave is blocked from further scheduling this cycle
+        waveStatusList[waveToBeDispatched->simdId]->at(
+                waveToBeDispatched->wfSlotId).second = BLOCKED;
+
+        assert(computeUnit->readyList[j].size() == readyListSize - 1);
+    }
+    // arbitrate over all shared resources among instructions being issued
+    // simultaneously
+    arbitrate();
+}
+
+void
+ScheduleStage::regStats()
+{
+    // no statistics registered for this stage yet
+}
diff --git a/src/gpu-compute/schedule_stage.hh b/src/gpu-compute/schedule_stage.hh
new file mode 100644
index 000000000..26eb9a25b
--- /dev/null
+++ b/src/gpu-compute/schedule_stage.hh
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#ifndef __SCHEDULE_STAGE_HH__
+#define __SCHEDULE_STAGE_HH__
+
+#include <utility>
+#include <vector>
+
+#include "gpu-compute/exec_stage.hh"
+#include "gpu-compute/scheduler.hh"
+#include "gpu-compute/scoreboard_check_stage.hh"
+
+// Schedule or execution arbitration stage.
+// From the pool of ready waves in the ready list,
+// one wave is selected for each execution resource.
+// The selection is made based on a scheduling policy
+
+class ComputeUnit;
+class Wavefront;
+
+struct ComputeUnitParams;
+
+class ScheduleStage
+{
+  public:
+    ScheduleStage(const ComputeUnitParams *params);
+    ~ScheduleStage();
+    // bind to the parent CU's ready/status/dispatch lists
+    void init(ComputeUnit *cu);
+    // per-cycle entry point: pick waves, then arbitrate conflicts
+    void exec();
+    // resolve VRF read-port conflicts among simultaneously issued waves
+    void arbitrate();
+    // Stats related variables and methods
+    std::string name() { return _name; }
+    void regStats();
+
+  private:
+    ComputeUnit *computeUnit;
+    // number of vector SIMD execution units
+    uint32_t numSIMDs;
+    // number of memory pipelines (global + shared)
+    uint32_t numMemUnits;
+
+    // Each execution resource will have its own
+    // scheduler and a dispatch list
+    std::vector<Scheduler> scheduler;
+
+    // Stores the status of waves. A READY implies the
+    // wave is ready to be scheduled this cycle and
+    // is already present in the readyList
+    std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>*>
+        waveStatusList;
+
+    // List of waves which will be dispatched to
+    // each execution resource. A FILLED implies
+    // dispatch list is non-empty and
+    // execution unit has something to execute
+    // this cycle. Currently, the dispatch list of
+    // an execution resource can hold only one wave because
+    // an execution resource can execute only one wave in a cycle.
+    std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> *dispatchList;
+
+    std::string _name;
+};
+
+#endif // __SCHEDULE_STAGE_HH__
diff --git a/src/gpu-compute/scheduler.cc b/src/gpu-compute/scheduler.cc
new file mode 100644
index 000000000..1cd0bfe55
--- /dev/null
+++ b/src/gpu-compute/scheduler.cc
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#include "gpu-compute/scheduler.hh"
+
+Scheduler::Scheduler(const ComputeUnitParams *p)
+{
+    // translate the configured policy string into the internal enum;
+    // an unknown policy is a fatal configuration error
+    if (p->execPolicy == "OLDEST-FIRST") {
+        schedPolicy = SCHED_POLICY::OF_POLICY;
+    } else if (p->execPolicy == "ROUND-ROBIN") {
+        schedPolicy = SCHED_POLICY::RR_POLICY;
+    } else {
+        fatal("Unimplemented scheduling policy");
+    }
+}
+
+Wavefront*
+Scheduler::chooseWave()
+{
+    // forward to the policy selected at construction time
+    if (schedPolicy == SCHED_POLICY::OF_POLICY) {
+        return OFSchedPolicy.chooseWave();
+    } else if (schedPolicy == SCHED_POLICY::RR_POLICY) {
+        return RRSchedPolicy.chooseWave();
+    } else {
+        fatal("Unimplemented scheduling policy");
+    }
+}
+
+void
+Scheduler::bindList(std::vector<Wavefront*> *list)
+{
+    // attach the ready list to whichever policy object is active
+    if (schedPolicy == SCHED_POLICY::OF_POLICY) {
+        OFSchedPolicy.bindList(list);
+    } else if (schedPolicy == SCHED_POLICY::RR_POLICY) {
+        RRSchedPolicy.bindList(list);
+    } else {
+        fatal("Unimplemented scheduling policy");
+    }
+}
diff --git a/src/gpu-compute/scheduler.hh b/src/gpu-compute/scheduler.hh
new file mode 100644
index 000000000..148ec9425
--- /dev/null
+++ b/src/gpu-compute/scheduler.hh
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#ifndef __SCHEDULER_HH__
+#define __SCHEDULER_HH__
+
+#include "gpu-compute/of_scheduling_policy.hh"
+#include "gpu-compute/rr_scheduling_policy.hh"
+#include "gpu-compute/scheduling_policy.hh"
+#include "params/ComputeUnit.hh"
+
+// Selects which scheduling policy implementation a Scheduler delegates to
+enum SCHED_POLICY
+{
+    OF_POLICY = 0,
+    RR_POLICY
+};
+
+// Facade that forwards scheduling requests to the policy chosen at
+// configuration time (oldest-first or round-robin)
+class Scheduler
+{
+  public:
+    Scheduler(const ComputeUnitParams *params);
+    // pick the next wave according to the configured policy
+    Wavefront *chooseWave();
+    // attach the ready list the underlying policy schedules from
+    void bindList(std::vector<Wavefront*> *list);
+
+  private:
+    SCHED_POLICY schedPolicy;
+    // both policy objects are kept; only the one matching schedPolicy
+    // is ever exercised
+    SchedulingPolicy<RRSchedulingPolicy> RRSchedPolicy;
+    SchedulingPolicy<OFSchedulingPolicy> OFSchedPolicy;
+};
+
+#endif // __SCHEDULER_HH__
diff --git a/src/gpu-compute/scheduling_policy.hh b/src/gpu-compute/scheduling_policy.hh
new file mode 100644
index 000000000..b5e923c62
--- /dev/null
+++ b/src/gpu-compute/scheduling_policy.hh
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#ifndef __SCHEDULING_POLICY_HH__
+#define __SCHEDULING_POLICY_HH__
+
+#include <vector>
+
+// forward declaration: Wavefront is used only as a pointer here; this
+// header previously relied on includers declaring it first
+class Wavefront;
+
+// Static-polymorphism wrapper: dispatches to the concrete policy type
+// (e.g. RRSchedulingPolicy) at compile time, avoiding virtual calls
+template<typename Impl>
+class SchedulingPolicy
+{
+  public:
+    // select the next wave from the bound list, per the policy
+    Wavefront* chooseWave() { return policyImpl.chooseWave(); }
+
+    // attach the caller-owned ready list to the policy implementation
+    void
+    bindList(std::vector<Wavefront*> *list)
+    {
+        return policyImpl.bindList(list);
+    }
+
+  private:
+    Impl policyImpl;
+};
+
+#endif // __SCHEDULING_POLICY_HH__
diff --git a/src/gpu-compute/scoreboard_check_stage.cc b/src/gpu-compute/scoreboard_check_stage.cc
new file mode 100644
index 000000000..0d856a9b0
--- /dev/null
+++ b/src/gpu-compute/scoreboard_check_stage.cc
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#include "gpu-compute/scoreboard_check_stage.hh"
+
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/gpu_static_inst.hh"
+#include "gpu-compute/shader.hh"
+#include "gpu-compute/wavefront.hh"
+#include "params/ComputeUnit.hh"
+
+// pointer members are bound later in init(), once the parent CU exists
+ScoreboardCheckStage::ScoreboardCheckStage(const ComputeUnitParams *p)
+    : numSIMDs(p->num_SIMDs),
+      numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes),
+      numGlbMemPipes(p->num_global_mem_pipes),
+      numShrMemPipes(p->num_shared_mem_pipes),
+      vectorAluInstAvail(nullptr),
+      lastGlbMemSimd(-1),
+      lastShrMemSimd(-1), glbMemInstAvail(nullptr),
+      shrMemInstAvail(nullptr)
+{
+}
+
+ScoreboardCheckStage::~ScoreboardCheckStage()
+{
+    readyList.clear();
+    waveStatusList.clear();
+    // these point into the parent ComputeUnit, which owns the storage;
+    // only the pointers are reset here
+    shrMemInstAvail = nullptr;
+    glbMemInstAvail = nullptr;
+}
+
+void
+ScoreboardCheckStage::init(ComputeUnit *cu)
+{
+    // cache pointers into the parent CU's per-resource bookkeeping
+    computeUnit = cu;
+    _name = computeUnit->name() + ".ScoreboardCheckStage";
+
+    // one ready list per execution resource (SIMDs + memory pipes)
+    for (int unitId = 0; unitId < numSIMDs + numMemUnits; ++unitId) {
+        readyList.push_back(&computeUnit->readyList[unitId]);
+    }
+
+    // wave status is tracked per SIMD unit only
+    for (int unitId = 0; unitId < numSIMDs; ++unitId) {
+        waveStatusList.push_back(&computeUnit->waveStatusList[unitId]);
+    }
+
+    vectorAluInstAvail = &computeUnit->vectorAluInstAvail;
+    glbMemInstAvail= &computeUnit->glbMemInstAvail;
+    shrMemInstAvail= &computeUnit->shrMemInstAvail;
+}
+
+void
+ScoreboardCheckStage::initStatistics()
+{
+    // reset the per-cycle availability trackers before the scoreboard scan
+    lastGlbMemSimd = -1;
+    lastShrMemSimd = -1;
+    *glbMemInstAvail = 0;
+    *shrMemInstAvail = 0;
+
+    // braces added around the single-statement loop body (guards against
+    // the classic unbraced-body maintenance bug)
+    for (int unitId = 0; unitId < numSIMDs; ++unitId) {
+        vectorAluInstAvail->at(unitId) = false;
+    }
+}
+
+void
+ScoreboardCheckStage::collectStatistics(Wavefront *curWave, int unitId)
+{
+    // record, for this wave's SIMD unit, whether its oldest buffered
+    // instruction could feed the vector ALU or the memory pipes
+    if (curWave->instructionBuffer.empty())
+        return;
+
+    // track which vector SIMD unit has at least one WV with a vector
+    // ALU as the oldest instruction in its Instruction buffer
+    vectorAluInstAvail->at(unitId) = vectorAluInstAvail->at(unitId) ||
+                                     curWave->isOldestInstALU();
+
+    // track how many vector SIMD units have at least one WV with a
+    // vector Global memory instruction as the oldest instruction
+    // in its Instruction buffer
+    if ((curWave->isOldestInstGMem() || curWave->isOldestInstPrivMem() ||
+         curWave->isOldestInstFlatMem()) && lastGlbMemSimd != unitId &&
+        *glbMemInstAvail <= 1) {
+        (*glbMemInstAvail)++;
+        lastGlbMemSimd = unitId;
+    }
+
+    // track how many vector SIMD units have at least one WV with a
+    // vector shared memory (LDS) instruction as the oldest instruction
+    // in its Instruction buffer
+    // TODO: parametrize the limit of the LDS units
+    if (curWave->isOldestInstLMem() && (*shrMemInstAvail <= numShrMemPipes) &&
+        lastShrMemSimd != unitId) {
+        (*shrMemInstAvail)++;
+        lastShrMemSimd = unitId;
+    }
+}
+
+void
+ScoreboardCheckStage::exec()
+{
+    // Per-cycle scan: rebuild the per-resource ready lists by checking
+    // each wave's readiness, routing it to the matching execution resource.
+    initStatistics();
+
+    // reset the ready list for all execution units; it will be
+    // constructed every cycle since resource availability may change
+    for (int unitId = 0; unitId < numSIMDs + numMemUnits; ++unitId) {
+        readyList[unitId]->clear();
+    }
+
+    // iterate over the Wavefronts of all SIMD units
+    for (int unitId = 0; unitId < numSIMDs; ++unitId) {
+        for (int wvId = 0; wvId < computeUnit->shader->n_wf; ++wvId) {
+            // reset the ready status of each wavefront
+            waveStatusList[unitId]->at(wvId).second = BLOCKED;
+            Wavefront *curWave = waveStatusList[unitId]->at(wvId).first;
+            collectStatistics(curWave, unitId);
+
+            // global, flat, and private memory ops all route to the
+            // global memory pipe; shared (LDS) ops to the LDS pipe
+            if (curWave->ready(Wavefront::I_ALU)) {
+                readyList[unitId]->push_back(curWave);
+                waveStatusList[unitId]->at(wvId).second = READY;
+            } else if (curWave->ready(Wavefront::I_GLOBAL)) {
+                if (computeUnit->cedeSIMD(unitId, wvId)) {
+                    continue;
+                }
+
+                readyList[computeUnit->GlbMemUnitId()]->push_back(curWave);
+                waveStatusList[unitId]->at(wvId).second = READY;
+            } else if (curWave->ready(Wavefront::I_SHARED)) {
+                readyList[computeUnit->ShrMemUnitId()]->push_back(curWave);
+                waveStatusList[unitId]->at(wvId).second = READY;
+            } else if (curWave->ready(Wavefront::I_FLAT)) {
+                readyList[computeUnit->GlbMemUnitId()]->push_back(curWave);
+                waveStatusList[unitId]->at(wvId).second = READY;
+            } else if (curWave->ready(Wavefront::I_PRIVATE)) {
+                readyList[computeUnit->GlbMemUnitId()]->push_back(curWave);
+                waveStatusList[unitId]->at(wvId).second = READY;
+            }
+        }
+    }
+}
+
+void
+ScoreboardCheckStage::regStats()
+{
+    // no statistics registered for this stage yet
+}
diff --git a/src/gpu-compute/scoreboard_check_stage.hh b/src/gpu-compute/scoreboard_check_stage.hh
new file mode 100644
index 000000000..099597afb
--- /dev/null
+++ b/src/gpu-compute/scoreboard_check_stage.hh
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#ifndef __SCOREBOARD_CHECK_STAGE_HH__
+#define __SCOREBOARD_CHECK_STAGE_HH__
+
+#include <cstdint>
+#include <string>
+#include <utility>
+#include <vector>
+
+class ComputeUnit;
+class Wavefront;
+
+struct ComputeUnitParams;
+
+// Per-cycle readiness of a wavefront as computed by the scoreboard
+// check stage; READY waves have been placed on a readyList
+enum WAVE_STATUS
+{
+    BLOCKED = 0,
+    READY
+};
+
+/*
+ * Scoreboard check stage.
+ * All wavefronts are analyzed to see if they are ready
+ * to be executed this cycle. Both structural and data
+ * hazards are considered while marking a wave "ready"
+ * for execution. After analysis, the ready waves are
+ * added to readyList.
+ */
+class ScoreboardCheckStage
+{
+  public:
+    ScoreboardCheckStage(const ComputeUnitParams* params);
+    ~ScoreboardCheckStage();
+    // hook the stage up to its parent compute unit
+    void init(ComputeUnit *cu);
+    // evaluate all wavefronts once per cycle and rebuild readyList
+    void exec();
+
+    // Stats related variables and methods
+    const std::string& name() const { return _name; }
+    void regStats();
+
+  private:
+    // fold one wavefront's state into the per-cycle availability counts
+    void collectStatistics(Wavefront *curWave, int unitId);
+    // reset the per-cycle availability counts
+    void initStatistics();
+    ComputeUnit *computeUnit;
+    // number of vector SIMD units in the CU
+    uint32_t numSIMDs;
+    // number of memory execution resources (exec() scans
+    // numSIMDs + numMemUnits ready lists in total)
+    uint32_t numMemUnits;
+    uint32_t numGlbMemPipes;
+    uint32_t numShrMemPipes;
+
+    // flag per vector SIMD unit that is set when there is at least one
+    // WF that has a vector ALU instruction as the oldest in its
+    // Instruction Buffer
+    std::vector<bool> *vectorAluInstAvail;
+    // last SIMD unit whose wave bumped the global/shared memory
+    // availability counters; appears to spread the counts across SIMDs
+    // (see collectStatistics) -- TODO confirm intent
+    int lastGlbMemSimd;
+    int lastShrMemSimd;
+
+    int *glbMemInstAvail;
+    int *shrMemInstAvail;
+    // List of waves which are ready to be scheduled.
+    // Each execution resource has a ready list
+    std::vector<std::vector<Wavefront*>*> readyList;
+
+    // Stores the status of waves. A READY implies the
+    // wave is ready to be scheduled this cycle and
+    // is already present in the readyList
+    std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>*>
+        waveStatusList;
+
+    std::string _name;
+};
+
+#endif // __SCOREBOARD_CHECK_STAGE_HH__
diff --git a/src/gpu-compute/shader.cc b/src/gpu-compute/shader.cc
new file mode 100644
index 000000000..e8d7946ff
--- /dev/null
+++ b/src/gpu-compute/shader.cc
@@ -0,0 +1,412 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#include "gpu-compute/shader.hh"
+
+#include <limits>
+
+#include "arch/x86/linux/linux.hh"
+#include "base/chunk_generator.hh"
+#include "debug/GPUDisp.hh"
+#include "debug/GPUMem.hh"
+#include "debug/HSAIL.hh"
+#include "gpu-compute/dispatcher.hh"
+#include "gpu-compute/gpu_static_inst.hh"
+#include "gpu-compute/qstruct.hh"
+#include "gpu-compute/wavefront.hh"
+#include "mem/packet.hh"
+#include "mem/ruby/system/RubySystem.hh"
+#include "sim/sim_exit.hh"
+
+// Construct the shader from its python-side parameters; the compute
+// units themselves are built by the config system and handed in via
+// p->CUs
+Shader::Shader(const Params *p) : SimObject(p),
+    clock(p->clk_domain->clockPeriod()), cpuThread(nullptr), gpuTc(nullptr),
+    cpuPointer(p->cpu_pointer), tickEvent(this), timingSim(p->timing),
+    hsail_mode(SIMT), impl_kern_boundary_sync(p->impl_kern_boundary_sync),
+    separate_acquire_release(p->separate_acquire_release), coissue_return(1),
+    trace_vgpr_all(1), n_cu((p->CUs).size()), n_wf(p->n_wf),
+    globalMemSize(p->globalmem), nextSchedCu(0), sa_n(0), tick_cnt(0),
+    box_tick_cnt(0), start_tick_cnt(0)
+{
+    // adopt the CUs: each must already carry its list index as cu_id,
+    // and each gets a back-pointer to this shader
+    cuList.resize(n_cu);
+
+    for (int i = 0; i < n_cu; ++i) {
+        cuList[i] = p->CUs[i];
+        assert(i == cuList[i]->cu_id);
+        cuList[i]->shader = this;
+    }
+}
+
+// Carve a page-aligned region of "length" bytes out of the host
+// process's mmap space and back it with memory; returns the region's
+// start address
+Addr
+Shader::mmap(int length)
+{
+    Addr start;
+
+    // round up length to the next page
+    length = roundUp(length, TheISA::PageBytes);
+
+    // grow the region in the same direction the host OS grows mmap
+    if (X86Linux64::mmapGrowsDown()) {
+        DPRINTF(HSAIL, "GROWS DOWN");
+        start = gpuTc->getProcessPtr()->mmap_end - length;
+        gpuTc->getProcessPtr()->mmap_end = start;
+    } else {
+        DPRINTF(HSAIL, "GROWS UP");
+        start = gpuTc->getProcessPtr()->mmap_end;
+        gpuTc->getProcessPtr()->mmap_end += length;
+
+        // assertion to make sure we don't overwrite the stack (it grows down)
+        assert(gpuTc->getProcessPtr()->mmap_end <
+               gpuTc->getProcessPtr()->stack_base -
+               gpuTc->getProcessPtr()->max_stack_size);
+    }
+
+    DPRINTF(HSAIL,"Shader::mmap start= %#x, %#x\n", start, length);
+
+    gpuTc->getProcessPtr()->allocateMem(start,length);
+
+    return start;
+}
+
+void
+Shader::init()
+{
+    // grab the threadContext of the thread running on the CPU; the
+    // shader is driven by the CPU it is attached to
+    assert(cpuPointer);
+    gpuTc = cpuPointer->getContext(0);
+    assert(gpuTc);
+}
+
+Shader::~Shader()
+{
+    // the shader owns its compute units; release them on teardown
+    for (auto &cu : cuList) {
+        delete cu;
+    }
+}
+
+// Re-bind gpuTc to the context of thread "tid" on the host CPU, so the
+// shader tracks whichever thread dispatched the current work
+void
+Shader::updateThreadContext(int tid) {
+    // thread context of the thread which dispatched work
+    assert(cpuPointer);
+    gpuTc = cpuPointer->getContext(tid);
+    assert(gpuTc);
+}
+
+// Wake the host CPU's thread context if it was suspended waiting on
+// the GPU; only the CPU this shader is bound to may be woken
+void
+Shader::hostWakeUp(BaseCPU *cpu) {
+    if (cpuPointer == cpu) {
+        if (gpuTc->status() == ThreadContext::Suspended)
+            cpu->activateContext(gpuTc->threadId());
+    } else {
+        //Make sure both dispatcher and shader are trying to
+        //wakeup same host. Hack here to enable kernel launch
+        //from multiple CPUs
+        panic("Dispatcher wants to wakeup a different host");
+    }
+}
+
+// Factory hook called by the python config system to build the Shader
+Shader*
+ShaderParams::create()
+{
+    return new Shader(this);
+}
+
+// Advance the shader one cycle: fire any due scheduled adds, then
+// clock every compute unit
+void
+Shader::exec()
+{
+    tick_cnt = curTick();
+    box_tick_cnt = curTick() - start_tick_cnt;
+
+    // apply any scheduled adds whose time has arrived. all four
+    // parallel vectors (val/when/x and the count sa_n) are kept in
+    // lockstep; erasing mid-loop is compensated by decrementing both
+    // the count and the index so no entry is skipped.
+    for (int i = 0; i < sa_n; ++i) {
+        if (sa_when[i] <= tick_cnt) {
+            *sa_val[i] += sa_x[i];
+            sa_val.erase(sa_val.begin() + i);
+            sa_x.erase(sa_x.begin() + i);
+            sa_when.erase(sa_when.begin() + i);
+            --sa_n;
+            --i;
+        }
+    }
+
+    // clock all of the cu's
+    for (int i = 0; i < n_cu; ++i)
+        cuList[i]->exec();
+}
+
+// Round-robin dispatch of the NDRange's remaining workgroups onto the
+// CUs, starting where the previous call left off. Returns true if at
+// least one workgroup was placed this call.
+bool
+Shader::dispatch_workgroups(NDRange *ndr)
+{
+    bool scheduledSomething = false;
+    int cuCount = 0;
+    int curCu = nextSchedCu;
+
+    while (cuCount < n_cu) {
+        //Every time we try a CU, update nextSchedCu
+        nextSchedCu = (nextSchedCu + 1) % n_cu;
+
+        // dispatch workgroup iff the following two conditions are met:
+        // (a) wg_rem is true - there are unassigned workgroups in the grid
+        // (b) there are enough free slots in cu cuList[i] for this wg
+        if (ndr->wg_disp_rem && cuList[curCu]->ReadyWorkgroup(ndr)) {
+            scheduledSomething = true;
+            DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d\n", curCu);
+
+            // ticks() member function translates cycles to simulation ticks.
+            if (!tickEvent.scheduled()) {
+                schedule(tickEvent, curTick() + this->ticks(1));
+            }
+
+            cuList[curCu]->StartWorkgroup(ndr);
+            // advance the 3D workgroup id with carry: x overflows into
+            // y, y into z; once z overflows the whole grid has been
+            // dispatched and wg_disp_rem is cleared
+            ndr->wgId[0]++;
+            ndr->globalWgId++;
+            if (ndr->wgId[0] * ndr->q.wgSize[0] >= ndr->q.gdSize[0]) {
+                ndr->wgId[0] = 0;
+                ndr->wgId[1]++;
+
+                if (ndr->wgId[1] * ndr->q.wgSize[1] >= ndr->q.gdSize[1]) {
+                    ndr->wgId[1] = 0;
+                    ndr->wgId[2]++;
+
+                    if (ndr->wgId[2] * ndr->q.wgSize[2] >= ndr->q.gdSize[2]) {
+                        ndr->wg_disp_rem = false;
+                        break;
+                    }
+                }
+            }
+        }
+
+        ++cuCount;
+        curCu = nextSchedCu;
+    }
+
+    return scheduledSomething;
+}
+
+// Record the dispatcher the shader is paired with (used for TLB access
+// routing and wakeups)
+void
+Shader::handshake(GpuDispatcher *_dispatcher)
+{
+    dispatcher = _dispatcher;
+}
+
+// Perform one functional (non-timing) memory access described by req.
+// The access is translated through the GPU TLB and sent through CU 0's
+// memory port; accesses that straddle a cache-block boundary are split
+// at the boundary and issued as two packets.
+void
+Shader::doFunctionalAccess(RequestPtr req, MemCmd cmd, void *data,
+                           bool suppress_func_errors, int cu_id)
+{
+    unsigned block_size = RubySystem::getBlockSizeBytes();
+    unsigned size = req->getSize();
+
+    Addr tmp_addr;
+    BaseTLB::Mode trans_mode;
+
+    if (cmd == MemCmd::ReadReq) {
+        trans_mode = BaseTLB::Read;
+    } else if (cmd == MemCmd::WriteReq) {
+        trans_mode = BaseTLB::Write;
+    } else {
+        fatal("unexpected MemCmd\n");
+    }
+
+    tmp_addr = req->getVaddr();
+    // address of the block boundary the access would cross, if any
+    Addr split_addr = roundDown(tmp_addr + size - 1, block_size);
+
+    assert(split_addr <= tmp_addr || split_addr - tmp_addr < block_size);
+
+    // Misaligned access that crosses a block boundary
+    if (split_addr > tmp_addr) {
+        RequestPtr req1, req2;
+        req->splitOnVaddr(split_addr, req1, req2);
+
+        // req1 covers the lower half [vaddr, split_addr), req2 the
+        // upper half [split_addr, vaddr + size). Pair each request
+        // with its own packet IN THAT ORDER so the halves of the
+        // caller's data buffer line up with address order (the
+        // original code swapped req1/req2 here, exchanging the two
+        // halves of the buffer on every split access).
+        PacketPtr pkt1 = new Packet(req1, cmd);
+        PacketPtr pkt2 = new Packet(req2, cmd);
+
+        functionalTLBAccess(pkt1, cu_id, trans_mode);
+        functionalTLBAccess(pkt2, cu_id, trans_mode);
+
+        PacketPtr new_pkt1 = new Packet(pkt1->req, cmd);
+        PacketPtr new_pkt2 = new Packet(pkt2->req, cmd);
+
+        // lower half starts at the caller's buffer, upper half starts
+        // req1->getSize() bytes in
+        new_pkt1->dataStatic(data);
+        new_pkt2->dataStatic((uint8_t*)data + req1->getSize());
+
+        if (suppress_func_errors) {
+            new_pkt1->setSuppressFuncError();
+            new_pkt2->setSuppressFuncError();
+        }
+
+        // fixme: this should be cuList[cu_id] if cu_id != n_cu
+        // The latter requires a memPort in the dispatcher
+        cuList[0]->memPort[0]->sendFunctional(new_pkt1);
+        cuList[0]->memPort[0]->sendFunctional(new_pkt2);
+
+        delete new_pkt1;
+        delete new_pkt2;
+        delete pkt1;
+        delete pkt2;
+    } else {
+        PacketPtr pkt = new Packet(req, cmd);
+        functionalTLBAccess(pkt, cu_id, trans_mode);
+        PacketPtr new_pkt = new Packet(pkt->req, cmd);
+        new_pkt->dataStatic(data);
+
+        if (suppress_func_errors) {
+            new_pkt->setSuppressFuncError();
+        }
+
+        // fixme: this should be cuList[cu_id] if cu_id != n_cu
+        // The latter requires a memPort in the dispatcher
+        cuList[0]->memPort[0]->sendFunctional(new_pkt);
+
+        delete new_pkt;
+        delete pkt;
+    }
+}
+
+// The shader is busy as long as any compute unit still has work.
+bool
+Shader::busy()
+{
+    for (auto *cu : cuList) {
+        if (!cu->isDone()) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+// Schedule *val to be incremented by x "when" ticks after the current
+// shader tick count; entries fire in Shader::exec() once
+// sa_when[i] <= tick_cnt
+void
+Shader::ScheduleAdd(uint32_t *val,Tick when,int x)
+{
+    sa_val.push_back(val);
+    sa_when.push_back(tick_cnt + when);
+    sa_x.push_back(x);
+    ++sa_n;
+}
+
+// Tick event runs at CPU tick priority and keeps a back-pointer to its
+// shader
+Shader::TickEvent::TickEvent(Shader *_shader)
+    : Event(CPU_Tick_Pri), shader(_shader)
+{
+}
+
+
+void
+Shader::TickEvent::process()
+{
+    // keep clocking the shader while any CU has outstanding work; the
+    // event stops rescheduling itself once the shader goes idle (it is
+    // re-armed by dispatch_workgroups on the next dispatch)
+    if (shader->busy()) {
+        shader->exec();
+        shader->schedule(this, curTick() + shader->ticks(1));
+    }
+}
+
+// Name reported for this event in event-queue traces
+const char*
+Shader::TickEvent::description() const
+{
+    return "Shader tick";
+}
+
+// Functionally read or write "size" bytes at "address", chunked by the
+// Ruby cache-block size so no single request crosses a block boundary
+void
+Shader::AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
+                  MemCmd cmd, bool suppress_func_errors)
+{
+    uint8_t *data_buf = (uint8_t*)ptr;
+
+    for (ChunkGenerator gen(address, size, RubySystem::getBlockSizeBytes());
+         !gen.done(); gen.next()) {
+        Request *req = new Request(0, gen.addr(), gen.size(), 0,
+                                   cuList[0]->masterId(), 0, 0, 0);
+
+        // each chunk is translated and issued independently; the
+        // request is only needed for the duration of the access
+        doFunctionalAccess(req, cmd, data_buf, suppress_func_errors, cu_id);
+        data_buf += gen.size();
+        delete req;
+    }
+}
+
+// Functional read; functional-access errors are not suppressed
+void
+Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id)
+{
+    AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, false);
+}
+
+// Functional read with caller-controlled suppression of functional
+// access errors
+void
+Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
+                bool suppress_func_errors)
+{
+    AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, suppress_func_errors);
+}
+
+// Functional write; functional-access errors are not suppressed
+void
+Shader::WriteMem(uint64_t address, void *ptr,uint32_t size, int cu_id)
+{
+    AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq, false);
+}
+
+// Functional write with caller-controlled suppression of functional
+// access errors
+void
+Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
+                 bool suppress_func_errors)
+{
+    AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq,
+              suppress_func_errors);
+}
+
+/*
+ * Send a packet through the appropriate TLB functional port.
+ * If cu_id=n_cu, then this is the dispatcher's TLB.
+ * Otherwise it's the TLB of the cu_id compute unit.
+ */
+void
+Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode)
+{
+    // update senderState. Need to know the gpuTc and the TLB mode
+    pkt->senderState =
+        new TheISA::GpuTLB::TranslationState(mode, gpuTc, false);
+
+    // cu_id == n_cu is the sentinel for "use the dispatcher's TLB"
+    if (cu_id == n_cu) {
+        dispatcher->tlbPort->sendFunctional(pkt);
+    } else {
+        // even when the perLaneTLB flag is turned on
+        // it's ok to send all accesses through lane 0
+        // since the lane # is not known here,
+        // This isn't important since these are functional accesses.
+        cuList[cu_id]->tlbPort[0]->sendFunctional(pkt);
+    }
+
+    /* safe_cast the senderState */
+    TheISA::GpuTLB::TranslationState *sender_state =
+        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
+
+    // free the translation state the TLB attached; NOTE(review):
+    // deleting via the base pointer assumes SenderState has a virtual
+    // destructor -- confirm against mem/packet.hh
+    delete sender_state->tlbEntry;
+    delete pkt->senderState;
+}
diff --git a/src/gpu-compute/shader.hh b/src/gpu-compute/shader.hh
new file mode 100644
index 000000000..91ea8aae0
--- /dev/null
+++ b/src/gpu-compute/shader.hh
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#ifndef __SHADER_HH__
+#define __SHADER_HH__
+
+#include <functional>
+#include <string>
+
+#include "arch/isa.hh"
+#include "arch/isa_traits.hh"
+#include "base/types.hh"
+#include "cpu/simple/atomic.hh"
+#include "cpu/simple/timing.hh"
+#include "cpu/simple_thread.hh"
+#include "cpu/thread_context.hh"
+#include "cpu/thread_state.hh"
+#include "enums/MemOpType.hh"
+#include "enums/MemType.hh"
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/gpu_tlb.hh"
+#include "gpu-compute/lds_state.hh"
+#include "gpu-compute/qstruct.hh"
+#include "mem/page_table.hh"
+#include "mem/port.hh"
+#include "mem/request.hh"
+#include "params/Shader.hh"
+#include "sim/faults.hh"
+#include "sim/process.hh"
+#include "sim/sim_object.hh"
+
+class BaseTLB;
+class GpuDispatcher;
+
+namespace TheISA
+{
+ class GpuTLB;
+}
+
+// LDS (local data share) size in bytes (64 KiB); NOTE(review):
+// presumably per compute unit -- confirm against LdsState configuration
+static const int LDS_SIZE = 65536;
+
+// Class Shader: This describes a single shader instance. Most
+// configurations will only have a single shader.
+
+class Shader : public SimObject
+{
+  protected:
+    // Shader's clock period in terms of number of ticks of curTime,
+    // aka global simulation clock
+    Tick clock;
+
+  public:
+    typedef ShaderParams Params;
+    enum hsail_mode_e {SIMT,VECTOR_SCALAR};
+
+    // clock related functions ; maps to-and-from
+    // Simulation ticks and shader clocks.
+    Tick frequency() const { return SimClock::Frequency / clock; }
+
+    Tick ticks(int numCycles) const { return (Tick)clock * numCycles; }
+
+    Tick getClock() const { return clock; }
+    Tick curCycle() const { return curTick() / clock; }
+    Tick tickToCycles(Tick val) const { return val / clock;}
+
+    // host-CPU state the shader is attached to; gpuTc is refreshed
+    // from cpuPointer in init()/updateThreadContext()
+    SimpleThread *cpuThread;
+    ThreadContext *gpuTc;
+    BaseCPU *cpuPointer;
+
+    // self-rescheduling event that clocks the shader while any CU is
+    // busy (see TickEvent::process in shader.cc)
+    class TickEvent : public Event
+    {
+      private:
+        Shader *shader;
+
+      public:
+        TickEvent(Shader*);
+        void process();
+        const char* description() const;
+    };
+
+    TickEvent tickEvent;
+
+    // is this simulation going to be timing mode in the memory?
+    bool timingSim;
+    hsail_mode_e hsail_mode;
+
+    // If set, issue acq packet @ kernel launch
+    int impl_kern_boundary_sync;
+    // If set, generate a separate packet for acquire/release on
+    // ld_acquire/st_release/atomic operations
+    int separate_acquire_release;
+    // If set, fetch returns may be coissued with instructions
+    int coissue_return;
+    // If set, always dump all 64 gprs to trace
+    int trace_vgpr_all;
+    // Number of cu units in the shader
+    int n_cu;
+    // Number of wavefront slots per cu
+    int n_wf;
+    // The size of global memory
+    int globalMemSize;
+
+    /*
+     * Bytes/work-item for call instruction
+     * The number of arguments for an hsail function will
+     * vary. We simply determine the maximum # of arguments
+     * required by any hsail function up front before the
+     * simulation (during parsing of the Brig) and record
+     * that number here.
+     */
+    int funcargs_size;
+
+    // Tracks CU that rr dispatcher should attempt scheduling
+    int nextSchedCu;
+
+    // Size of scheduled add queue
+    uint32_t sa_n;
+
+    // Pointers to the values to be incremented
+    std::vector<uint32_t*> sa_val;
+    // When to do the increment
+    std::vector<uint64_t> sa_when;
+    // Amount to increment by
+    std::vector<int32_t> sa_x;
+
+    // List of Compute Units (CU's)
+    std::vector<ComputeUnit*> cuList;
+
+    // tick bookkeeping updated at the start of each Shader::exec()
+    uint64_t tick_cnt;
+    uint64_t box_tick_cnt;
+    uint64_t start_tick_cnt;
+
+    GpuDispatcher *dispatcher;
+
+    Shader(const Params *p);
+    ~Shader();
+    virtual void init();
+
+    // Run shader
+    void exec();
+
+    // Check to see if shader is busy
+    bool busy();
+
+    // Schedule a 32-bit value to be incremented some time in the future
+    void ScheduleAdd(uint32_t *val, Tick when, int x);
+    bool processTimingPacket(PacketPtr pkt);
+
+    // functional memory access helpers; accesses are chunked by the
+    // Ruby cache-block size and translated through the GPU TLB
+    void AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
+                   MemCmd cmd, bool suppress_func_errors);
+
+    void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id);
+
+    void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id,
+                 bool suppress_func_errors);
+
+    void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id);
+
+    void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id,
+                  bool suppress_func_errors);
+
+    void doFunctionalAccess(RequestPtr req, MemCmd cmd, void *data,
+                            bool suppress_func_errors, int cu_id);
+
+    // install a CU at its slot in cuList
+    void
+    registerCU(int cu_id, ComputeUnit *compute_unit)
+    {
+        cuList[cu_id] = compute_unit;
+    }
+
+    void handshake(GpuDispatcher *dispatcher);
+    bool dispatch_workgroups(NDRange *ndr);
+    // carve a page-aligned region out of the host process's mmap space
+    Addr mmap(int length);
+    void functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode);
+    void updateThreadContext(int tid);
+    void hostWakeUp(BaseCPU *cpu);
+};
+
+#endif // __SHADER_HH__
diff --git a/src/gpu-compute/simple_pool_manager.cc b/src/gpu-compute/simple_pool_manager.cc
new file mode 100644
index 000000000..0e35ab9cc
--- /dev/null
+++ b/src/gpu-compute/simple_pool_manager.cc
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos
+ */
+
+#include "gpu-compute/simple_pool_manager.hh"
+
+#include "base/misc.hh"
+
+// return the min number of elements that the manager can reserve given
+// a request for "size" elements
+uint32_t
+SimplePoolManager::minAllocatedElements(uint32_t size)
+{
+    fatal_if(size <= 0 || size > poolSize(), "Illegal VGPR region size=%d\n",
+             size);
+
+    // round size up to the next multiple of the minimum allocation unit
+    uint32_t remainder = size % minAllocation();
+
+    return remainder ? size + (minAllocation() - remainder) : size;
+}
+
+// Human-readable summary of the pool: either empty, or the total
+// number of VGPRs currently reserved
+std::string
+SimplePoolManager::printRegion()
+{
+    if (_reservedGroups == 0) {
+        return "VRF is empty\n";
+    }
+
+    uint32_t reservedEntries = _reservedGroups * _regionSize;
+    return "VRF reserves " + std::to_string(reservedEntries) + " VGPRs\n";
+}
+
+bool
+SimplePoolManager::canAllocate(uint32_t numRegions, uint32_t size)
+{
+    // sanity: the rounded-up request must fit in the pool at all
+    assert(numRegions * minAllocatedElements(size) <= poolSize());
+
+    // the simple manager supports only a single outstanding
+    // reservation, so allocation is possible only when the pool is
+    // completely free
+    return _reservedGroups == 0;
+}
+
+void
+SimplePoolManager::freeRegion(uint32_t firstIdx, uint32_t lastIdx)
+{
+    // NOTE(review): firstIdx/lastIdx are unused -- the simple manager
+    // tracks a single region, so freeing ignores the index range
+    assert(_reservedGroups > 0);
+    --_reservedGroups;
+
+    // once the last group releases its reservation, recycle the pool
+    if (!_reservedGroups)
+        _nxtFreeIdx = 0;
+}
+
+uint32_t
+SimplePoolManager::allocateRegion(const uint32_t size,
+                                  uint32_t *reservedPoolSize)
+{
+    // round the request up to the minimum allocation granularity
+    uint32_t actualSize = minAllocatedElements(size);
+    // the caller's region starts at the current free index
+    uint32_t startIdx = _nxtFreeIdx;
+    _nxtFreeIdx += actualSize;
+    _regionSize = actualSize;
+    // NOTE(review): this rejects an allocation that exactly fills the
+    // pool (_nxtFreeIdx == poolSize()); confirm '<' vs '<=' is intended
+    assert(_nxtFreeIdx < poolSize());
+    *reservedPoolSize = actualSize;
+    ++_reservedGroups;
+
+    return startIdx;
+}
+
+// Number of elements covered by "region" (inclusive endpoints). A
+// region whose start exceeds its end wraps around the end of the pool,
+// i.e. covers [first, poolSize) plus [0, second].
+uint32_t
+SimplePoolManager::regionSize(std::pair<uint32_t, uint32_t> &region)
+{
+    if (region.first <= region.second) {
+        return region.second - region.first + 1;
+    }
+
+    return region.second + poolSize() - region.first + 1;
+}
diff --git a/src/gpu-compute/simple_pool_manager.hh b/src/gpu-compute/simple_pool_manager.hh
new file mode 100644
index 000000000..1d4174da8
--- /dev/null
+++ b/src/gpu-compute/simple_pool_manager.hh
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos
+ */
+
+#ifndef __SIMPLE_POOL_MANAGER_HH__
+#define __SIMPLE_POOL_MANAGER_HH__
+
+#include <cassert>
+#include <cstdint>
+
+#include "gpu-compute/pool_manager.hh"
+
+// Simple Pool Manager: allows one region per pool. No region merging is
+// supported.
+class SimplePoolManager : public PoolManager
+{
+  public:
+    SimplePoolManager(uint32_t minAlloc, uint32_t poolSize)
+        : PoolManager(minAlloc, poolSize), _regionSize(0), _nxtFreeIdx(0),
+          _reservedGroups(0)
+    {
+    }
+
+    // smallest element count that can satisfy a request for "size"
+    // elements (rounded up to the minimum allocation granularity)
+    uint32_t minAllocatedElements(uint32_t size);
+    // human-readable summary of the current reservation state
+    std::string printRegion();
+    // true iff the pool is free (only one region is supported)
+    bool canAllocate(uint32_t numRegions, uint32_t size);
+    // reserve a region; returns its start index and writes the rounded
+    // size through reservedPoolSize
+    uint32_t allocateRegion(const uint32_t size, uint32_t *reservedPoolSize);
+    // release one reservation of the region (indices are unused here)
+    void freeRegion(uint32_t firstIdx, uint32_t lastIdx);
+    // number of elements covered by region, accounting for wrap-around
+    uint32_t regionSize(std::pair<uint32_t,uint32_t> &region);
+
+  private:
+    // actual size of a region (normalized to the minimum size that can
+    // be reserved)
+    uint32_t _regionSize;
+    // next index to allocate a region; uint32_t (was uint8_t) so the
+    // running index allocateRegion() advances cannot silently wrap for
+    // pools larger than 255 entries and defeat its bounds assert
+    uint32_t _nxtFreeIdx;
+    // number of groups that reserve a region
+    uint32_t _reservedGroups;
+};
+
+#endif // __SIMPLE_POOL_MANAGER_HH__
diff --git a/src/gpu-compute/tlb_coalescer.cc b/src/gpu-compute/tlb_coalescer.cc
new file mode 100644
index 000000000..835d7b740
--- /dev/null
+++ b/src/gpu-compute/tlb_coalescer.cc
@@ -0,0 +1,583 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Lisa Hsu
+ */
+
+#include "gpu-compute/tlb_coalescer.hh"
+
+#include <cstring>
+
+#include "debug/GPUTLB.hh"
+
+TLBCoalescer::TLBCoalescer(const Params *p) : MemObject(p),
+    clock(p->clk_domain->clockPeriod()), TLBProbesPerCycle(p->probesPerCycle),
+    coalescingWindow(p->coalescingWindow),
+    disableCoalescing(p->disableCoalescing), probeTLBEvent(this),
+    cleanupEvent(this)
+{
+    // instantiate one CPU-side (slave) port per connected peer
+    for (size_t port_id = 0; port_id < p->port_slave_connection_count;
+         ++port_id) {
+        std::string port_name = csprintf("%s-port%d", name(), port_id);
+        cpuSidePort.push_back(new CpuSidePort(port_name, this, port_id));
+    }
+
+    // instantiate one memory-side (master) port per connected peer
+    for (size_t port_id = 0; port_id < p->port_master_connection_count;
+         ++port_id) {
+        std::string port_name = csprintf("%s-port%d", name(), port_id);
+        memSidePort.push_back(new MemSidePort(port_name, this, port_id));
+    }
+}
+
+BaseSlavePort&
+TLBCoalescer::getSlavePort(const std::string &if_name, PortID idx)
+{
+    // only the "slave" vector port is exposed on the CPU side
+    if (if_name != "slave")
+        panic("TLBCoalescer::getSlavePort: unknown port %s\n", if_name);
+
+    if (idx >= static_cast<PortID>(cpuSidePort.size()))
+        panic("TLBCoalescer::getSlavePort: unknown index %d\n", idx);
+
+    return *cpuSidePort[idx];
+}
+
+BaseMasterPort&
+TLBCoalescer::getMasterPort(const std::string &if_name, PortID idx)
+{
+    // only the "master" vector port is exposed on the memory side
+    if (if_name != "master")
+        panic("TLBCoalescer::getMasterPort: unknown port %s\n", if_name);
+
+    if (idx >= static_cast<PortID>(memSidePort.size()))
+        panic("TLBCoalescer::getMasterPort: unknown index %d\n", idx);
+
+    return *memSidePort[idx];
+}
+
+/*
+ * Decide whether <incoming_pkt> may be folded into the already buffered
+ * <coalesced_pkt>. Returns true iff every coalescing rule passes; the
+ * rules can potentially be modified based on the TLB level.
+ */
+bool
+TLBCoalescer::canCoalesce(PacketPtr incoming_pkt, PacketPtr coalesced_pkt)
+{
+    if (disableCoalescing)
+        return false;
+
+    auto *in_state =
+        safe_cast<TheISA::GpuTLB::TranslationState*>(incoming_pkt->senderState);
+
+    auto *co_state =
+        safe_cast<TheISA::GpuTLB::TranslationState*>(coalesced_pkt->senderState);
+
+    // Rule 1: requests may only be coalesced if they fall within the
+    // same virtual page
+    Addr in_page = roundDown(incoming_pkt->req->getVaddr(),
+                             TheISA::PageBytes);
+
+    Addr co_page = roundDown(coalesced_pkt->req->getVaddr(),
+                             TheISA::PageBytes);
+
+    if (in_page != co_page)
+        return false;
+
+    // Rule 2: requests may only be coalesced if they share a TLB mode,
+    // i.e. they are both read or both write requests
+    if (in_state->tlbMode != co_state->tlbMode)
+        return false;
+
+    // coalescing succeeded: fold the incoming request count into the
+    // coalesced packet; reqCnt is the number of packets this coalesced
+    // packet now represents
+    if (!in_state->prefetch)
+        co_state->reqCnt.back() += in_state->reqCnt.back();
+
+    return true;
+}
+
+/*
+ * We need to update the physical addresses of all the translation requests
+ * that were coalesced into the one that just returned.
+ * (review: the inner TranslationState pointer was renamed to
+ * local_sender_state so it no longer shadows the outer sender_state, and
+ * the loop index is size_t to avoid a signed/unsigned comparison.)
+ */
+void
+TLBCoalescer::updatePhysAddresses(PacketPtr pkt)
+{
+    Addr virt_page_addr = roundDown(pkt->req->getVaddr(), TheISA::PageBytes);
+
+    DPRINTF(GPUTLB, "Update phys. addr. for %d coalesced reqs for page %#x\n",
+            issuedTranslationsTable[virt_page_addr].size(), virt_page_addr);
+
+    // sender state of the (first) translated packet; it carries the TLB
+    // entry that holds the translation for the whole virtual page
+    TheISA::GpuTLB::TranslationState *sender_state =
+        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
+
+    TheISA::GpuTlbEntry *tlb_entry = sender_state->tlbEntry;
+    assert(tlb_entry);
+    Addr first_entry_vaddr = tlb_entry->vaddr;
+    Addr first_entry_paddr = tlb_entry->paddr;
+    int page_size = tlb_entry->size();
+    bool uncacheable = tlb_entry->uncacheable;
+    int first_hit_level = sender_state->hitLevel;
+    bool valid = tlb_entry->valid;
+
+    // Get the physical page address of the translated request
+    // Using the page_size specified in the TLBEntry allows us
+    // to support different page sizes.
+    Addr phys_page_paddr = pkt->req->getPaddr();
+    phys_page_paddr &= ~(page_size - 1);
+
+    for (size_t i = 0; i < issuedTranslationsTable[virt_page_addr].size();
+         ++i) {
+        PacketPtr local_pkt = issuedTranslationsTable[virt_page_addr][i];
+        TheISA::GpuTLB::TranslationState *local_sender_state =
+            safe_cast<TheISA::GpuTLB::TranslationState*>(
+                    local_pkt->senderState);
+
+        // we are sending the packet back, so pop the reqCnt associated
+        // with this level in the TLB hierarchy
+        if (!local_sender_state->prefetch)
+            local_sender_state->reqCnt.pop_back();
+
+        /*
+         * Only the first packet from this coalesced request has been
+         * translated. Grab the translated phys. page addr and update the
+         * physical addresses of the remaining packets with the appropriate
+         * page offsets.
+         */
+        if (i) {
+            Addr paddr = phys_page_paddr;
+            paddr |= (local_pkt->req->getVaddr() & (page_size - 1));
+            local_pkt->req->setPaddr(paddr);
+
+            if (uncacheable)
+                local_pkt->req->setFlags(Request::UNCACHEABLE);
+
+            // update senderState->tlbEntry, so we can insert
+            // the correct TLBEntry in the TLBs above.
+            // NOTE(review): assumes the non-first packets carry no
+            // tlbEntry of their own (only the first was translated);
+            // otherwise this assignment would leak -- confirm in GpuTLB
+            local_sender_state->tlbEntry =
+                new TheISA::GpuTlbEntry(0, first_entry_vaddr,
+                                        first_entry_paddr, valid);
+
+            // update the hitLevel for all uncoalesced reqs
+            // so that each packet knows where it hit
+            // (used for statistics in the CUs)
+            local_sender_state->hitLevel = first_hit_level;
+        }
+
+        SlavePort *return_port = local_sender_state->ports.back();
+        local_sender_state->ports.pop_back();
+
+        // Translation is done - Convert to a response pkt if necessary and
+        // send the translation back
+        if (local_pkt->isRequest()) {
+            local_pkt->makeTimingResponse();
+        }
+
+        return_port->sendTimingResp(local_pkt);
+    }
+
+    // schedule clean up for end of this cycle
+    // This is a maximum priority event and must be on
+    // the same cycle as GPUTLB cleanup event to prevent
+    // race conditions with an IssueProbeEvent caused by
+    // MemSidePort::recvReqRetry
+    cleanupQueue.push(virt_page_addr);
+
+    if (!cleanupEvent.scheduled())
+        schedule(cleanupEvent, curTick());
+}
+
+// Receive translation requests, create a coalesced request,
+// and send them to the TLB (TLBProbesPerCycle)
+bool
+TLBCoalescer::CpuSidePort::recvTimingReq(PacketPtr pkt)
+{
+    // first packet of a coalesced request
+    PacketPtr first_packet = nullptr;
+    // true if we are able to do coalescing
+    bool didCoalesce = false;
+    // number of coalesced reqs for a given window
+    int coalescedReq_cnt = 0;
+
+    TheISA::GpuTLB::TranslationState *sender_state =
+        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
+
+    // push back the port to remember the path back
+    sender_state->ports.push_back(this);
+
+    bool update_stats = !sender_state->prefetch;
+
+    if (update_stats) {
+        // if reqCnt is empty then this packet does not represent
+        // multiple uncoalesced reqs(pkts) but just a single pkt.
+        // If it does though then the reqCnt for each level in the
+        // hierarchy accumulates the total number of reqs this packet
+        // represents
+        int req_cnt = 1;
+
+        if (!sender_state->reqCnt.empty())
+            req_cnt = sender_state->reqCnt.back();
+
+        sender_state->reqCnt.push_back(req_cnt);
+
+        // update statistics
+        coalescer->uncoalescedAccesses++;
+        req_cnt = sender_state->reqCnt.back();
+        DPRINTF(GPUTLB, "receiving pkt w/ req_cnt %d\n", req_cnt);
+        // queuing cycles are accumulated as (send time - receive time):
+        // the negative contribution here records the arrival, the
+        // matching positive contribution is added when the coalesced
+        // request is sent in IssueProbeEvent::process()
+        coalescer->queuingCycles -= (curTick() * req_cnt);
+        coalescer->localqueuingCycles -= curTick();
+    }
+
+    // FIXME if you want to coalesce not based on the issueTime
+    // of the packets (i.e., from the compute unit's perspective)
+    // but based on when they reached this coalescer then
+    // remove the following if statement and use curTick() or
+    // coalescingWindow for the tick_index.
+    if (!sender_state->issueTime)
+        sender_state->issueTime = curTick();
+
+    // The tick index is used as a key to the coalescerFIFO hashmap.
+    // It is shared by all candidates that fall within the
+    // given coalescingWindow.
+    int64_t tick_index = sender_state->issueTime / coalescer->coalescingWindow;
+
+    if (coalescer->coalescerFIFO.count(tick_index)) {
+        coalescedReq_cnt = coalescer->coalescerFIFO[tick_index].size();
+    }
+
+    // see if we can coalesce the incoming pkt with another
+    // coalesced request with the same tick_index
+    for (int i = 0; i < coalescedReq_cnt; ++i) {
+        // the first packet of each coalesced request is the
+        // representative used for the coalescing rules check
+        first_packet = coalescer->coalescerFIFO[tick_index][i][0];
+
+        if (coalescer->canCoalesce(pkt, first_packet)) {
+            coalescer->coalescerFIFO[tick_index][i].push_back(pkt);
+
+            DPRINTF(GPUTLB, "Coalesced req %i w/ tick_index %d has %d reqs\n",
+                    i, tick_index,
+                    coalescer->coalescerFIFO[tick_index][i].size());
+
+            didCoalesce = true;
+            break;
+        }
+    }
+
+    // if this is the first request for this tick_index
+    // or we did not manage to coalesce, update stats
+    // and make necessary allocations.
+    if (!coalescedReq_cnt || !didCoalesce) {
+        if (update_stats)
+            coalescer->coalescedAccesses++;
+
+        std::vector<PacketPtr> new_array;
+        new_array.push_back(pkt);
+        coalescer->coalescerFIFO[tick_index].push_back(new_array);
+
+        DPRINTF(GPUTLB, "coalescerFIFO[%d] now has %d coalesced reqs after "
+                "push\n", tick_index,
+                coalescer->coalescerFIFO[tick_index].size());
+    }
+
+    //schedule probeTLBEvent next cycle to send the
+    //coalesced requests to the TLB
+    if (!coalescer->probeTLBEvent.scheduled()) {
+        coalescer->schedule(coalescer->probeTLBEvent,
+                            curTick() + coalescer->ticks(1));
+    }
+
+    // the coalescer never rejects incoming requests; they are buffered
+    // in coalescerFIFO until they can be issued
+    return true;
+}
+
+void
+TLBCoalescer::CpuSidePort::recvReqRetry()
+{
+    // The CPU-side port never rejects a request (recvTimingReq always
+    // returns true), so a retry must never arrive. Use panic() rather
+    // than assert(false) so the error is caught in optimized (NDEBUG)
+    // builds as well.
+    panic("recvReqRetry called");
+}
+
+void
+TLBCoalescer::CpuSidePort::recvFunctional(PacketPtr pkt)
+{
+    // functional accesses bypass coalescing entirely; only the
+    // uncoalesced-access statistic is maintained
+    auto *sender_state =
+        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
+
+    if (!sender_state->prefetch)
+        coalescer->uncoalescedAccesses++;
+
+    // If there is a pending timing request for this virtual address
+    // print a warning message. This is a temporary caveat of
+    // the current simulator where atomic and timing requests can
+    // coexist. FIXME remove this check/warning in the future.
+    Addr virt_page_addr = roundDown(pkt->req->getVaddr(), TheISA::PageBytes);
+
+    if (coalescer->issuedTranslationsTable.count(virt_page_addr)) {
+        DPRINTF(GPUTLB, "Warning! Functional access to addr %#x sees timing "
+                "req. pending\n", virt_page_addr);
+    }
+
+    // forward straight to the TLB below
+    coalescer->memSidePort[0]->sendFunctional(pkt);
+}
+
+AddrRangeList
+TLBCoalescer::CpuSidePort::getAddrRanges() const
+{
+    // the master never inspects these ranges; report an empty list
+    return AddrRangeList();
+}
+
+bool
+TLBCoalescer::MemSidePort::recvTimingResp(PacketPtr pkt)
+{
+    // a translation completed and returned; distribute the result to
+    // every packet that was coalesced into this request
+    coalescer->updatePhysAddresses(pkt);
+
+    // always accept responses; the coalescer never back-pressures the
+    // TLB below
+    return true;
+}
+
+void
+TLBCoalescer::MemSidePort::recvReqRetry()
+{
+    // we've received a retry from the TLB below: schedule a
+    // probeTLBEvent to re-attempt issuing the buffered coalesced reqs
+    if (!coalescer->probeTLBEvent.scheduled())
+        coalescer->schedule(coalescer->probeTLBEvent,
+                            curTick() + coalescer->ticks(1));
+}
+
+void
+TLBCoalescer::MemSidePort::recvFunctional(PacketPtr pkt)
+{
+    // functional traffic only flows downward (CPU side -> TLB); a
+    // functional access arriving from below is unsupported
+    fatal("Memory side recvFunctional() not implemented in TLB coalescer.\n");
+}
+
+// probe events are scheduled at CPU_Tick_Pri priority
+TLBCoalescer::IssueProbeEvent::IssueProbeEvent(TLBCoalescer * _coalescer)
+    : Event(CPU_Tick_Pri), coalescer(_coalescer)
+{
+}
+
+// human-readable event name for tracing/debugging
+const char*
+TLBCoalescer::IssueProbeEvent::description() const
+{
+    return "Probe the TLB below";
+}
+
+/*
+ * Here we scan the coalescer FIFO and issue the max
+ * number of permitted probes to the TLB below. We
+ * permit bypassing of coalesced requests for the same
+ * tick_index.
+ *
+ * We do not access the next tick_index unless we've
+ * drained the previous one. The coalesced requests
+ * that are successfully sent are moved to the
+ * issuedTranslationsTable table (the table which keeps
+ * track of the outstanding reqs)
+ *
+ * (review: added the missing trailing newlines to the two
+ * send-outcome DPRINTF format strings so debug lines don't run
+ * together.)
+ */
+void
+TLBCoalescer::IssueProbeEvent::process()
+{
+    // number of TLB probes sent so far
+    int sent_probes = 0;
+    // rejected denotes a blocking event. It is set to true either when
+    // the recvTiming of the TLB below returns false or when there is
+    // another outstanding request for the same virt. page.
+    bool rejected = false;
+
+    DPRINTF(GPUTLB, "triggered TLBCoalescer IssueProbeEvent\n");
+
+    for (auto iter = coalescer->coalescerFIFO.begin();
+         iter != coalescer->coalescerFIFO.end() && !rejected; ) {
+        int coalescedReq_cnt = iter->second.size();
+        int i = 0;
+        int vector_index = 0;
+
+        DPRINTF(GPUTLB, "coalescedReq_cnt is %d for tick_index %d\n",
+                coalescedReq_cnt, iter->first);
+
+        while (i < coalescedReq_cnt) {
+            ++i;
+            PacketPtr first_packet = iter->second[vector_index][0];
+
+            // compute virtual page address for this request
+            Addr virt_page_addr = roundDown(first_packet->req->getVaddr(),
+                    TheISA::PageBytes);
+
+            // is there another outstanding request for the same page addr?
+            int pending_reqs =
+                coalescer->issuedTranslationsTable.count(virt_page_addr);
+
+            if (pending_reqs) {
+                DPRINTF(GPUTLB, "Cannot issue - There are pending reqs for "
+                        "page %#x\n", virt_page_addr);
+
+                ++vector_index;
+                rejected = true;
+
+                continue;
+            }
+
+            // send the coalesced request for virt_page_addr
+            if (!coalescer->memSidePort[0]->sendTimingReq(first_packet)) {
+                DPRINTF(GPUTLB, "Failed to send TLB request for page %#x\n",
+                        virt_page_addr);
+
+                // No need for a retries queue since we are already buffering
+                // the coalesced request in coalescerFIFO.
+                rejected = true;
+                ++vector_index;
+            } else {
+                TheISA::GpuTLB::TranslationState *tmp_sender_state =
+                    safe_cast<TheISA::GpuTLB::TranslationState*>
+                    (first_packet->senderState);
+
+                bool update_stats = !tmp_sender_state->prefetch;
+
+                if (update_stats) {
+                    // req_cnt is total number of packets represented
+                    // by the one we just sent counting all the way from
+                    // the top of TLB hierarchy (i.e., from the CU)
+                    int req_cnt = tmp_sender_state->reqCnt.back();
+                    coalescer->queuingCycles += (curTick() * req_cnt);
+
+                    DPRINTF(GPUTLB, "%s sending pkt w/ req_cnt %d\n",
+                            coalescer->name(), req_cnt);
+
+                    // pkt_cnt is number of packets we coalesced into the one
+                    // we just sent but only at this coalescer level
+                    int pkt_cnt = iter->second[vector_index].size();
+                    coalescer->localqueuingCycles += (curTick() * pkt_cnt);
+                }
+
+                DPRINTF(GPUTLB, "Successfully sent TLB request for page %#x\n",
+                        virt_page_addr);
+
+                //copy coalescedReq to issuedTranslationsTable
+                coalescer->issuedTranslationsTable[virt_page_addr]
+                    = iter->second[vector_index];
+
+                //erase the entry of this coalesced req
+                iter->second.erase(iter->second.begin() + vector_index);
+
+                if (iter->second.empty())
+                    assert(i == coalescedReq_cnt);
+
+                sent_probes++;
+                if (sent_probes == coalescer->TLBProbesPerCycle)
+                    return;
+            }
+        }
+
+        //if there are no more coalesced reqs for this tick_index
+        //erase the hash_map with the first iterator
+        if (iter->second.empty()) {
+            coalescer->coalescerFIFO.erase(iter++);
+        } else {
+            ++iter;
+        }
+    }
+}
+
+// cleanup runs at Maximum_Pri so it executes before any other event in
+// the same tick (see the race-condition note in updatePhysAddresses)
+TLBCoalescer::CleanupEvent::CleanupEvent(TLBCoalescer* _coalescer)
+    : Event(Maximum_Pri), coalescer(_coalescer)
+{
+}
+
+// human-readable event name for tracing/debugging
+const char*
+TLBCoalescer::CleanupEvent::description() const
+{
+    return "Cleanup issuedTranslationsTable hashmap";
+}
+
+void
+TLBCoalescer::CleanupEvent::process()
+{
+    // drain the cleanup queue, removing the outstanding-translation
+    // entry for every virtual page queued by updatePhysAddresses()
+    auto &pending = coalescer->cleanupQueue;
+
+    while (!pending.empty()) {
+        Addr cleanup_addr = pending.front();
+        pending.pop();
+        coalescer->issuedTranslationsTable.erase(cleanup_addr);
+
+        DPRINTF(GPUTLB, "Cleanup - Delete coalescer entry with key %#x\n",
+                cleanup_addr);
+    }
+}
+
+// Register this coalescer's statistics with the stats framework.
+void
+TLBCoalescer::regStats()
+{
+    uncoalescedAccesses
+        .name(name() + ".uncoalesced_accesses")
+        .desc("Number of uncoalesced TLB accesses")
+        ;
+
+    coalescedAccesses
+        .name(name() + ".coalesced_accesses")
+        .desc("Number of coalesced TLB accesses")
+        ;
+
+    queuingCycles
+        .name(name() + ".queuing_cycles")
+        .desc("Number of cycles spent in queue")
+        ;
+
+    localqueuingCycles
+        .name(name() + ".local_queuing_cycles")
+        .desc("Number of cycles spent in queue for all incoming reqs")
+        ;
+
+    localLatency
+        .name(name() + ".local_latency")
+        .desc("Avg. latency over all incoming pkts")
+        ;
+
+    // derived stat: average local queuing latency per uncoalesced access
+    localLatency = localqueuingCycles / uncoalescedAccesses;
+}
+
+
+// Params factory hook: instantiate a TLBCoalescer from its python params
+TLBCoalescer*
+TLBCoalescerParams::create()
+{
+    return new TLBCoalescer(this);
+}
+
diff --git a/src/gpu-compute/tlb_coalescer.hh b/src/gpu-compute/tlb_coalescer.hh
new file mode 100644
index 000000000..09210148b
--- /dev/null
+++ b/src/gpu-compute/tlb_coalescer.hh
@@ -0,0 +1,252 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Lisa Hsu
+ */
+
+#ifndef __TLB_COALESCER_HH__
+#define __TLB_COALESCER_HH__
+
+#include <deque>
+#include <list>
+#include <queue>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "arch/generic/tlb.hh"
+#include "arch/isa.hh"
+#include "arch/isa_traits.hh"
+#include "arch/x86/pagetable.hh"
+#include "arch/x86/regs/segment.hh"
+#include "base/misc.hh"
+#include "base/statistics.hh"
+#include "gpu-compute/gpu_tlb.hh"
+#include "mem/mem_object.hh"
+#include "mem/port.hh"
+#include "mem/request.hh"
+#include "params/TLBCoalescer.hh"
+
+class BaseTLB;
+class Packet;
+class ThreadContext;
+
+/**
+ * The TLBCoalescer is a MemObject sitting on the front side (CPUSide) of
+ * each TLB. It receives packets and issues coalesced requests to the
+ * TLB below it. It controls how requests are coalesced (the rules)
+ * and the permitted number of TLB probes per cycle (i.e., how many
+ * coalesced requests it feeds the TLB per cycle).
+ */
+class TLBCoalescer : public MemObject
+{
+  protected:
+    // TLB clock: will inherit clock from shader's clock period in terms
+    // of number of ticks of curTime (aka global simulation clock)
+    // The assignment of TLB clock from shader clock is done in the
+    // python config files.
+    int clock;
+
+  public:
+    typedef TLBCoalescerParams Params;
+    TLBCoalescer(const Params *p);
+    ~TLBCoalescer() { }
+
+    // Number of TLB probes per cycle. Parameterizable - default 2.
+    int TLBProbesPerCycle;
+
+    // Consider coalescing across that many ticks.
+    // Parameterizable - default 1.
+    int coalescingWindow;
+
+    // Each coalesced request consists of multiple packets
+    // that all fall within the same virtual page
+    typedef std::vector<PacketPtr> coalescedReq;
+
+    // disables coalescing when true
+    bool disableCoalescing;
+
+    /*
+     * This is a hash map with <tick_index> as a key.
+     * It contains a vector of coalescedReqs per <tick_index>.
+     * Requests are buffered here until they can be issued to
+     * the TLB, at which point they are copied to the
+     * issuedTranslationsTable hash map.
+     *
+     * In terms of coalescing, we coalesce requests in a given
+     * window of x cycles by using tick_index = issueTime/x as a
+     * key, where x = coalescingWindow. issueTime is the issueTime
+     * of the pkt from the ComputeUnit's perspective, but another
+     * option is to change it to curTick(), so we coalesce based
+     * on the receive time.
+     */
+    typedef std::unordered_map<int64_t, std::vector<coalescedReq>> CoalescingFIFO;
+
+    CoalescingFIFO coalescerFIFO;
+
+    /*
+     * issuedTranslationsTable: a hash_map indexed by virtual page
+     * address. Each hash_map entry has a vector of PacketPtr associated
+     * with it denoting the different packets that share an outstanding
+     * coalesced translation request for the same virtual page.
+     *
+     * The rules that determine which requests we can coalesce are
+     * specified in the canCoalesce() method.
+     */
+    typedef std::unordered_map<Addr, coalescedReq> CoalescingTable;
+
+    CoalescingTable issuedTranslationsTable;
+
+    // number of packets the coalescer receives
+    Stats::Scalar uncoalescedAccesses;
+    // number of packets the coalescer sends to the TLB
+    Stats::Scalar coalescedAccesses;
+
+    // Number of cycles the coalesced requests spend waiting in
+    // coalescerFIFO. For each packet the coalescer receives we take into
+    // account the number of all uncoalesced requests this pkt "represents"
+    Stats::Scalar queuingCycles;
+
+    // On average how much time a request from the
+    // uncoalescedAccesses that reaches the TLB
+    // spends waiting?
+    Stats::Scalar localqueuingCycles;
+    // localqueuingCycles/uncoalescedAccesses
+    Stats::Formula localLatency;
+
+    bool canCoalesce(PacketPtr pkt1, PacketPtr pkt2);
+    void updatePhysAddresses(PacketPtr pkt);
+    void regStats();
+
+    // Clock related functions. Maps to-and-from
+    // Simulation ticks and object clocks.
+    Tick frequency() const { return SimClock::Frequency / clock; }
+    Tick ticks(int numCycles) const { return (Tick)clock * numCycles; }
+    Tick curCycle() const { return curTick() / clock; }
+    Tick tickToCycles(Tick val) const { return val / clock;}
+
+    // slave port receiving translation requests from the CU/TLB above
+    class CpuSidePort : public SlavePort
+    {
+      public:
+        CpuSidePort(const std::string &_name, TLBCoalescer *tlb_coalescer,
+                    PortID _index)
+            : SlavePort(_name, tlb_coalescer), coalescer(tlb_coalescer),
+              index(_index) { }
+
+      protected:
+        TLBCoalescer *coalescer;
+        int index;
+
+        virtual bool recvTimingReq(PacketPtr pkt);
+        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
+        virtual void recvFunctional(PacketPtr pkt);
+        virtual void recvRangeChange() { }
+        virtual void recvReqRetry();
+
+        virtual void
+        recvRespRetry()
+        {
+            fatal("recvRespRetry() is not implemented in the TLB coalescer.\n");
+        }
+
+        virtual AddrRangeList getAddrRanges() const;
+    };
+
+    // master port issuing coalesced requests to the TLB below
+    class MemSidePort : public MasterPort
+    {
+      public:
+        MemSidePort(const std::string &_name, TLBCoalescer *tlb_coalescer,
+                    PortID _index)
+            : MasterPort(_name, tlb_coalescer), coalescer(tlb_coalescer),
+              index(_index) { }
+
+        std::deque<PacketPtr> retries;
+
+      protected:
+        TLBCoalescer *coalescer;
+        int index;
+
+        virtual bool recvTimingResp(PacketPtr pkt);
+        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
+        virtual void recvFunctional(PacketPtr pkt);
+        virtual void recvRangeChange() { }
+        virtual void recvReqRetry();
+
+        virtual void
+        recvRespRetry()
+        {
+            fatal("recvRespRetry() not implemented in TLB coalescer");
+        }
+    };
+
+    // Coalescer slave ports on the cpu Side
+    std::vector<CpuSidePort*> cpuSidePort;
+    // Coalescer master ports on the memory side
+    std::vector<MemSidePort*> memSidePort;
+
+    BaseMasterPort& getMasterPort(const std::string &if_name, PortID idx);
+    BaseSlavePort& getSlavePort(const std::string &if_name, PortID idx);
+
+    class IssueProbeEvent : public Event
+    {
+      private:
+        TLBCoalescer *coalescer;
+
+      public:
+        IssueProbeEvent(TLBCoalescer *_coalescer);
+        void process();
+        const char *description() const;
+    };
+
+    // this event issues the TLB probes
+    IssueProbeEvent probeTLBEvent;
+
+    // the cleanupEvent is scheduled after a TLBEvent triggers
+    // in order to free memory and do the required clean-up
+    class CleanupEvent : public Event
+    {
+      private:
+        TLBCoalescer *coalescer;
+
+      public:
+        CleanupEvent(TLBCoalescer *_coalescer);
+        void process();
+        const char* description() const;
+    };
+
+    // schedule cleanup
+    CleanupEvent cleanupEvent;
+
+    // this FIFO queue keeps track of the virt. page
+    // addresses that are pending cleanup
+    std::queue<Addr> cleanupQueue;
+};
+
+#endif // __TLB_COALESCER_HH__
diff --git a/src/gpu-compute/vector_register_file.cc b/src/gpu-compute/vector_register_file.cc
new file mode 100644
index 000000000..8b7dc0691
--- /dev/null
+++ b/src/gpu-compute/vector_register_file.cc
@@ -0,0 +1,251 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos
+ */
+
+#include "gpu-compute/vector_register_file.hh"
+
+#include <string>
+
+#include "base/misc.hh"
+#include "gpu-compute/code_enums.hh"
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/shader.hh"
+#include "gpu-compute/simple_pool_manager.hh"
+#include "gpu-compute/wavefront.hh"
+#include "params/VectorRegisterFile.hh"
+
+VectorRegisterFile::VectorRegisterFile(const VectorRegisterFileParams *p)
+    : SimObject(p),
+      manager(new SimplePoolManager(p->min_alloc, p->num_regs_per_simd)),
+      simdId(p->simd_id), numRegsPerSimd(p->num_regs_per_simd),
+      vgprState(new VecRegisterState())
+{
+    // sanity checks on the configured VRF geometry
+    fatal_if(numRegsPerSimd % 2, "VRF size is illegal\n");
+    fatal_if(simdId < 0, "Illegal SIMD id for VRF");
+
+    fatal_if(numRegsPerSimd % p->min_alloc, "Min VGPR region allocation is not "
+             "multiple of VRF size\n");
+
+    // busy/nxtBusy hold one flag per physical register; their exact
+    // semantics are defined by markReg()/preMarkReg() below
+    busy.clear();
+    busy.resize(numRegsPerSimd, 0);
+    nxtBusy.clear();
+    nxtBusy.resize(numRegsPerSimd, 0);
+
+    vgprState->init(numRegsPerSimd);
+}
+
+// Bind this VRF (and its register state) to the owning compute unit.
+void
+VectorRegisterFile::setParent(ComputeUnit *_computeUnit)
+{
+    computeUnit = _computeUnit;
+    vgprState->setParent(computeUnit);
+}
+
+uint8_t
+VectorRegisterFile::regNxtBusy(int idx, uint32_t operandSize) const
+{
+    // operands wider than 4 bytes span a register pair, so the
+    // in-flight status of the (wrapped) neighboring register is folded in
+    uint8_t status = nxtBusy.at(idx);
+
+    if (operandSize > 4)
+        status |= nxtBusy.at((idx + 1) % numRegs());
+
+    return status;
+}
+
+uint8_t
+VectorRegisterFile::regBusy(int idx, uint32_t operandSize) const
+{
+    // operands wider than 4 bytes span a register pair, so the busy
+    // status of the (wrapped) neighboring register is folded in
+    uint8_t status = busy.at(idx);
+
+    if (operandSize > 4)
+        status |= busy.at((idx + 1) % numRegs());
+
+    return status;
+}
+
+void
+VectorRegisterFile::preMarkReg(int regIdx, uint32_t operandSize, uint8_t value)
+{
+    // set the in-flight flag for the register, and for its (wrapped)
+    // pair partner when the operand spans two 4-byte registers
+    nxtBusy.at(regIdx) = value;
+
+    if (operandSize > 4)
+        nxtBusy.at((regIdx + 1) % numRegs()) = value;
+}
+
+void
+VectorRegisterFile::markReg(int regIdx, uint32_t operandSize, uint8_t value)
+{
+    // set the busy flag for the register, and for its (wrapped) pair
+    // partner when the operand spans two 4-byte registers
+    busy.at(regIdx) = value;
+
+    if (operandSize > 4)
+        busy.at((regIdx + 1) % numRegs()) = value;
+}
+
+/*
+ * Returns true iff none of the instruction's vector-register operands
+ * is busy (regBusy) or pending/in-flight (regNxtBusy). The original two
+ * checks had byte-identical bodies, so they are folded into a single
+ * short-circuit condition: regNxtBusy is still only evaluated when
+ * regBusy is clear, exactly as before.
+ */
+bool
+VectorRegisterFile::operandsReady(Wavefront *w, GPUDynInstPtr ii) const
+{
+    for (int i = 0; i < ii->getNumOperands(); ++i) {
+        if (ii->isVectorRegister(i)) {
+            uint32_t vgprIdx = ii->getRegisterIndex(i);
+            uint32_t pVgpr = w->remap(vgprIdx, ii->getOperandSize(i), 1);
+
+            if (regBusy(pVgpr, ii->getOperandSize(i)) == 1 ||
+                regNxtBusy(pVgpr, ii->getOperandSize(i)) == 1) {
+                // record why we stalled: blocked on a destination
+                // operand (WAW/WAR) or on a source operand (RAW)
+                if (ii->isDstOperand(i)) {
+                    w->numTimesBlockedDueWAXDependencies++;
+                } else if (ii->isSrcOperand(i)) {
+                    w->numTimesBlockedDueRAWDependencies++;
+                }
+
+                return false;
+            }
+        }
+    }
+
+    return true;
+}
+
+/**
+ * Execution hook: marks each vector destination register of the
+ * instruction busy, clears its in-flight flag, and (for non-load,
+ * non-atomic results) schedules the event that frees the register once
+ * the SP/DP bypass pipeline has drained.
+ */
+void
+VectorRegisterFile::exec(GPUDynInstPtr ii, Wavefront *w)
+{
+    bool loadInstr = IS_OT_READ(ii->opType());
+    bool atomicInstr = IS_OT_ATOMIC(ii->opType());
+
+    // loads other than kernel-argument loads get their data from memory
+    // later, so they are excluded from the bypass-based free below
+    bool loadNoArgInstr = loadInstr && !ii->isArgLoad();
+
+    // iterate over all register destination operands
+    for (int i = 0; i < ii->getNumOperands(); ++i) {
+        if (ii->isVectorRegister(i) && ii->isDstOperand(i)) {
+            uint32_t physReg = w->remap(ii->getRegisterIndex(i),
+                                        ii->getOperandSize(i), 1);
+
+            // mark the destination vector register as busy
+            markReg(physReg, ii->getOperandSize(i), 1);
+            // clear the in-flight status of the destination vector register
+            preMarkReg(physReg, ii->getOperandSize(i), 0);
+
+            // FIXME: if we ever model correct timing behavior
+            // for load argument instructions then we should not
+            // set the destination register as busy now but when
+            // the data returns. Loads and Atomics should free
+            // their destination registers when the data returns,
+            // not now
+            if (!atomicInstr && !loadNoArgInstr) {
+                // pick the single- or double-precision bypass depth
+                // based on the operand width
+                uint32_t pipeLen = ii->getOperandSize(i) <= 4 ?
+                    computeUnit->spBypassLength() :
+                    computeUnit->dpBypassLength();
+
+                // schedule an event for marking the register as ready
+                computeUnit->registerEvent(w->simdId, physReg,
+                                           ii->getOperandSize(i),
+                                           computeUnit->shader->tick_cnt +
+                                           computeUnit->shader->ticks(pipeLen),
+                                           0);
+            }
+        }
+    }
+}
+
+/**
+ * Schedule every register in regVec to be marked free once <timestamp>
+ * ticks (plus any extra delay) have elapsed. Returns the extra delay
+ * applied, which is currently always 0.
+ */
+int
+VectorRegisterFile::exec(uint64_t dynamic_id, Wavefront *w,
+                         std::vector<uint32_t> &regVec, uint32_t operandSize,
+                         uint64_t timestamp)
+{
+    // NOTE(review): delay is always 0 here; it looks like a hook for
+    // modeling extra write-back latency -- confirm intended use
+    int delay = 0;
+
+    panic_if(regVec.size() <= 0, "Illegal VGPR vector size=%d\n",
+             regVec.size());
+
+    for (int i = 0; i < regVec.size(); ++i) {
+        // mark the destination VGPR as free when the timestamp expires
+        computeUnit->registerEvent(w->simdId, regVec[i], operandSize,
+                                   computeUnit->shader->tick_cnt + timestamp +
+                                   computeUnit->shader->ticks(delay), 0);
+    }
+
+    return delay;
+}
+
+void
+VectorRegisterFile::updateResources(Wavefront *w, GPUDynInstPtr ii)
+{
+    // flag every vector destination register of the instruction as
+    // in-flight so subsequent instructions observe the pending write
+    for (int op = 0; op < ii->getNumOperands(); ++op) {
+        if (!ii->isVectorRegister(op) || !ii->isDstOperand(op))
+            continue;
+
+        uint32_t physReg = w->remap(ii->getRegisterIndex(op),
+                                    ii->getOperandSize(op), 1);
+        preMarkReg(physReg, ii->getOperandSize(op), 1);
+    }
+}
+
+bool
+VectorRegisterFile::vrfOperandAccessReady(uint64_t dynamic_id, Wavefront *w,
+                                          GPUDynInstPtr ii,
+                                          VrfAccessType accessType)
+{
+    // the baseline VRF never back-pressures operand accesses
+    return true;
+}
+
+bool
+VectorRegisterFile::vrfOperandAccessReady(Wavefront *w, GPUDynInstPtr ii,
+                                          VrfAccessType accessType)
+{
+    // the baseline VRF never back-pressures operand accesses
+    return true;
+}
+
+// Params factory hook: instantiate a VectorRegisterFile from its
+// python params
+VectorRegisterFile*
+VectorRegisterFileParams::create()
+{
+    return new VectorRegisterFile(this);
+}
diff --git a/src/gpu-compute/vector_register_file.hh b/src/gpu-compute/vector_register_file.hh
new file mode 100644
index 000000000..1cb011a1e
--- /dev/null
+++ b/src/gpu-compute/vector_register_file.hh
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos
+ */
+
+#ifndef __VECTOR_REGISTER_FILE_HH__
+#define __VECTOR_REGISTER_FILE_HH__
+
+#include <list>
+
+#include "base/statistics.hh"
+#include "base/types.hh"
+#include "gpu-compute/vector_register_state.hh"
+#include "sim/sim_object.hh"
+
+class ComputeUnit;
+class Shader;
+class SimplePoolManager;
+class Wavefront;
+
+struct VectorRegisterFileParams;
+
+// Bitmask describing which kind of VRF access an instruction needs;
+// READ and WRITE may be combined (RD_WR) for read-modify-write operands.
+enum class VrfAccessType : uint8_t
+{
+    READ = 0x01,
+    WRITE = 0x02,
+    RD_WR = READ | WRITE
+};
+
+// Vector Register File: models one VRF bank per SIMD unit, tracking both
+// register contents (via VecRegisterState) and per-register busy status
+// for dependence checking. Subclasses may refine the timing behavior.
+class VectorRegisterFile : public SimObject
+{
+  public:
+    VectorRegisterFile(const VectorRegisterFileParams *p);
+
+    // attach this VRF to its owning compute unit
+    void setParent(ComputeUnit *_computeUnit);
+
+    // Read a register
+    template<typename T>
+    T
+    read(int regIdx, int threadId=0)
+    {
+        T p0 = vgprState->read<T>(regIdx, threadId);
+
+        return p0;
+    }
+
+    // Write a register
+    template<typename T>
+    void
+    write(int regIdx, T value, int threadId=0)
+    {
+        vgprState->write<T>(regIdx, value, threadId);
+    }
+
+    // current / next-cycle busy status of a register of the given width
+    uint8_t regBusy(int idx, uint32_t operandSize) const;
+    uint8_t regNxtBusy(int idx, uint32_t operandSize) const;
+
+    int numRegs() const { return numRegsPerSimd; }
+
+    // set (or clear) the busy / pre-busy flags for a register
+    void markReg(int regIdx, uint32_t operandSize, uint8_t value);
+    void preMarkReg(int regIdx, uint32_t operandSize, uint8_t value);
+
+    // execute the register-file side effects of an instruction
+    virtual void exec(GPUDynInstPtr ii, Wavefront *w);
+
+    // schedule the registers in regVec to be freed; returns the added delay
+    virtual int exec(uint64_t dynamic_id, Wavefront *w,
+                     std::vector<uint32_t> &regVec, uint32_t operandSize,
+                     uint64_t timestamp);
+
+    bool operandsReady(Wavefront *w, GPUDynInstPtr ii) const;
+    virtual void updateEvents() { }
+    virtual void updateResources(Wavefront *w, GPUDynInstPtr ii);
+
+    // the baseline model has no banking conflicts; subclasses override
+    virtual bool
+    isReadConflict(int memWfId, int exeWfId) const
+    {
+        return false;
+    }
+
+    virtual bool
+    isWriteConflict(int memWfId, int exeWfId) const
+    {
+        return false;
+    }
+
+    virtual bool vrfOperandAccessReady(uint64_t dynamic_id, Wavefront *w,
+                                       GPUDynInstPtr ii,
+                                       VrfAccessType accessType);
+
+    virtual bool vrfOperandAccessReady(Wavefront *w, GPUDynInstPtr ii,
+                                       VrfAccessType accessType);
+
+    // pool manager for VGPR allocation/deallocation
+    SimplePoolManager *manager;
+
+  protected:
+    ComputeUnit* computeUnit;
+    int simdId;
+
+    // flag indicating if a register is busy
+    std::vector<uint8_t> busy;
+    // flag indicating if a register will be busy (by instructions
+    // in the SIMD pipeline)
+    std::vector<uint8_t> nxtBusy;
+
+    // numer of registers (bank size) per simd unit (bank)
+    int numRegsPerSimd;
+
+    // vector register state
+    VecRegisterState *vgprState;
+};
+
+#endif // __VECTOR_REGISTER_FILE_HH__
diff --git a/src/gpu-compute/vector_register_state.cc b/src/gpu-compute/vector_register_state.cc
new file mode 100644
index 000000000..f231b0579
--- /dev/null
+++ b/src/gpu-compute/vector_register_state.cc
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos
+ */
+
+#include "gpu-compute/vector_register_state.hh"
+
+#include "gpu-compute/compute_unit.hh"
+
+VecRegisterState::VecRegisterState() : computeUnit(nullptr)
+{
+    // register arrays start empty; init() sizes them once the VRF
+    // configuration is known
+    s_reg.clear();
+    d_reg.clear();
+}
+
+void
+VecRegisterState::setParent(ComputeUnit *_computeUnit)
+{
+    computeUnit = _computeUnit;
+    // derive our stats/debug name from the owning compute unit
+    _name = computeUnit->name() + ".VecRegState";
+}
+
+void
+VecRegisterState::init(uint32_t _size)
+{
+    // allocate _size single-precision and _size double-precision registers,
+    // each VSZ lanes wide
+    s_reg.resize(_size);
+    d_reg.resize(_size);
+}
diff --git a/src/gpu-compute/vector_register_state.hh b/src/gpu-compute/vector_register_state.hh
new file mode 100644
index 000000000..a233b9acc
--- /dev/null
+++ b/src/gpu-compute/vector_register_state.hh
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos
+ */
+
+#ifndef __VECTOR_REGISTER_STATE_HH__
+#define __VECTOR_REGISTER_STATE_HH__
+
+#include <array>
+#include <cassert>
+#include <cstring>
+#include <string>
+#include <vector>
+
+#include "gpu-compute/misc.hh"
+
+class ComputeUnit;
+
+// Vector Register State per SIMD unit (contents of the vector
+// registers in the VRF of the SIMD)
+class VecRegisterState
+{
+  public:
+    VecRegisterState();
+    void init(uint32_t _size);
+
+    const std::string& name() const { return _name; }
+    void setParent(ComputeUnit *_computeUnit);
+    void regStats() { }
+
+    // Access methods
+    //
+    // Read a 4-byte value (from s_reg) or an 8-byte value (from d_reg)
+    // for the given register index and lane. Type-punning is done with
+    // std::memcpy rather than the original pointer cast: dereferencing
+    // the uint32_t/uint64_t storage through a T* (e.g. float*/double*)
+    // violates strict aliasing and is undefined behavior.
+    template<typename T>
+    T
+    read(int regIdx, int threadId=0) {
+        T value;
+        assert(sizeof(T) == 4 || sizeof(T) == 8);
+        if (sizeof(T) == 4) {
+            std::memcpy(&value, &s_reg[regIdx][threadId], sizeof(T));
+        } else {
+            std::memcpy(&value, &d_reg[regIdx][threadId], sizeof(T));
+        }
+
+        return value;
+    }
+
+    // Write a 4-byte (s_reg) or 8-byte (d_reg) value; memcpy for the
+    // same strict-aliasing reason as read() above.
+    template<typename T>
+    void
+    write(unsigned int regIdx, T value, int threadId=0) {
+        assert(sizeof(T) == 4 || sizeof(T) == 8);
+        if (sizeof(T) == 4) {
+            std::memcpy(&s_reg[regIdx][threadId], &value, sizeof(T));
+        } else {
+            std::memcpy(&d_reg[regIdx][threadId], &value, sizeof(T));
+        }
+    }
+
+    // (Single Precision) Vector Register File size.
+    int regSize() { return s_reg.size(); }
+
+  private:
+    ComputeUnit *computeUnit;
+    std::string _name;
+    // 32-bit Single Precision Vector Register State
+    std::vector<std::array<uint32_t, VSZ>> s_reg;
+    // 64-bit Double Precision Vector Register State
+    std::vector<std::array<uint64_t, VSZ>> d_reg;
+};
+
+#endif // __VECTOR_REGISTER_STATE_HH__
diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc
new file mode 100644
index 000000000..0aa033db1
--- /dev/null
+++ b/src/gpu-compute/wavefront.cc
@@ -0,0 +1,925 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Lisa Hsu
+ */
+
+#include "gpu-compute/wavefront.hh"
+
+#include "debug/GPUExec.hh"
+#include "debug/WavefrontStack.hh"
+#include "gpu-compute/code_enums.hh"
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/shader.hh"
+#include "gpu-compute/vector_register_file.hh"
+
+Wavefront*
+WavefrontParams::create()
+{
+    // Python-config factory hook: build a Wavefront from its param struct
+    return new Wavefront(this);
+}
+
+Wavefront::Wavefront(const Params *p)
+    : SimObject(p), callArgMem(nullptr)
+{
+    last_trace = 0;
+    // position of this wavefront within the CU: SIMD unit and WF slot
+    simdId = p->simdId;
+    wfSlotId = p->wf_slot_id;
+
+    // waves start stopped with no VGPRs reserved until dispatch
+    status = S_STOPPED;
+    reservedVectorRegs = 0;
+    startVgprIndex = 0;
+    // counters for memory requests outstanding (issued to memory) and
+    // still in the CU pipelines, broken down by rd/wr and global/local
+    outstanding_reqs = 0;
+    mem_reqs_in_pipe = 0;
+    outstanding_reqs_wr_gm = 0;
+    outstanding_reqs_wr_lm = 0;
+    outstanding_reqs_rd_gm = 0;
+    outstanding_reqs_rd_lm = 0;
+    rd_lm_reqs_in_pipe = 0;
+    rd_gm_reqs_in_pipe = 0;
+    wr_lm_reqs_in_pipe = 0;
+    wr_gm_reqs_in_pipe = 0;
+
+    // barrier bookkeeping; old_barrier_cnt tracks the last barrier passed
+    barrier_cnt = 0;
+    old_barrier_cnt = 0;
+    stalledAtBarrier = false;
+
+    mem_trace_busy = 0;
+    // sentinel "never written" timestamps for SP/DP VGPR trace counters
+    old_vgpr_tcnt = 0xffffffffffffffffll;
+    old_dgpr_tcnt = 0xffffffffffffffffll;
+
+    pendingFetch = false;
+    dropFetch = false;
+    // NOTE(review): condRegState is owned by this wave; confirm it is
+    // freed somewhere, the visible destructor does not delete it
+    condRegState = new ConditionRegisterState();
+    maxSpVgprs = 0;
+    maxDpVgprs = 0;
+}
+
+void
+Wavefront::regStats()
+{
+    // register per-wavefront statistics with the gem5 stats framework;
+    // distributions of source/destination register-operand counts
+    srcRegOpDist
+        .init(0, 4, 2)
+        .name(name() + ".src_reg_operand_dist")
+        .desc("number of executed instructions with N source register operands")
+        ;
+
+    dstRegOpDist
+        .init(0, 3, 2)
+        .name(name() + ".dst_reg_operand_dist")
+        .desc("number of executed instructions with N destination register "
+              "operands")
+        ;
+
+    // FIXME: the name of the WF needs to be unique
+    numTimesBlockedDueWAXDependencies
+        .name(name() + ".timesBlockedDueWAXDependencies")
+        .desc("number of times the wf's instructions are blocked due to WAW "
+              "or WAR dependencies")
+        ;
+
+    // FIXME: the name of the WF needs to be unique
+    numTimesBlockedDueRAWDependencies
+        .name(name() + ".timesBlockedDueRAWDependencies")
+        .desc("number of times the wf's instructions are blocked due to RAW "
+              "dependencies")
+        ;
+
+    // FIXME: the name of the WF needs to be unique
+    numTimesBlockedDueVrfPortAvail
+        .name(name() + ".timesBlockedDueVrfPortAvail")
+        .desc("number of times instructions are blocked due to VRF port "
+              "availability")
+        ;
+}
+
+void
+Wavefront::init()
+{
+    // no VGPRs are reserved until the dispatcher assigns this wave a range
+    reservedVectorRegs = 0;
+    startVgprIndex = 0;
+}
+
+void
+Wavefront::resizeRegFiles(int num_cregs, int num_sregs, int num_dregs)
+{
+    // size the condition register file and record the per-wave limits on
+    // single-precision (32-bit) and double-precision (64-bit) VGPRs
+    condRegState->init(num_cregs);
+    maxSpVgprs = num_sregs;
+    maxDpVgprs = num_dregs;
+}
+
+Wavefront::~Wavefront()
+{
+    // deleting a null pointer is a no-op, so the null guard the original
+    // carried is unnecessary
+    delete callArgMem;
+    // NOTE(review): condRegState is allocated in the constructor but never
+    // freed here -- possible leak unless ownership is transferred
+    // elsewhere; confirm before adding a delete
+}
+
+void
+Wavefront::start(uint64_t _wfDynId,uint64_t _base_ptr)
+{
+    // record the dynamic wavefront id and base pointer, then mark the
+    // wave runnable for the scheduler
+    wfDynId = _wfDynId;
+    base_ptr = _base_ptr;
+    status = S_RUNNING;
+}
+
+bool
+Wavefront::isGmInstruction(GPUDynInstPtr ii)
+{
+    const auto op = ii->opType();
+
+    // private-segment accesses travel through the global memory pipeline
+    if (IS_OT_READ_PM(op) || IS_OT_WRITE_PM(op) || IS_OT_ATOMIC_PM(op)) {
+        return true;
+    }
+
+    // as do global-segment loads, stores and atomics
+    if (IS_OT_READ_GM(op) || IS_OT_WRITE_GM(op) || IS_OT_ATOMIC_GM(op)) {
+        return true;
+    }
+
+    // flat accesses are resolved at execute time but count as GM here
+    return IS_OT_FLAT(op);
+}
+
+bool
+Wavefront::isLmInstruction(GPUDynInstPtr ii)
+{
+    const auto op = ii->opType();
+
+    // local (LDS) loads, stores and atomics are the only LM instructions
+    return IS_OT_READ_LM(op) || IS_OT_WRITE_LM(op) || IS_OT_ATOMIC_LM(op);
+}
+
+bool
+Wavefront::isOldestInstALU()
+{
+    assert(!instructionBuffer.empty());
+    // peek at the oldest (front) instruction without consuming it
+    const GPUDynInstPtr &oldest = instructionBuffer.front();
+    const auto op = oldest->opType();
+
+    // a stopped wave issues nothing; otherwise NOPs, returns, branches,
+    // plain ALU ops, LDS address computations and kernel-argument reads
+    // all issue to the ALU pipe
+    return status != S_STOPPED &&
+           (op == Enums::OT_NOP || op == Enums::OT_RET ||
+            op == Enums::OT_BRANCH || op == Enums::OT_ALU ||
+            IS_OT_LDAS(op) || op == Enums::OT_KERN_READ);
+}
+
+bool
+Wavefront::isOldestInstBarrier()
+{
+    assert(!instructionBuffer.empty());
+    // peek at the oldest (front) instruction without consuming it
+    const GPUDynInstPtr &oldest = instructionBuffer.front();
+
+    // a stopped wave cannot issue a barrier
+    return status != S_STOPPED && oldest->opType() == Enums::OT_BARRIER;
+}
+
+bool
+Wavefront::isOldestInstGMem()
+{
+    assert(!instructionBuffer.empty());
+    // peek at the oldest (front) instruction without consuming it
+    const GPUDynInstPtr &oldest = instructionBuffer.front();
+    const auto op = oldest->opType();
+
+    // global-memory loads, stores and atomics all issue to the GM pipe
+    return status != S_STOPPED &&
+           (IS_OT_READ_GM(op) || IS_OT_WRITE_GM(op) || IS_OT_ATOMIC_GM(op));
+}
+
+bool
+Wavefront::isOldestInstLMem()
+{
+    assert(!instructionBuffer.empty());
+    // peek at the oldest (front) instruction without consuming it
+    const GPUDynInstPtr &oldest = instructionBuffer.front();
+    const auto op = oldest->opType();
+
+    // local (LDS) loads, stores and atomics all issue to the LM pipe
+    return status != S_STOPPED &&
+           (IS_OT_READ_LM(op) || IS_OT_WRITE_LM(op) || IS_OT_ATOMIC_LM(op));
+}
+
+bool
+Wavefront::isOldestInstPrivMem()
+{
+    assert(!instructionBuffer.empty());
+    // peek at the oldest (front) instruction without consuming it
+    const GPUDynInstPtr &oldest = instructionBuffer.front();
+    const auto op = oldest->opType();
+
+    // private-segment loads, stores and atomics
+    return status != S_STOPPED &&
+           (IS_OT_READ_PM(op) || IS_OT_WRITE_PM(op) || IS_OT_ATOMIC_PM(op));
+}
+
+bool
+Wavefront::isOldestInstFlatMem()
+{
+    assert(!instructionBuffer.empty());
+    // peek at the oldest (front) instruction without consuming it
+    const GPUDynInstPtr &oldest = instructionBuffer.front();
+
+    // flat accesses resolve their segment (global vs. LDS) at execute time
+    return status != S_STOPPED && IS_OT_FLAT(oldest->opType());
+}
+
+// Scan the instruction buffer for any control-flow instruction
+// (branch or return) that has not yet been issued.
+bool
+Wavefront::instructionBufferHasBranch()
+{
+    for (const auto &inst : instructionBuffer) {
+        const auto op = inst->opType();
+
+        if (op == Enums::OT_RET || op == Enums::OT_BRANCH) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+// Remap HSAIL register to physical VGPR.
+// HSAIL register = virtual register assigned to an operand by HLC compiler.
+// mode == 1 selects HSAIL-style mapping where the SP and DP name spaces
+// are separate; size > 4 identifies a double-precision register.
+uint32_t
+Wavefront::remap(uint32_t vgprIndex, uint32_t size, uint8_t mode)
+{
+    assert((vgprIndex < reservedVectorRegs) && (reservedVectorRegs > 0));
+    // add the offset from where the VGPRs of the wavefront have been assigned
+    uint32_t physicalVgprIndex = startVgprIndex + vgprIndex;
+    // HSAIL double precision (DP) register: calculate the physical VGPR index
+    // assuming that DP registers are placed after SP ones in the VRF. The DP
+    // and SP VGPR name spaces in HSAIL mode are separate so we need to adjust
+    // the DP VGPR index before mapping it to the physical VRF address space
+    if (mode == 1 && size > 4) {
+        // each DP register occupies two consecutive SP slots
+        physicalVgprIndex = startVgprIndex + maxSpVgprs + (2 * vgprIndex);
+    }
+
+    // the mapped register must fall inside this wave's reserved range
+    assert((startVgprIndex <= physicalVgprIndex) &&
+           (startVgprIndex + reservedVectorRegs - 1) >= physicalVgprIndex);
+
+    // calculate absolute physical VGPR index
+    return physicalVgprIndex % computeUnit->vrf[simdId]->numRegs();
+}
+
+// Return true (as int 1/0) if this wavefront is ready
+// to execute an instruction of the specified type. Checks, in order:
+// wave liveness, barrier state, and then per-instruction-class resource
+// availability (issue slot, pipe bus, request FIFO space, VRF ports,
+// operand readiness).
+int
+Wavefront::ready(itype_e type)
+{
+    // Check to make sure wave is running
+    if (status == S_STOPPED || status == S_RETURNING ||
+        instructionBuffer.empty()) {
+        return 0;
+    }
+
+    // Is the wave waiting at a barrier
+    if (stalledAtBarrier) {
+        if (!computeUnit->AllAtBarrier(barrier_id,barrier_cnt,
+                        computeUnit->getRefCounter(dispatchid, wg_id))) {
+            // Are all threads at barrier?
+            return 0;
+        }
+        old_barrier_cnt = barrier_cnt;
+        stalledAtBarrier = false;
+    }
+
+    // Read instruction
+    GPUDynInstPtr ii = instructionBuffer.front();
+
+    // ready_inst is only consulted by the assert at the end; it documents
+    // that every accepting branch below explicitly marked readiness
+    bool ready_inst M5_VAR_USED = false;
+    bool glbMemBusRdy = false;
+    bool glbMemIssueRdy = false;
+    if (type == I_GLOBAL || type == I_FLAT || type == I_PRIVATE) {
+        // any one free VRF->GM bus / wave slot is sufficient
+        for (int j=0; j < computeUnit->numGlbMemUnits; ++j) {
+            if (computeUnit->vrfToGlobalMemPipeBus[j].prerdy())
+                glbMemBusRdy = true;
+            if (computeUnit->wfWait[j].prerdy())
+                glbMemIssueRdy = true;
+        }
+    }
+    bool locMemBusRdy = false;
+    bool locMemIssueRdy = false;
+    if (type == I_SHARED) {
+        for (int j=0; j < computeUnit->numLocMemUnits; ++j) {
+            if (computeUnit->vrfToLocalMemPipeBus[j].prerdy())
+                locMemBusRdy = true;
+            if (computeUnit->wfWait[j].prerdy())
+                locMemIssueRdy = true;
+        }
+    }
+
+    // The following code is very error prone and the entire process for
+    // checking readiness will be fixed eventually.  In the meantime, let's
+    // make sure that we do not silently let an instruction type slip
+    // through this logic and always return not ready.
+    if (!(ii->opType() == Enums::OT_BARRIER || ii->opType() == Enums::OT_NOP ||
+          ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH ||
+          ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) ||
+          ii->opType() == Enums::OT_KERN_READ ||
+          ii->opType() == Enums::OT_ARG ||
+          IS_OT_READ_GM(ii->opType()) || IS_OT_WRITE_GM(ii->opType()) ||
+          IS_OT_ATOMIC_GM(ii->opType()) || IS_OT_READ_LM(ii->opType()) ||
+          IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()) ||
+          IS_OT_READ_PM(ii->opType()) || IS_OT_WRITE_PM(ii->opType()) ||
+          IS_OT_ATOMIC_PM(ii->opType()) || IS_OT_FLAT(ii->opType()))) {
+        panic("next instruction: %s is of unknown type\n", ii->disassemble());
+    }
+
+    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Checking Read for Inst : %s\n",
+            computeUnit->cu_id, simdId, wfSlotId, ii->disassemble());
+
+    if (type == I_ALU && ii->opType() == Enums::OT_BARRIER) {
+        // Here for ALU instruction (barrier)
+        if (!computeUnit->wfWait[simdId].prerdy()) {
+            // Is wave slot free?
+            return 0;
+        }
+
+        // Are there in pipe or outstanding memory requests?
+        if ((outstanding_reqs + mem_reqs_in_pipe) > 0) {
+            return 0;
+        }
+
+        ready_inst = true;
+    } else if (type == I_ALU && ii->opType() == Enums::OT_NOP) {
+        // Here for ALU instruction (nop)
+        if (!computeUnit->wfWait[simdId].prerdy()) {
+            // Is wave slot free?
+            return 0;
+        }
+
+        ready_inst = true;
+    } else if (type == I_ALU && ii->opType() == Enums::OT_RET) {
+        // Here for ALU instruction (return)
+        if (!computeUnit->wfWait[simdId].prerdy()) {
+            // Is wave slot free?
+            return 0;
+        }
+
+        // Are there in pipe or outstanding memory requests?
+        if ((outstanding_reqs + mem_reqs_in_pipe) > 0) {
+            return 0;
+        }
+
+        ready_inst = true;
+    } else if (type == I_ALU && (ii->opType() == Enums::OT_BRANCH ||
+               ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) ||
+               ii->opType() == Enums::OT_KERN_READ ||
+               ii->opType() == Enums::OT_ARG)) {
+        // Here for ALU instruction (all others)
+        if (!computeUnit->wfWait[simdId].prerdy()) {
+            // Is alu slot free?
+            return 0;
+        }
+        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
+                    VrfAccessType::RD_WR)) {
+            return 0;
+        }
+
+        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
+            return 0;
+        }
+        ready_inst = true;
+    } else if (type == I_GLOBAL && (IS_OT_READ_GM(ii->opType()) ||
+               IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()))) {
+        // Here Global memory instruction
+        if (IS_OT_READ_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType())) {
+            // Are there in pipe or outstanding global memory write requests?
+            if ((outstanding_reqs_wr_gm + wr_gm_reqs_in_pipe) > 0) {
+                return 0;
+            }
+        }
+
+        if (IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()) ||
+            IS_OT_HIST_GM(ii->opType())) {
+            // Are there in pipe or outstanding global memory read requests?
+            if ((outstanding_reqs_rd_gm + rd_gm_reqs_in_pipe) > 0)
+                return 0;
+        }
+
+        if (!glbMemIssueRdy) {
+            // Is WV issue slot free?
+            return 0;
+        }
+
+        if (!glbMemBusRdy) {
+            // Is there an available VRF->Global memory read bus?
+            return 0;
+        }
+
+        if (!computeUnit->globalMemoryPipe.
+            isGMReqFIFOWrRdy(rd_gm_reqs_in_pipe + wr_gm_reqs_in_pipe)) {
+            // Can we insert a new request to the Global Mem Request FIFO?
+            return 0;
+        }
+        // can we schedule source & destination operands on the VRF?
+        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
+                    VrfAccessType::RD_WR)) {
+            return 0;
+        }
+        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
+            return 0;
+        }
+        ready_inst = true;
+    } else if (type == I_SHARED && (IS_OT_READ_LM(ii->opType()) ||
+               IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()))) {
+        // Here for Shared memory instruction
+        if (IS_OT_READ_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType())) {
+            if ((outstanding_reqs_wr_lm + wr_lm_reqs_in_pipe) > 0) {
+                return 0;
+            }
+        }
+
+        if (IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()) ||
+            IS_OT_HIST_LM(ii->opType())) {
+            if ((outstanding_reqs_rd_lm + rd_lm_reqs_in_pipe) > 0) {
+                return 0;
+            }
+        }
+
+        if (!locMemBusRdy) {
+            // Is there an available VRF->LDS read bus?
+            return 0;
+        }
+        if (!locMemIssueRdy) {
+            // Is wave slot free?
+            return 0;
+        }
+
+        if (!computeUnit->localMemoryPipe.
+            isLMReqFIFOWrRdy(rd_lm_reqs_in_pipe + wr_lm_reqs_in_pipe)) {
+            // Can we insert a new request to the LDS Request FIFO?
+            return 0;
+        }
+        // can we schedule source & destination operands on the VRF?
+        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
+                    VrfAccessType::RD_WR)) {
+            return 0;
+        }
+        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
+            return 0;
+        }
+        ready_inst = true;
+    } else if (type == I_PRIVATE && (IS_OT_READ_PM(ii->opType()) ||
+               IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()))) {
+        // Here for Private memory instruction ------------------------    //
+        if (IS_OT_READ_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType())) {
+            if ((outstanding_reqs_wr_gm + wr_gm_reqs_in_pipe) > 0) {
+                return 0;
+            }
+        }
+
+        if (IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()) ||
+            IS_OT_HIST_PM(ii->opType())) {
+            if ((outstanding_reqs_rd_gm + rd_gm_reqs_in_pipe) > 0) {
+                return 0;
+            }
+        }
+
+        if (!glbMemBusRdy) {
+            // Is there an available VRF->Global memory read bus?
+            return 0;
+        }
+
+        if (!glbMemIssueRdy) {
+            // Is wave slot free?
+            return 0;
+        }
+
+        if (!computeUnit->globalMemoryPipe.
+            isGMReqFIFOWrRdy(rd_gm_reqs_in_pipe + wr_gm_reqs_in_pipe)) {
+            // Can we insert a new request to the Global Mem Request FIFO?
+            return 0;
+        }
+        // can we schedule source & destination operands on the VRF?
+        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
+                    VrfAccessType::RD_WR)) {
+            return 0;
+        }
+        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
+            return 0;
+        }
+        ready_inst = true;
+    } else if (type == I_FLAT && IS_OT_FLAT(ii->opType())) {
+        // flat accesses must be able to issue to BOTH pipes since the
+        // target segment is not known until execute
+        if (!glbMemBusRdy) {
+            // Is there an available VRF->Global memory read bus?
+            return 0;
+        }
+
+        if (!locMemBusRdy) {
+            // Is there an available VRF->LDS read bus?
+            return 0;
+        }
+
+        if (!glbMemIssueRdy) {
+            // Is wave slot free?
+            return 0;
+        }
+
+        if (!locMemIssueRdy) {
+            return 0;
+        }
+        if (!computeUnit->globalMemoryPipe.
+            isGMReqFIFOWrRdy(rd_gm_reqs_in_pipe + wr_gm_reqs_in_pipe)) {
+            // Can we insert a new request to the Global Mem Request FIFO?
+            return 0;
+        }
+
+        if (!computeUnit->localMemoryPipe.
+            isLMReqFIFOWrRdy(rd_lm_reqs_in_pipe + wr_lm_reqs_in_pipe)) {
+            // Can we insert a new request to the LDS Request FIFO?
+            return 0;
+        }
+        // can we schedule source & destination operands on the VRF?
+        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
+                    VrfAccessType::RD_WR)) {
+            return 0;
+        }
+        // are all the operands ready? (RAW, WAW and WAR depedencies met?)
+        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
+            return 0;
+        }
+        ready_inst = true;
+    } else {
+        return 0;
+    }
+
+    assert(ready_inst);
+
+    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n", computeUnit->cu_id,
+            simdId, wfSlotId, ii->disassemble());
+
+    return 1;
+}
+
+// Reserve CU pipeline resources for the instruction at the head of the
+// buffer: preset the appropriate pipe bus and wave-slot busy timers and
+// bump the in-pipe memory request counters. Assumes ready() already
+// approved the instruction, so no availability checks are repeated here.
+void
+Wavefront::updateResources()
+{
+    // Get current instruction
+    GPUDynInstPtr ii = instructionBuffer.front();
+    assert(ii);
+    computeUnit->vrf[simdId]->updateResources(this, ii);
+    // Single precision ALU or Branch or Return or Special instruction
+    if (ii->opType() == Enums::OT_ALU || ii->opType() == Enums::OT_SPECIAL ||
+        ii->opType() == Enums::OT_BRANCH || IS_OT_LDAS(ii->opType()) ||
+        // FIXME: Kernel argument loads are currently treated as ALU operations
+        // since we don't send memory packets at execution. If we fix that then
+        // we should map them to one of the memory pipelines
+        ii->opType()==Enums::OT_KERN_READ ||
+        ii->opType()==Enums::OT_ARG ||
+        ii->opType()==Enums::OT_RET) {
+        computeUnit->aluPipe[simdId].preset(computeUnit->shader->
+                                            ticks(computeUnit->spBypassLength()));
+        // this is to enforce a fixed number of cycles per issue slot per SIMD
+        computeUnit->wfWait[simdId].preset(computeUnit->shader->
+                                           ticks(computeUnit->issuePeriod));
+    } else if (ii->opType() == Enums::OT_BARRIER) {
+        computeUnit->wfWait[simdId].preset(computeUnit->shader->
+                                           ticks(computeUnit->issuePeriod));
+    } else if (ii->opType() == Enums::OT_FLAT_READ) {
+        // flat accesses were resolved to a segment at execute; route the
+        // request to the LDS or global pipe accordingly
+        assert(Enums::SC_NONE != ii->executedAs());
+        mem_reqs_in_pipe++;
+        rd_gm_reqs_in_pipe++;
+        if ( Enums::SC_SHARED == ii->executedAs() ) {
+            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
+                preset(computeUnit->shader->ticks(4));
+            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
+                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
+        } else {
+            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
+                preset(computeUnit->shader->ticks(4));
+            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
+                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
+        }
+    } else if (ii->opType() == Enums::OT_FLAT_WRITE) {
+        assert(Enums::SC_NONE != ii->executedAs());
+        mem_reqs_in_pipe++;
+        wr_gm_reqs_in_pipe++;
+        if (Enums::SC_SHARED == ii->executedAs()) {
+            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
+                preset(computeUnit->shader->ticks(8));
+            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
+                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
+        } else {
+            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
+                preset(computeUnit->shader->ticks(8));
+            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
+                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
+        }
+    } else if (IS_OT_READ_GM(ii->opType())) {
+        mem_reqs_in_pipe++;
+        rd_gm_reqs_in_pipe++;
+        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
+            preset(computeUnit->shader->ticks(4));
+        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
+            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
+    } else if (IS_OT_WRITE_GM(ii->opType())) {
+        mem_reqs_in_pipe++;
+        wr_gm_reqs_in_pipe++;
+        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
+            preset(computeUnit->shader->ticks(8));
+        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
+            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
+    } else if (IS_OT_ATOMIC_GM(ii->opType())) {
+        // atomics count as both a read and a write in flight
+        mem_reqs_in_pipe++;
+        wr_gm_reqs_in_pipe++;
+        rd_gm_reqs_in_pipe++;
+        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
+            preset(computeUnit->shader->ticks(8));
+        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
+            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
+    } else if (IS_OT_READ_LM(ii->opType())) {
+        mem_reqs_in_pipe++;
+        rd_lm_reqs_in_pipe++;
+        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
+            preset(computeUnit->shader->ticks(4));
+        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
+            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
+    } else if (IS_OT_WRITE_LM(ii->opType())) {
+        mem_reqs_in_pipe++;
+        wr_lm_reqs_in_pipe++;
+        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
+            preset(computeUnit->shader->ticks(8));
+        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
+            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
+    } else if (IS_OT_ATOMIC_LM(ii->opType())) {
+        mem_reqs_in_pipe++;
+        wr_lm_reqs_in_pipe++;
+        rd_lm_reqs_in_pipe++;
+        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
+            preset(computeUnit->shader->ticks(8));
+        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
+            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
+    } else if (IS_OT_READ_PM(ii->opType())) {
+        // private-segment requests share the global memory pipe
+        mem_reqs_in_pipe++;
+        rd_gm_reqs_in_pipe++;
+        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
+            preset(computeUnit->shader->ticks(4));
+        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
+            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
+    } else if (IS_OT_WRITE_PM(ii->opType())) {
+        mem_reqs_in_pipe++;
+        wr_gm_reqs_in_pipe++;
+        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
+            preset(computeUnit->shader->ticks(8));
+        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
+            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
+    } else if (IS_OT_ATOMIC_PM(ii->opType())) {
+        mem_reqs_in_pipe++;
+        wr_gm_reqs_in_pipe++;
+        rd_gm_reqs_in_pipe++;
+        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
+            preset(computeUnit->shader->ticks(8));
+        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
+            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
+    }
+}
+
+/**
+ * Execute the instruction at the head of this wavefront's instruction
+ * buffer: run it, update the PC / reconvergence stack, sample execution
+ * statistics, and reserve the execution resources (ALU pipe or the
+ * VRF-to-memory buses) for the latency implied by the instruction type.
+ */
+void
+Wavefront::exec()
+{
+    // ---- Exit if wavefront is inactive ----------------------------- //
+
+    if (status == S_STOPPED || status == S_RETURNING ||
+        instructionBuffer.empty()) {
+        return;
+    }
+
+    // Get current instruction
+
+    GPUDynInstPtr ii = instructionBuffer.front();
+
+    const uint32_t old_pc = pc();
+    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
+            "(pc: %i)\n", computeUnit->cu_id, simdId, wfSlotId, wfDynId,
+            ii->disassemble(), old_pc);
+    ii->execute();
+    // access the VRF
+    computeUnit->vrf[simdId]->exec(ii, this);
+    // sample per-instruction register-operand counts (VRF load statistics)
+    srcRegOpDist.sample(ii->numSrcRegOperands());
+    dstRegOpDist.sample(ii->numDstRegOperands());
+    computeUnit->numInstrExecuted++;
+    // cycles elapsed since the previous instruction executed on this SIMD
+    computeUnit->execRateDist.sample(computeUnit->totalCycles.value() -
+                                     computeUnit->lastExecCycle[simdId]);
+    computeUnit->lastExecCycle[simdId] = computeUnit->totalCycles.value();
+    if (pc() == old_pc) {
+        uint32_t new_pc = old_pc + 1;
+        // PC not modified by instruction, proceed to next or pop frame
+        pc(new_pc);
+        if (new_pc == rpc()) {
+            // reached the reconvergence point: restore the parent frame
+            // and discard any already/still fetched instructions
+            popFromReconvergenceStack();
+            discardFetch();
+        } else {
+            instructionBuffer.pop_front();
+        }
+    }
+
+    // lane-level divergence statistics are only meaningful in SIMT mode
+    if (computeUnit->shader->hsail_mode == Shader::SIMT) {
+        const int num_active_lanes = execMask().count();
+        computeUnit->controlFlowDivergenceDist.sample(num_active_lanes);
+        computeUnit->numVecOpsExecuted += num_active_lanes;
+        if (isGmInstruction(ii)) {
+            computeUnit->activeLanesPerGMemInstrDist.sample(num_active_lanes);
+        } else if (isLmInstruction(ii)) {
+            computeUnit->activeLanesPerLMemInstrDist.sample(num_active_lanes);
+        }
+    }
+
+    // ---- Update Vector ALU pipeline and other resources ------------------ //
+    // Single precision ALU or Branch or Return or Special instruction
+    if (ii->opType() == Enums::OT_ALU || ii->opType() == Enums::OT_SPECIAL ||
+        ii->opType() == Enums::OT_BRANCH || IS_OT_LDAS(ii->opType()) ||
+        // FIXME: Kernel argument loads are currently treated as ALU operations
+        // since we don't send memory packets at execution. If we fix that then
+        // we should map them to one of the memory pipelines
+        ii->opType() == Enums::OT_KERN_READ ||
+        ii->opType() == Enums::OT_ARG ||
+        ii->opType() == Enums::OT_RET) {
+        computeUnit->aluPipe[simdId].set(computeUnit->shader->
+            ticks(computeUnit->spBypassLength()));
+
+        // this is to enforce a fixed number of cycles per issue slot per SIMD
+        computeUnit->wfWait[simdId].set(computeUnit->shader->
+            ticks(computeUnit->issuePeriod));
+    } else if (ii->opType() == Enums::OT_BARRIER) {
+        computeUnit->wfWait[simdId].set(computeUnit->shader->
+            ticks(computeUnit->issuePeriod));
+    } else if (ii->opType() == Enums::OT_FLAT_READ) {
+        // flat accesses are resolved at execution time to either the LDS
+        // (SC_SHARED) or the global memory pipeline
+        assert(Enums::SC_NONE != ii->executedAs());
+
+        if (Enums::SC_SHARED == ii->executedAs()) {
+            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
+                set(computeUnit->shader->ticks(4));
+            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
+                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
+        } else {
+            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
+                set(computeUnit->shader->ticks(4));
+            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
+                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
+        }
+    } else if (ii->opType() == Enums::OT_FLAT_WRITE) {
+        assert(Enums::SC_NONE != ii->executedAs());
+        if (Enums::SC_SHARED == ii->executedAs()) {
+            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
+                set(computeUnit->shader->ticks(8));
+            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
+                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
+        } else {
+            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
+                set(computeUnit->shader->ticks(8));
+            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
+                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
+        }
+    // reads occupy the VRF-to-memory bus for 4 ticks; writes and atomics
+    // (which also write back a result) occupy it for 8
+    } else if (IS_OT_READ_GM(ii->opType())) {
+        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
+            set(computeUnit->shader->ticks(4));
+        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
+            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
+    } else if (IS_OT_WRITE_GM(ii->opType())) {
+        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
+            set(computeUnit->shader->ticks(8));
+        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
+            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
+    } else if (IS_OT_ATOMIC_GM(ii->opType())) {
+        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
+            set(computeUnit->shader->ticks(8));
+        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
+            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
+    } else if (IS_OT_READ_LM(ii->opType())) {
+        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
+            set(computeUnit->shader->ticks(4));
+        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
+            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
+    } else if (IS_OT_WRITE_LM(ii->opType())) {
+        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
+            set(computeUnit->shader->ticks(8));
+        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
+            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
+    } else if (IS_OT_ATOMIC_LM(ii->opType())) {
+        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
+            set(computeUnit->shader->ticks(8));
+        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
+            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
+    }
+}
+
+/**
+ * Return true while @p lane has not yet arrived at the current barrier,
+ * i.e. its per-lane barrier count trails the wavefront's maximum.
+ */
+bool
+Wavefront::waitingAtBarrier(int lane)
+{
+    const bool caught_up = (bar_cnt[lane] >= max_bar_cnt);
+    return !caught_up;
+}
+
+/**
+ * Push a new control-flow frame (PC, reconvergence PC, execution mask)
+ * onto the reconvergence stack; ownership of the new entry is transferred
+ * to the stack.
+ */
+void
+Wavefront::pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
+                                    const VectorMask& mask)
+{
+    // a frame must never be pushed with an empty execution mask
+    assert(mask.count());
+
+    std::unique_ptr<ReconvergenceStackEntry> frame(
+        new ReconvergenceStackEntry(pc, rpc, mask));
+    reconvergenceStack.push(std::move(frame));
+}
+
+/**
+ * Pop the top frame off the reconvergence stack, restoring the parent
+ * frame's PC and execution mask. The two DPRINTFs deliberately bracket
+ * the pop: the first prints the mask/PC of the frame being discarded,
+ * the second the mask/PC of the newly exposed frame.
+ */
+void
+Wavefront::popFromReconvergenceStack()
+{
+    assert(!reconvergenceStack.empty());
+
+    DPRINTF(WavefrontStack, "[%2d, %2d, %2d, %2d] %s %3i => ",
+            computeUnit->cu_id, simdId, wfSlotId, wfDynId,
+            execMask().to_string<char, std::string::traits_type,
+            std::string::allocator_type>().c_str(), pc());
+
+    reconvergenceStack.pop();
+
+    DPRINTF(WavefrontStack, "%3i %s\n", pc(),
+            execMask().to_string<char, std::string::traits_type,
+            std::string::allocator_type>().c_str());
+
+}
+
+/**
+ * Throw away everything already fetched for this wavefront and, if a
+ * fetch is still in flight, flag it to be dropped when it arrives.
+ */
+void
+Wavefront::discardFetch()
+{
+    instructionBuffer.clear();
+    if (pendingFetch) {
+        dropFetch = true;
+    }
+}
+
+/**
+ * Current PC of the wavefront: the PC stored in the top frame of the
+ * reconvergence stack.
+ */
+uint32_t
+Wavefront::pc() const
+{
+    return reconvergenceStack.top()->pc;
+}
+
+/**
+ * Reconvergence PC (immediate post-dominator) of the top stack frame.
+ */
+uint32_t
+Wavefront::rpc() const
+{
+    return reconvergenceStack.top()->rpc;
+}
+
+/**
+ * Current execution mask: the mask stored in the top stack frame
+ * (returned by value).
+ */
+VectorMask
+Wavefront::execMask() const
+{
+    return reconvergenceStack.top()->execMask;
+}
+
+/**
+ * Whether @p lane is active in the current execution mask.
+ */
+bool
+Wavefront::execMask(int lane) const
+{
+    return reconvergenceStack.top()->execMask[lane];
+}
+
+
+/**
+ * Overwrite the PC of the current (top) reconvergence stack frame.
+ */
+void
+Wavefront::pc(uint32_t new_pc)
+{
+    reconvergenceStack.top()->pc = new_pc;
+}
diff --git a/src/gpu-compute/wavefront.hh b/src/gpu-compute/wavefront.hh
new file mode 100644
index 000000000..0abab8e83
--- /dev/null
+++ b/src/gpu-compute/wavefront.hh
@@ -0,0 +1,368 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Lisa Hsu
+ */
+
+#ifndef __WAVEFRONT_HH__
+#define __WAVEFRONT_HH__
+
+#include <cassert>
+#include <deque>
+#include <memory>
+#include <stack>
+#include <vector>
+
+#include "base/misc.hh"
+#include "base/types.hh"
+#include "gpu-compute/condition_register_state.hh"
+#include "gpu-compute/lds_state.hh"
+#include "gpu-compute/misc.hh"
+#include "params/Wavefront.hh"
+#include "sim/sim_object.hh"
+
+// NOTE(review): appears to bound the per-wavefront instruction buffer
+// depth — confirm against the uses in the fetch/scoreboard stages
+static const int MAX_NUM_INSTS_PER_WF = 12;
+
+/*
+ * Arguments for the hsail opcode call, are user defined and variable length.
+ * The hardware/finalizer can support arguments in hardware or use memory to
+ * pass arguments. For now, let's assume that an unlimited number of arguments
+ * are supported in hardware (the compiler inlines functions whenever it can
+ * anyways, so unless someone is interested in the implications of linking/
+ * library functions, I think this is a reasonable assumption given the typical
+ * size of an OpenCL kernel).
+ *
+ * Note that call args are different than kernel arguments:
+ * * All work-items in a kernel refer the same set of kernel arguments
+ *   * Each work-item has its own set of call args. So a call argument at
+ * address 0x4 is different for work-item 0 and work-item 1.
+ *
+ * Ok, the table below shows an example of how we organize the call arguments in
+ * the CallArgMem class.
+ *
+ * int foo(int arg1, double arg2)
+ * ___________________________________________________
+ * | 0: return.0 | 4: return.1 | ... | 252: return.63 |
+ * |---------------------------------------------------|
+ * | 256: arg1.0 | 260: arg1.1 | ... | 508: arg1.63 |
+ * |---------------------------------------------------|
+ * | 512: arg2.0 | 520: arg2.1 | ... | 1016: arg2.63 |
+ * ___________________________________________________
+ */
+/**
+ * Per-wavefront backing store for HSAIL call arguments and return values.
+ * The buffer holds one CType-sized slot per lane for each argument region
+ * (see the layout table above).
+ */
+class CallArgMem
+{
+  public:
+    // pointer to buffer for storing function arguments
+    uint8_t *mem;
+    // size of function args
+    int funcArgsSizePerItem;
+
+    // byte offset of per-lane address `addr` for lane `lane`; regions are
+    // laid out back to back, VSZ slots per region
+    template<typename CType>
+    int
+    getLaneOffset(int lane, int addr)
+    {
+        return addr * VSZ + sizeof(CType) * lane;
+    }
+
+    CallArgMem(int func_args_size_per_item)
+      : funcArgsSizePerItem(func_args_size_per_item)
+    {
+        mem = (uint8_t*)malloc(funcArgsSizePerItem * VSZ);
+        // malloc may fail; catch it here rather than faulting on first use
+        assert(mem);
+    }
+
+    // NOTE(review): this class owns `mem` but copy construction/assignment
+    // are not disabled, so copying an instance would double-free — confirm
+    // instances are only ever held by pointer (see Wavefront::callArgMem)
+    ~CallArgMem()
+    {
+        free(mem);
+    }
+
+    // address of lane `lane`'s slot for per-lane address `addr`
+    template<typename CType>
+    uint8_t*
+    getLaneAddr(int lane, int addr)
+    {
+        return mem + getLaneOffset<CType>(lane, addr);
+    }
+
+    // store `val` into lane `lane`'s slot for per-lane address `addr`
+    template<typename CType>
+    void
+    setLaneAddr(int lane, int addr, CType val)
+    {
+        *((CType*)(mem + getLaneOffset<CType>(lane, addr))) = val;
+    }
+};
+
+/**
+ * One frame of the reconvergence stack: the state needed to implement
+ * control-flow divergence for a wavefront.
+ */
+class ReconvergenceStackEntry {
+
+  public:
+    ReconvergenceStackEntry(uint32_t entry_pc, uint32_t entry_rpc,
+                            VectorMask entry_mask)
+        : pc(entry_pc), rpc(entry_rpc), execMask(entry_mask)
+    {
+    }
+
+    /**
+     * PC of current instruction.
+     */
+    uint32_t pc;
+    /**
+     * PC of the immediate post-dominator instruction: the value @a pc
+     * takes for the first instruction the wavefront executes once the
+     * reconvergence point is reached.
+     */
+    uint32_t rpc;
+    /**
+     * Execution mask.
+     */
+    VectorMask execMask;
+};
+
+/**
+ * A wavefront: a group of work-items that execute in lock-step on one SIMD
+ * unit of a ComputeUnit. Tracks control-flow (reconvergence) state, barrier
+ * state, register usage, outstanding-memory-request bookkeeping, and
+ * per-wavefront statistics.
+ */
+class Wavefront : public SimObject
+{
+  public:
+    enum itype_e {I_ALU,I_GLOBAL,I_SHARED,I_FLAT,I_PRIVATE};
+    enum status_e {S_STOPPED,S_RETURNING,S_RUNNING};
+
+    // Base pointer for array of instruction pointers
+    uint64_t base_ptr;
+
+    uint32_t old_barrier_cnt;
+    uint32_t barrier_cnt;
+    uint32_t barrier_id;
+    uint32_t barrier_slots;
+    status_e status;
+    // HW slot id where the WF is mapped to inside a SIMD unit
+    int wfSlotId;
+    int kern_id;
+    // SIMD unit where the WV has been scheduled
+    int simdId;
+    // pointer to parent CU
+    ComputeUnit *computeUnit;
+
+    // in-order buffer of fetched-but-not-yet-executed instructions
+    std::deque<GPUDynInstPtr> instructionBuffer;
+
+    // fetch outstanding / discard the outstanding fetch when it returns
+    // (see discardFetch())
+    bool pendingFetch;
+    bool dropFetch;
+
+    // Condition Register State (for HSAIL simulations only)
+    class ConditionRegisterState *condRegState;
+    // number of single precision VGPRs required by WF
+    uint32_t maxSpVgprs;
+    // number of double precision VGPRs required by WF
+    uint32_t maxDpVgprs;
+    // map virtual to physical vector register
+    uint32_t remap(uint32_t vgprIndex, uint32_t size, uint8_t mode=0);
+    void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
+    bool isGmInstruction(GPUDynInstPtr ii);
+    bool isLmInstruction(GPUDynInstPtr ii);
+    bool isOldestInstGMem();
+    bool isOldestInstLMem();
+    bool isOldestInstPrivMem();
+    bool isOldestInstFlatMem();
+    bool isOldestInstALU();
+    bool isOldestInstBarrier();
+    // used for passing spill address to DDInstGPU
+    uint64_t last_addr[VSZ];
+    // per-lane work-item ids (3D and flattened)
+    uint32_t workitemid[3][VSZ];
+    uint32_t workitemFlatId[VSZ];
+    uint32_t workgroupid[3];
+    uint32_t workgroupsz[3];
+    uint32_t gridsz[3];
+    uint32_t wg_id;
+    uint32_t wg_sz;
+    uint32_t dynwaveid;
+    uint32_t maxdynwaveid;
+    uint32_t dispatchid;
+    // outstanding global+local memory requests
+    uint32_t outstanding_reqs;
+    // memory requests between scoreboard
+    // and execute stage not yet executed
+    uint32_t mem_reqs_in_pipe;
+    // outstanding global memory write requests
+    uint32_t outstanding_reqs_wr_gm;
+    // outstanding local memory write requests
+    uint32_t outstanding_reqs_wr_lm;
+    // outstanding global memory read requests
+    uint32_t outstanding_reqs_rd_gm;
+    // outstanding local memory read requests
+    uint32_t outstanding_reqs_rd_lm;
+    // read/write requests in flight between scoreboard and execute,
+    // split by local/global memory
+    uint32_t rd_lm_reqs_in_pipe;
+    uint32_t rd_gm_reqs_in_pipe;
+    uint32_t wr_lm_reqs_in_pipe;
+    uint32_t wr_gm_reqs_in_pipe;
+
+    int mem_trace_busy;
+    uint64_t last_trace;
+    // number of vector registers reserved by WF
+    int reservedVectorRegs;
+    // Index into the Vector Register File's namespace where the WF's registers
+    // will live while the WF is executed
+    uint32_t startVgprIndex;
+
+    // Old value of destination gpr (for trace)
+    uint32_t old_vgpr[VSZ];
+    // Id of destination gpr (for trace)
+    uint32_t old_vgpr_id;
+    // Tick count of last old_vgpr copy
+    uint64_t old_vgpr_tcnt;
+
+    // Old value of destination gpr (for trace)
+    uint64_t old_dgpr[VSZ];
+    // Id of destination gpr (for trace)
+    uint32_t old_dgpr_id;
+    // Tick count of last old_dgpr copy
+    uint64_t old_dgpr_tcnt;
+
+    // Execution mask at wavefront start
+    VectorMask init_mask;
+
+    // number of barriers this WF has joined
+    int bar_cnt[VSZ];
+    int max_bar_cnt;
+    // Flag to stall a wave on barrier
+    bool stalledAtBarrier;
+
+    // a pointer to the fraction of the LDS allocated
+    // to this workgroup (thus this wavefront)
+    LdsChunk *ldsChunk;
+
+    // A pointer to the spill area
+    Addr spillBase;
+    // The size of the spill area
+    uint32_t spillSizePerItem;
+    // The vector width of the spill area
+    uint32_t spillWidth;
+
+    // A pointer to the private memory area
+    Addr privBase;
+    // The size of the private memory area
+    uint32_t privSizePerItem;
+
+    // A pointer to the read-only memory area
+    Addr roBase;
+    // size of the read-only memory area
+    uint32_t roSize;
+
+    // pointer to buffer for storing kernel arguments
+    uint8_t *kernelArgs;
+    // unique WF id over all WFs executed across all CUs
+    uint64_t wfDynId;
+
+    // number of times instruction issue for this wavefront is blocked
+    // due to VRF port availability
+    Stats::Scalar numTimesBlockedDueVrfPortAvail;
+    // number of times an instruction of a WF is blocked from being issued
+    // due to WAR and WAW dependencies
+    Stats::Scalar numTimesBlockedDueWAXDependencies;
+    // number of times an instruction of a WF is blocked from being issued
+    // due to RAW dependencies
+    Stats::Scalar numTimesBlockedDueRAWDependencies;
+    // distribution of executed instructions based on their register
+    // operands; this is used to highlight the load on the VRF
+    Stats::Distribution srcRegOpDist;
+    Stats::Distribution dstRegOpDist;
+
+    // Functions to operate on call argument memory
+    // argument memory for hsail call instruction
+    CallArgMem *callArgMem;
+    void
+    initCallArgMem(int func_args_size_per_item)
+    {
+        callArgMem = new CallArgMem(func_args_size_per_item);
+    }
+
+    // read lane `lane`'s call-argument slot at per-lane address `addr`
+    template<typename CType>
+    CType
+    readCallArgMem(int lane, int addr)
+    {
+        return *((CType*)(callArgMem->getLaneAddr<CType>(lane, addr)));
+    }
+
+    // write lane `lane`'s call-argument slot at per-lane address `addr`
+    template<typename CType>
+    void
+    writeCallArgMem(int lane, int addr, CType val)
+    {
+        callArgMem->setLaneAddr<CType>(lane, addr, val);
+    }
+
+    typedef WavefrontParams Params;
+    Wavefront(const Params *p);
+    ~Wavefront();
+    virtual void init();
+
+    void
+    setParent(ComputeUnit *cu)
+    {
+        computeUnit = cu;
+    }
+
+    void start(uint64_t _wfDynId, uint64_t _base_ptr);
+
+    void exec();
+    void updateResources();
+    int ready(itype_e type);
+    bool instructionBufferHasBranch();
+    void regStats();
+    // lanes that are both active now and were active at wavefront start
+    VectorMask get_pred() { return execMask() & init_mask; }
+
+    bool waitingAtBarrier(int lane);
+
+    void pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
+                                  const VectorMask& exec_mask);
+
+    void popFromReconvergenceStack();
+
+    uint32_t pc() const;
+
+    uint32_t rpc() const;
+
+    VectorMask execMask() const;
+
+    bool execMask(int lane) const;
+
+    void pc(uint32_t new_pc);
+
+    void discardFetch();
+
+  private:
+    /**
+     * Stack containing Control Flow Graph nodes (i.e., kernel instructions)
+     * to be visited by the wavefront, and the associated execution masks. The
+     * reconvergence stack grows every time the wavefront reaches a divergence
+     * point (branch instruction), and shrinks every time the wavefront
+     * reaches a reconvergence point (immediate post-dominator instruction).
+     */
+    std::stack<std::unique_ptr<ReconvergenceStackEntry>> reconvergenceStack;
+};
+
+#endif // __WAVEFRONT_HH__
diff --git a/src/mem/protocol/GPU_RfO-SQC.sm b/src/mem/protocol/GPU_RfO-SQC.sm
new file mode 100644
index 000000000..1e5f8df74
--- /dev/null
+++ b/src/mem/protocol/GPU_RfO-SQC.sm
@@ -0,0 +1,667 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Lisa Hsu
+ */
+
+machine(MachineType:SQC, "GPU SQC (L1 I Cache)")
+ : Sequencer* sequencer;
+ CacheMemory * L1cache;
+ int TCC_select_num_bits;
+ Cycles issue_latency := 80; // time to send data down to TCC
+ Cycles l2_hit_latency := 18;
+
+ MessageBuffer * requestFromSQC, network="To", virtual_network="1", vnet_type="request";
+ MessageBuffer * responseFromSQC, network="To", virtual_network="3", vnet_type="response";
+ MessageBuffer * unblockFromCore, network="To", virtual_network="5", vnet_type="unblock";
+
+ MessageBuffer * probeToSQC, network="From", virtual_network="1", vnet_type="request";
+ MessageBuffer * responseToSQC, network="From", virtual_network="3", vnet_type="response";
+
+ MessageBuffer * mandatoryQueue;
+{
+  // Stable states are only I and S: the SQC is an instruction cache and
+  // never holds modified data. Transient states cover fetch and writeback.
+  state_declaration(State, desc="SQC Cache States", default="SQC_State_I") {
+    I, AccessPermission:Invalid, desc="Invalid";
+    S, AccessPermission:Read_Only, desc="Shared";
+
+    I_S, AccessPermission:Busy, desc="Invalid, issued RdBlkS, have not seen response yet";
+    S_I, AccessPermission:Read_Only, desc="L1 replacement, waiting for clean WB ack";
+    I_C, AccessPermission:Invalid, desc="Invalid, waiting for WBAck from TCCdir for canceled WB";
+  }
+
+  enumeration(Event, desc="SQC Events") {
+    // Core initiated
+    Fetch, desc="Fetch";
+
+    //TCC initiated
+    TCC_AckS, desc="TCC Ack to Core Request";
+    TCC_AckWB, desc="TCC Ack for WB";
+    TCC_NackWB, desc="TCC Nack for WB";
+
+    // Mem sys initiated
+    Repl, desc="Replacing block from cache";
+
+    // Probe Events
+    PrbInvData, desc="probe, return M data";
+    PrbInv, desc="probe, no need for data";
+    PrbShrData, desc="probe downgrade, return data";
+  }
+
+  // Resource/stat categories reported from transitions to recordRequestType;
+  // descriptions fixed to match the array each entry actually names.
+  enumeration(RequestType, desc="To communicate stats from transitions to recordStats") {
+    DataArrayRead,    desc="Read the data array";
+    DataArrayWrite,   desc="Write the data array";
+    TagArrayRead,     desc="Read the tag array";
+    TagArrayWrite,    desc="Write the tag array";
+  }
+
+
+  // Per-block cache line state stored in the L1 instruction cache.
+  structure(Entry, desc="...", interface="AbstractCacheEntry") {
+    State CacheState,          desc="cache state";
+    bool Dirty,                desc="Is the data dirty (diff than memory)?";
+    DataBlock DataBlk,         desc="data for the block";
+    bool FromL2, default="false", desc="block just moved from L2";
+  }
+
+  // Transient request state, allocated for in-flight writebacks.
+  structure(TBE, desc="...") {
+    State TBEState,            desc="Transient state";
+    DataBlock DataBlk,         desc="data for the block, required for concurrent writebacks";
+    bool Dirty,                desc="Is the data dirty (different than memory)?";
+    int NumPendingMsgs,        desc="Number of acks/data messages that this processor is waiting for";
+    bool Shared,               desc="Victim hit by shared probe";
+  }
+
+  // Externally implemented TBE table (see C++ TBETable template).
+  structure(TBETable, external="yes") {
+    TBE lookup(Addr);
+    void allocate(Addr);
+    void deallocate(Addr);
+    bool isPresent(Addr);
+  }
+
+  TBETable TBEs, template="<SQC_TBE>", constructor="m_number_of_TBEs";
+  // low bit of the address slice used to select the target TCC bank
+  int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()";
+
+  // prototypes implemented by the generated/maintained C++ machinery
+  Tick clockEdge();
+  Tick cyclesToTicks(Cycles c);
+
+  void set_cache_entry(AbstractCacheEntry b);
+  void unset_cache_entry();
+  void set_tbe(TBE b);
+  void unset_tbe();
+  void wakeUpAllBuffers();
+  void wakeUpBuffers(Addr a);
+  Cycles curCycle();
+
+  // Internal functions
+
+  // Look up the L1 entry for `address`; invalid pointer if not present.
+  Entry getCacheEntry(Addr address), return_by_pointer="yes" {
+    Entry cache_entry := static_cast(Entry, "pointer", L1cache.lookup(address));
+    return cache_entry;
+  }
+
+  // Prefer the TBE's copy of the block (in-flight writeback) over the
+  // cache entry's copy.
+  DataBlock getDataBlock(Addr addr), return_by_ref="yes" {
+    TBE tbe := TBEs.lookup(addr);
+    if(is_valid(tbe)) {
+      return tbe.DataBlk;
+    } else {
+      return getCacheEntry(addr).DataBlk;
+    }
+  }
+
+  // TBE state takes priority over the cache entry's state; blocks with
+  // neither are Invalid.
+  State getState(TBE tbe, Entry cache_entry, Addr addr) {
+    if(is_valid(tbe)) {
+      return tbe.TBEState;
+    } else if (is_valid(cache_entry)) {
+      return cache_entry.CacheState;
+    }
+    return State:I;
+  }
+
+  // Keep TBE and cache entry state in sync when either exists.
+  void setState(TBE tbe, Entry cache_entry, Addr addr, State state) {
+    if (is_valid(tbe)) {
+      tbe.TBEState := state;
+    }
+
+    if (is_valid(cache_entry)) {
+      cache_entry.CacheState := state;
+    }
+  }
+
+  // Permission used by functional accesses; mirrors getState's priority.
+  AccessPermission getAccessPermission(Addr addr) {
+    TBE tbe := TBEs.lookup(addr);
+    if(is_valid(tbe)) {
+      return SQC_State_to_permission(tbe.TBEState);
+    }
+
+    Entry cache_entry := getCacheEntry(addr);
+    if(is_valid(cache_entry)) {
+      return SQC_State_to_permission(cache_entry.CacheState);
+    }
+
+    return AccessPermission:NotPresent;
+  }
+
+  void setAccessPermission(Entry cache_entry, Addr addr, State state) {
+    if (is_valid(cache_entry)) {
+      cache_entry.changePermission(SQC_State_to_permission(state));
+    }
+  }
+
+  // Functional (debug) read: serve from the TBE if one exists, otherwise
+  // fall through to memory.
+  // NOTE(review): a valid cache entry without a TBE is not consulted here;
+  // confirm that serving such reads from memory is the intended behavior.
+  void functionalRead(Addr addr, Packet *pkt) {
+    TBE tbe := TBEs.lookup(addr);
+    if(is_valid(tbe)) {
+      testAndRead(addr, tbe.DataBlk, pkt);
+    } else {
+      functionalMemoryRead(pkt);
+    }
+  }
+
+  // Functional write: update the TBE copy if present, then always forward
+  // the write to memory; returns the number of locations written.
+  int functionalWrite(Addr addr, Packet *pkt) {
+    int num_functional_writes := 0;
+
+    TBE tbe := TBEs.lookup(addr);
+    if(is_valid(tbe)) {
+      num_functional_writes := num_functional_writes +
+            testAndWrite(addr, tbe.DataBlk, pkt);
+    }
+
+    num_functional_writes := num_functional_writes + functionalMemoryWrite(pkt);
+    return num_functional_writes;
+  }
+
+  // Forward per-transition stat categories to the cache's bookkeeping.
+  void recordRequestType(RequestType request_type, Addr addr) {
+    if (request_type == RequestType:DataArrayRead) {
+        L1cache.recordRequestType(CacheRequestType:DataArrayRead, addr);
+    } else if (request_type == RequestType:DataArrayWrite) {
+        L1cache.recordRequestType(CacheRequestType:DataArrayWrite, addr);
+    } else if (request_type == RequestType:TagArrayRead) {
+        L1cache.recordRequestType(CacheRequestType:TagArrayRead, addr);
+    } else if (request_type == RequestType:TagArrayWrite) {
+        L1cache.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+    }
+  }
+
+  // Map each request type to availability of the data or tag array.
+  bool checkResourceAvailable(RequestType request_type, Addr addr) {
+    if (request_type == RequestType:DataArrayRead) {
+      return L1cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+    } else if (request_type == RequestType:DataArrayWrite) {
+      return L1cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+    } else if (request_type == RequestType:TagArrayRead) {
+      return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+    } else if (request_type == RequestType:TagArrayWrite) {
+      return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+    } else {
+      error("Invalid RequestType type in checkResourceAvailable");
+      return true;
+    }
+  }
+
+  // Out Ports: requests and responses toward the TCC/TCCdir, plus the
+  // unblock channel shared with the other core-side controllers.
+
+  out_port(requestNetwork_out, CPURequestMsg, requestFromSQC);
+  out_port(responseNetwork_out, ResponseMsg, responseFromSQC);
+  out_port(unblockNetwork_out, UnblockMsg, unblockFromCore);
+
+  // In Ports
+
+  // Incoming probes from the TCC directory; PrbInv maps to PrbInv/PrbInvData
+  // depending on whether the probe asks for data, and PrbDowngrade always
+  // requires data to be returned.
+  in_port(probeNetwork_in, TDProbeRequestMsg, probeToSQC) {
+    if (probeNetwork_in.isReady(clockEdge())) {
+      peek(probeNetwork_in, TDProbeRequestMsg, block_on="addr") {
+        Entry cache_entry := getCacheEntry(in_msg.addr);
+        TBE tbe := TBEs.lookup(in_msg.addr);
+
+        if (in_msg.Type == ProbeRequestType:PrbInv) {
+          if (in_msg.ReturnData) {
+            trigger(Event:PrbInvData, in_msg.addr, cache_entry, tbe);
+          } else {
+            trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe);
+          }
+        } else if (in_msg.Type == ProbeRequestType:PrbDowngrade) {
+          assert(in_msg.ReturnData);
+          trigger(Event:PrbShrData, in_msg.addr, cache_entry, tbe);
+        }
+      }
+    }
+  }
+
+  // Responses from the TCC: fill data (must arrive Shared, since the SQC
+  // is read-only) or acks/nacks for writebacks.
+  in_port(responseToSQC_in, ResponseMsg, responseToSQC) {
+    if (responseToSQC_in.isReady(clockEdge())) {
+      peek(responseToSQC_in, ResponseMsg, block_on="addr") {
+
+        Entry cache_entry := getCacheEntry(in_msg.addr);
+        TBE tbe := TBEs.lookup(in_msg.addr);
+
+        if (in_msg.Type == CoherenceResponseType:TDSysResp) {
+          if (in_msg.State == CoherenceState:Shared) {
+            trigger(Event:TCC_AckS, in_msg.addr, cache_entry, tbe);
+          } else {
+            error("SQC should not receive TDSysResp other than CoherenceState:Shared");
+          }
+        } else if (in_msg.Type == CoherenceResponseType:TDSysWBAck) {
+          trigger(Event:TCC_AckWB, in_msg.addr, cache_entry, tbe);
+        } else if (in_msg.Type == CoherenceResponseType:TDSysWBNack) {
+          trigger(Event:TCC_NackWB, in_msg.addr, cache_entry, tbe);
+        } else {
+          error("Unexpected Response Message to Core");
+        }
+      }
+    }
+  }
+
+  // Core-side requests. The SQC only serves instruction fetches; when no
+  // way is available for the line, a victim is chosen and replaced first.
+  in_port(mandatoryQueue_in, RubyRequest, mandatoryQueue, desc="...") {
+    if (mandatoryQueue_in.isReady(clockEdge())) {
+      peek(mandatoryQueue_in, RubyRequest, block_on="LineAddress") {
+        Entry cache_entry := getCacheEntry(in_msg.LineAddress);
+        TBE tbe := TBEs.lookup(in_msg.LineAddress);
+
+        assert(in_msg.Type == RubyRequestType:IFETCH);
+        if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) {
+          trigger(Event:Fetch, in_msg.LineAddress, cache_entry, tbe);
+        } else {
+          Addr victim := L1cache.cacheProbe(in_msg.LineAddress);
+          trigger(Event:Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
+        }
+      }
+    }
+  }
+
+  // Actions
+
+  // Deallocate the block (if present) and clear the entry binding.
+  action(ic_invCache, "ic", desc="invalidate cache") {
+    if(is_valid(cache_entry)) {
+      L1cache.deallocate(address);
+    }
+    unset_cache_entry();
+  }
+
+  // Issue a shared-read request for the fetch miss to the owning TCCdir bank.
+  action(nS_issueRdBlkS, "nS", desc="Issue RdBlkS") {
+    enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceRequestType:RdBlkS;
+      out_msg.Requestor := machineID;
+      out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+                              TCC_select_low_bit, TCC_select_num_bits));
+      out_msg.MessageSize := MessageSizeType:Request_Control;
+      out_msg.InitialRequestTime := curCycle();
+    }
+  }
+
+  // Send a VicClean notification for a (clean) victim block to the owning
+  // TCCdir bank, reporting whether the victim was held Shared. The
+  // InitialRequestTime assignment previously appeared twice with the same
+  // value; the redundant second assignment has been removed.
+  action(vc_victim, "vc", desc="Victimize E/S Data") {
+    enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Requestor := machineID;
+      out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+                              TCC_select_low_bit, TCC_select_num_bits));
+      out_msg.MessageSize := MessageSizeType:Request_Control;
+      out_msg.Type := CoherenceRequestType:VicClean;
+      out_msg.InitialRequestTime := curCycle();
+      if (cache_entry.CacheState == State:S) {
+        out_msg.Shared := true;
+      } else {
+        out_msg.Shared := false;
+      }
+    }
+  }
+
+  // Allocate an L1 entry for the line if one is not already bound.
+  action(a_allocate, "a", desc="allocate block") {
+    if (is_invalid(cache_entry)) {
+      set_cache_entry(L1cache.allocate(address, new Entry));
+    }
+  }
+
+  // Allocate a TBE and snapshot the block for a concurrent writeback.
+  action(t_allocateTBE, "t", desc="allocate TBE Entry") {
+    check_allocate(TBEs);
+    assert(is_valid(cache_entry));
+    TBEs.allocate(address);
+    set_tbe(TBEs.lookup(address));
+    tbe.DataBlk := cache_entry.DataBlk; // Data only used for WBs
+    tbe.Dirty := cache_entry.Dirty;
+    tbe.Shared := false;
+  }
+
+  action(d_deallocateTBE, "d", desc="Deallocate TBE") {
+    TBEs.deallocate(address);
+    unset_tbe();
+  }
+
+  // Dequeue helpers for the three incoming queues.
+  action(p_popMandatoryQueue, "pm", desc="Pop Mandatory Queue") {
+    mandatoryQueue_in.dequeue(clockEdge());
+  }
+
+  action(pr_popResponseQueue, "pr", desc="Pop Response Queue") {
+    responseToSQC_in.dequeue(clockEdge());
+  }
+
+  action(pp_popProbeQueue, "pp", desc="pop probe queue") {
+    probeNetwork_in.dequeue(clockEdge());
+  }
+
+  // Complete a fetch that hit locally in the L1.
+  action(l_loadDone, "l", desc="local load done") {
+    assert(is_valid(cache_entry));
+    sequencer.readCallback(address, cache_entry.DataBlk,
+                           false, MachineType:L1Cache);
+    APPEND_TRANSITION_COMMENT(cache_entry.DataBlk);
+  }
+
+  // Complete a fetch that was filled by a remote response; propagates the
+  // response's timing fields for latency accounting.
+  action(xl_loadDone, "xl", desc="remote load done") {
+    peek(responseToSQC_in, ResponseMsg) {
+      assert(is_valid(cache_entry));
+      sequencer.readCallback(address,
+                             cache_entry.DataBlk,
+                             false,
+                             machineIDToMachineType(in_msg.Sender),
+                             in_msg.InitialRequestTime,
+                             in_msg.ForwardRequestTime,
+                             in_msg.ProbeRequestStartTime);
+      APPEND_TRANSITION_COMMENT(cache_entry.DataBlk);
+    }
+  }
+
+  // Fill the cache entry from the incoming response message.
+  action(w_writeCache, "w", desc="write data to cache") {
+    peek(responseToSQC_in, ResponseMsg) {
+      assert(is_valid(cache_entry));
+      cache_entry.DataBlk := in_msg.DataBlk;
+      cache_entry.Dirty := in_msg.Dirty;
+    }
+  }
+
+  // Tell the TCC the victim's data became stale; no data is written back.
+  action(ss_sendStaleNotification, "ss", desc="stale data; nothing to writeback") {
+    peek(responseToSQC_in, ResponseMsg) {
+      enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+        out_msg.addr := address;
+        out_msg.Type := CoherenceResponseType:StaleNotif;
+        out_msg.Sender := machineID;
+        out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
+                                TCC_select_low_bit, TCC_select_num_bits));
+        out_msg.MessageSize := MessageSizeType:Response_Control;
+        DPRINTF(RubySlicc, "%s\n", out_msg);
+      }
+    }
+  }
+
+ action(wb_data, "wb", desc="write back data") {
+ peek(responseToSQC_in, ResponseMsg) {
+ enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUData;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
+ TCC_select_low_bit, TCC_select_num_bits));
+ out_msg.DataBlk := tbe.DataBlk;
+ out_msg.Dirty := tbe.Dirty;
+ if (tbe.Shared) {
+ out_msg.NbReqShared := true;
+ } else {
+ out_msg.NbReqShared := false;
+ }
+ out_msg.State := CoherenceState:Shared; // faux info
+ out_msg.MessageSize := MessageSizeType:Writeback_Data;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+ }
+
+  // Probe ack without data: block not present (Ntsl = "not there since last").
+  action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") {
+    enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes
+      out_msg.Sender := machineID;
+      // will this always be ok? probably not for multisocket
+      out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+                              TCC_select_low_bit, TCC_select_num_bits));
+      out_msg.Dirty := false;
+      out_msg.Hit := false;
+      out_msg.Ntsl := true;
+      out_msg.State := CoherenceState:NA;
+      out_msg.MessageSize := MessageSizeType:Response_Control;
+    }
+  }
+
+  // NOTE(review): identical message to pi_sendProbeResponseInv (only the
+  // field-assignment order differs) — the two actions could be merged.
+  action(pim_sendProbeResponseInvMs, "pim", desc="send probe ack inv, no data") {
+    enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes
+      out_msg.Sender := machineID;
+      // will this always be ok? probably not for multisocket
+      out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+                              TCC_select_low_bit, TCC_select_num_bits));
+      out_msg.Dirty := false;
+      out_msg.Ntsl := true;
+      out_msg.Hit := false;
+      out_msg.State := CoherenceState:NA;
+      out_msg.MessageSize := MessageSizeType:Response_Control;
+    }
+  }
+
+  // Probe miss ack: neither Hit nor Ntsl set.
+  action(prm_sendProbeResponseMiss, "prm", desc="send probe ack PrbShrData, no data") {
+    enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes
+      out_msg.Sender := machineID;
+      // will this always be ok? probably not for multisocket
+      out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+                              TCC_select_low_bit, TCC_select_num_bits));
+      out_msg.Dirty := false; // only true if sending back data i think
+      out_msg.Hit := false;
+      out_msg.Ntsl := false;
+      out_msg.State := CoherenceState:NA;
+      out_msg.MessageSize := MessageSizeType:Response_Control;
+    }
+  }
+
+  // Probe hit with data.  The dirty bit comes from the TBE when a writeback
+  // is in flight, otherwise from the cache entry.
+  action(pd_sendProbeResponseData, "pd", desc="send probe ack, with data") {
+    enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+      assert(is_valid(cache_entry) || is_valid(tbe));
+      out_msg.addr := address;
+      out_msg.Type := CoherenceResponseType:CPUPrbResp;
+      out_msg.Sender := machineID;
+      // will this always be ok? probably not for multisocket
+      out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+                              TCC_select_low_bit, TCC_select_num_bits));
+      // NOTE(review): getDataBlock reads the cache entry, so a valid TBE
+      // alone does not make this action safe — confirm against transitions.
+      out_msg.DataBlk := getDataBlock(address);
+      if (is_valid(tbe)) {
+        out_msg.Dirty := tbe.Dirty;
+      } else {
+        out_msg.Dirty := cache_entry.Dirty;
+      }
+      out_msg.Hit := true;
+      out_msg.State := CoherenceState:NA;
+      out_msg.MessageSize := MessageSizeType:Response_Data;
+    }
+  }
+
+  // Probe hit with data (M/S variant).  The dirty bit comes from the TBE
+  // when a writeback is in flight, otherwise from the cache entry.
+  //
+  // Fix: the original asserted both is_valid(cache_entry)||is_valid(tbe) AND
+  // is_valid(cache_entry); the disjunction is dead code, fully subsumed by
+  // the stronger check.  The stronger assert must stay: getDataBlock()
+  // dereferences the cache entry unconditionally.
+  action(pdm_sendProbeResponseDataMs, "pdm", desc="send probe ack, with data") {
+    enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+      assert(is_valid(cache_entry));
+      out_msg.addr := address;
+      out_msg.Type := CoherenceResponseType:CPUPrbResp;
+      out_msg.Sender := machineID;
+      // will this always be ok? probably not for multisocket
+      out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+                              TCC_select_low_bit, TCC_select_num_bits));
+      out_msg.DataBlk := getDataBlock(address);
+      if (is_valid(tbe)) {
+        out_msg.Dirty := tbe.Dirty;
+      } else {
+        out_msg.Dirty := cache_entry.Dirty;
+      }
+      out_msg.Hit := true;
+      out_msg.State := CoherenceState:NA;
+      out_msg.MessageSize := MessageSizeType:Response_Data;
+    }
+  }
+
+  // A shared probe hit the in-flight victim: remember it so the eventual
+  // writeback sets NbReqShared.
+  action(sf_setSharedFlip, "sf", desc="hit by shared probe, status may be different") {
+    assert(is_valid(tbe));
+    tbe.Shared := true;
+  }
+
+  // Notify the TCC directory that this cache's state change is complete.
+  action(uu_sendUnblock, "uu", desc="state changed, unblock") {
+    enqueue(unblockNetwork_out, UnblockMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Sender := machineID;
+      out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+                              TCC_select_low_bit, TCC_select_num_bits));
+      out_msg.MessageSize := MessageSizeType:Unblock_Control;
+      DPRINTF(RubySlicc, "%s\n", out_msg);
+    }
+  }
+
+  // Recycle actions: re-queue the head message to retry it later.
+  action(yy_recycleProbeQueue, "yy", desc="recycle probe queue") {
+    probeNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+  }
+
+  action(zz_recycleMandatoryQueue, "\z", desc="recycle mandatory queue") {
+    mandatoryQueue_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+  }
+
+  // Transitions
+
+  // transitions from base
+  // Miss: allocate a line and issue a shared read to the TCC directory.
+  transition(I, Fetch, I_S) {TagArrayRead, TagArrayWrite} {
+    a_allocate;
+    nS_issueRdBlkS;
+    p_popMandatoryQueue;
+  }
+
+  // simple hit transitions
+  transition(S, Fetch) {TagArrayRead, DataArrayRead} {
+    l_loadDone;
+    p_popMandatoryQueue;
+  }
+
+  // recycles from transients
+  // Stall core requests while a fill or writeback is outstanding.
+  transition({I_S, S_I, I_C}, {Fetch, Repl}) {} {
+    zz_recycleMandatoryQueue;
+  }
+
+  // Eviction of a shared line: stash it in a TBE and victimize it.
+  transition(S, Repl, S_I) {TagArrayRead} {
+    t_allocateTBE;
+    vc_victim;
+    ic_invCache;
+  }
+
+  // TCC event
+  // Fill response: install data, complete the load, and unblock the directory.
+  transition(I_S, TCC_AckS, S) {DataArrayRead, DataArrayWrite} {
+    w_writeCache;
+    xl_loadDone;
+    uu_sendUnblock;
+    pr_popResponseQueue;
+  }
+
+  transition(S_I, TCC_NackWB, I){TagArrayWrite} {
+    d_deallocateTBE;
+    pr_popResponseQueue;
+  }
+
+  transition(S_I, TCC_AckWB, I) {TagArrayWrite} {
+    wb_data;
+    d_deallocateTBE;
+    pr_popResponseQueue;
+  }
+
+  // Writeback acknowledged after a probe invalidated the victim: the data
+  // is stale, so only a notification is sent.
+  transition(I_C, TCC_AckWB, I){TagArrayWrite} {
+    ss_sendStaleNotification;
+    d_deallocateTBE;
+    pr_popResponseQueue;
+  }
+
+  transition(I_C, TCC_NackWB, I) {TagArrayWrite} {
+    d_deallocateTBE;
+    pr_popResponseQueue;
+  }
+
+  // Probe transitions
+  transition({S, I}, PrbInvData, I) {TagArrayRead, TagArrayWrite} {
+    pd_sendProbeResponseData;
+    ic_invCache;
+    pp_popProbeQueue;
+  }
+
+  transition(I_C, PrbInvData, I_C) {
+    pi_sendProbeResponseInv;
+    ic_invCache;
+    pp_popProbeQueue;
+  }
+
+  transition({S, I}, PrbInv, I) {TagArrayRead, TagArrayWrite} {
+    pi_sendProbeResponseInv;
+    ic_invCache;
+    pp_popProbeQueue;
+  }
+
+  transition({S}, PrbShrData, S) {DataArrayRead} {
+    pd_sendProbeResponseData;
+    pp_popProbeQueue;
+  }
+
+  transition({I, I_C}, PrbShrData) {TagArrayRead} {
+    prm_sendProbeResponseMiss;
+    pp_popProbeQueue;
+  }
+
+  transition(I_C, PrbInv, I_C){
+    pi_sendProbeResponseInv;
+    ic_invCache;
+    pp_popProbeQueue;
+  }
+
+  // Probe races with an outstanding fill: ack the probe, then re-allocate
+  // so there is room for the fill data when it arrives.
+  transition(I_S, {PrbInv, PrbInvData}) {} {
+    pi_sendProbeResponseInv;
+    ic_invCache;
+    a_allocate; // but make sure there is room for incoming data when it arrives
+    pp_popProbeQueue;
+  }
+
+  transition(I_S, PrbShrData) {} {
+    prm_sendProbeResponseMiss;
+    pp_popProbeQueue;
+  }
+
+  // Probe invalidates the in-flight victim: the eventual WB ack will be
+  // handled in I_C (stale notification instead of data).
+  transition(S_I, PrbInvData, I_C) {TagArrayWrite} {
+    pi_sendProbeResponseInv;
+    ic_invCache;
+    pp_popProbeQueue;
+  }
+
+  transition(S_I, PrbInv, I_C) {TagArrayWrite} {
+    pi_sendProbeResponseInv;
+    ic_invCache;
+    pp_popProbeQueue;
+  }
+
+  // Shared probe on the in-flight victim: supply data and mark the TBE so
+  // the writeback reports NbReqShared.
+  transition(S_I, PrbShrData) {DataArrayRead} {
+    pd_sendProbeResponseData;
+    sf_setSharedFlip;
+    pp_popProbeQueue;
+  }
+}
diff --git a/src/mem/protocol/GPU_RfO-TCC.sm b/src/mem/protocol/GPU_RfO-TCC.sm
new file mode 100644
index 000000000..cfddb3f00
--- /dev/null
+++ b/src/mem/protocol/GPU_RfO-TCC.sm
@@ -0,0 +1,1199 @@
+/*
+ * Copyright (c) 2010-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Lisa Hsu
+ */
+
+// TCC: the GPU's shared L2 cache.  Talks to the co-located TCC directory
+// over zero-/low-latency WireBuffers and to the L1s/memory over the general
+// response network.
+machine(MachineType:TCC, "TCC Cache")
+ : CacheMemory * L2cache;
+   WireBuffer * w_reqToTCCDir;
+   WireBuffer * w_respToTCCDir;
+   WireBuffer * w_TCCUnblockToTCCDir;
+   WireBuffer * w_reqToTCC;
+   WireBuffer * w_probeToTCC;
+   WireBuffer * w_respToTCC;
+   int TCC_select_num_bits;
+   Cycles l2_request_latency := 1;
+   Cycles l2_response_latency := 20;
+
+  // To the general response network
+  MessageBuffer * responseFromTCC, network="To", virtual_network="3", vnet_type="response";
+
+  // From the general response network
+  MessageBuffer * responseToTCC, network="From", virtual_network="3", vnet_type="response";
+
+{
+  // EVENTS
+  enumeration(Event, desc="TCC Events") {
+    // Requests coming from the Cores
+    RdBlk, desc="CPU RdBlk event";
+    RdBlkM, desc="CPU RdBlkM event";
+    RdBlkS, desc="CPU RdBlkS event";
+    CtoD, desc="Change to Dirty request";
+    WrVicBlk, desc="L1 Victim (dirty)";
+    WrVicBlkShared, desc="L1 Victim (dirty)";
+    ClVicBlk, desc="L1 Victim (clean)";
+    ClVicBlkShared, desc="L1 Victim (clean)";
+
+    CPUData, desc="WB data from CPU";
+    CPUDataShared, desc="WB data from CPU, NBReqShared 1";
+    StaleWB, desc="Stale WB, No data";
+
+    L2_Repl, desc="L2 Replacement";
+
+    // Probes
+    PrbInvData, desc="Invalidating probe, return dirty data";
+    PrbInv, desc="Invalidating probe, no need to return data";
+    PrbShrData, desc="Downgrading probe, return data";
+
+    // Coming from Memory Controller
+    WBAck, desc="ack from memory";
+
+    CancelWB, desc="Cancel WB from L2";
+  }
+
+  // STATES
+  // MOESI base states plus transient X_Y states meaning "was X, Ack sent,
+  // waiting for victim data, will become Y".
+  state_declaration(State, desc="TCC State", default="TCC_State_I") {
+    M, AccessPermission:Read_Write, desc="Modified"; // No other cache has copy, memory stale
+    O, AccessPermission:Read_Only, desc="Owned"; // Correct most recent copy, others may exist in S
+    E, AccessPermission:Read_Write, desc="Exclusive"; // Correct, most recent, and only copy (and == Memory)
+    S, AccessPermission:Read_Only, desc="Shared"; // Correct, most recent. If no one in O, then == Memory
+    I, AccessPermission:Invalid, desc="Invalid";
+
+    I_M, AccessPermission:Busy, desc="Invalid, received WrVicBlk, sent Ack, waiting for Data";
+    I_O, AccessPermission:Busy, desc="Invalid, received WrVicBlk, sent Ack, waiting for Data";
+    I_E, AccessPermission:Busy, desc="Invalid, receive ClVicBlk, sent Ack, waiting for Data";
+    I_S, AccessPermission:Busy, desc="Invalid, receive ClVicBlk, sent Ack, waiting for Data";
+    S_M, AccessPermission:Busy, desc="received WrVicBlk, sent Ack, waiting for Data, then go to M";
+    S_O, AccessPermission:Busy, desc="received WrVicBlkShared, sent Ack, waiting for Data, then go to O";
+    S_E, AccessPermission:Busy, desc="Shared, received ClVicBlk, sent Ack, waiting for Data, then go to E";
+    S_S, AccessPermission:Busy, desc="Shared, received ClVicBlk, sent Ack, waiting for Data, then go to S";
+    E_M, AccessPermission:Busy, desc="received WrVicBlk, sent Ack, waiting for Data, then go to O";
+    E_O, AccessPermission:Busy, desc="received WrVicBlkShared, sent Ack, waiting for Data, then go to O";
+    E_E, AccessPermission:Busy, desc="received WrVicBlk, sent Ack, waiting for Data, then go to O";
+    E_S, AccessPermission:Busy, desc="Shared, received WrVicBlk, sent Ack, waiting for Data";
+    O_M, AccessPermission:Busy, desc="...";
+    O_O, AccessPermission:Busy, desc="...";
+    O_E, AccessPermission:Busy, desc="...";
+    M_M, AccessPermission:Busy, desc="...";
+    M_O, AccessPermission:Busy, desc="...";
+    M_E, AccessPermission:Busy, desc="...";
+    M_S, AccessPermission:Busy, desc="...";
+    D_I, AccessPermission:Invalid, desc="drop WB data on the floor when receive";
+    MOD_I, AccessPermission:Busy, desc="drop WB data on the floor, waiting for WBAck from Mem";
+    MO_I, AccessPermission:Busy, desc="M or O, received L2_Repl, waiting for WBAck from Mem";
+    ES_I, AccessPermission:Busy, desc="E or S, received L2_Repl, waiting for WBAck from Mem";
+    I_C, AccessPermission:Invalid, desc="sent cancel, just waiting to receive mem wb ack so nothing gets confused";
+  }
+
+  // Resource-usage tags attached to transitions for stats/stall modeling.
+  // Fix: TagArrayRead/TagArrayWrite descs said "data array" (copy-paste).
+  enumeration(RequestType, desc="To communicate stats from transitions to recordStats") {
+    DataArrayRead, desc="Read the data array";
+    DataArrayWrite, desc="Write the data array";
+    TagArrayRead, desc="Read the tag array";
+    TagArrayWrite, desc="Write the tag array";
+  }
+
+
+ // STRUCTURES
+
+  // Per-line cache entry: coherence state plus data/dirty bits.
+  structure(Entry, desc="...", interface="AbstractCacheEntry") {
+    State CacheState, desc="cache state";
+    bool Dirty, desc="Is the data dirty (diff from memory?)";
+    DataBlock DataBlk, desc="Data for the block";
+  }
+
+  // Transaction buffer entry for in-flight victimizations/writebacks.
+  structure(TBE, desc="...") {
+    State TBEState, desc="Transient state";
+    DataBlock DataBlk, desc="data for the block";
+    bool Dirty, desc="Is the data dirty?";
+    bool Shared, desc="Victim hit by shared probe";
+    MachineID From, desc="Waiting for writeback from...";
+  }
+
+  structure(TBETable, external="yes") {
+    TBE lookup(Addr);
+    void allocate(Addr);
+    void deallocate(Addr);
+    bool isPresent(Addr);
+  }
+
+  TBETable TBEs, template="<TCC_TBE>", constructor="m_number_of_TBEs";
+  // Interleave TCC banks on cache-block-granularity address bits.
+  int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()";
+
+  // Provided by the generated state-machine infrastructure.
+  void set_cache_entry(AbstractCacheEntry b);
+  void unset_cache_entry();
+  void set_tbe(TBE b);
+  void unset_tbe();
+  void wakeUpAllBuffers();
+  void wakeUpBuffers(Addr a);
+
+
+  // FUNCTION DEFINITIONS
+  Tick clockEdge();
+  Tick cyclesToTicks(Cycles c);
+
+  Entry getCacheEntry(Addr addr), return_by_pointer="yes" {
+    return static_cast(Entry, "pointer", L2cache.lookup(addr));
+  }
+
+  DataBlock getDataBlock(Addr addr), return_by_ref="yes" {
+    return getCacheEntry(addr).DataBlk;
+  }
+
+  // True if the block is cached or a way is free (no replacement needed).
+  bool presentOrAvail(Addr addr) {
+    return L2cache.isTagPresent(addr) || L2cache.cacheAvail(addr);
+  }
+
+  // TBE state (transient) takes priority over the cached state.
+  State getState(TBE tbe, Entry cache_entry, Addr addr) {
+    if (is_valid(tbe)) {
+      return tbe.TBEState;
+    } else if (is_valid(cache_entry)) {
+      return cache_entry.CacheState;
+    }
+    return State:I;
+  }
+
+  // Keep TBE and cache-entry state in sync when both exist.
+  void setState(TBE tbe, Entry cache_entry, Addr addr, State state) {
+    if (is_valid(tbe)) {
+      tbe.TBEState := state;
+    }
+
+    if (is_valid(cache_entry)) {
+      cache_entry.CacheState := state;
+    }
+  }
+
+  AccessPermission getAccessPermission(Addr addr) {
+    TBE tbe := TBEs.lookup(addr);
+    if(is_valid(tbe)) {
+      return TCC_State_to_permission(tbe.TBEState);
+    }
+
+    Entry cache_entry := getCacheEntry(addr);
+    if(is_valid(cache_entry)) {
+      return TCC_State_to_permission(cache_entry.CacheState);
+    }
+
+    return AccessPermission:NotPresent;
+  }
+
+  void setAccessPermission(Entry cache_entry, Addr addr, State state) {
+    if (is_valid(cache_entry)) {
+      cache_entry.changePermission(TCC_State_to_permission(state));
+    }
+  }
+
+  // Functional (debug/syscall) access path: prefer in-flight TBE data,
+  // otherwise fall through to memory.
+  void functionalRead(Addr addr, Packet *pkt) {
+    TBE tbe := TBEs.lookup(addr);
+    if(is_valid(tbe)) {
+      testAndRead(addr, tbe.DataBlk, pkt);
+    } else {
+      functionalMemoryRead(pkt);
+    }
+  }
+
+  int functionalWrite(Addr addr, Packet *pkt) {
+    int num_functional_writes := 0;
+
+    TBE tbe := TBEs.lookup(addr);
+    if(is_valid(tbe)) {
+      num_functional_writes := num_functional_writes +
+            testAndWrite(addr, tbe.DataBlk, pkt);
+    }
+
+    num_functional_writes := num_functional_writes + functionalMemoryWrite(pkt);
+    return num_functional_writes;
+  }
+
+  // Map protocol-level RequestTypes onto cache array accesses for stats.
+  void recordRequestType(RequestType request_type, Addr addr) {
+    if (request_type == RequestType:DataArrayRead) {
+      L2cache.recordRequestType(CacheRequestType:DataArrayRead, addr);
+    } else if (request_type == RequestType:DataArrayWrite) {
+      L2cache.recordRequestType(CacheRequestType:DataArrayWrite, addr);
+    } else if (request_type == RequestType:TagArrayRead) {
+      L2cache.recordRequestType(CacheRequestType:TagArrayRead, addr);
+    } else if (request_type == RequestType:TagArrayWrite) {
+      L2cache.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+    }
+  }
+
+  // Model array-port contention: a transition stalls if its tagged
+  // resource is busy this cycle.
+  bool checkResourceAvailable(RequestType request_type, Addr addr) {
+    if (request_type == RequestType:DataArrayRead) {
+      return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+    } else if (request_type == RequestType:DataArrayWrite) {
+      return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+    } else if (request_type == RequestType:TagArrayRead) {
+      return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+    } else if (request_type == RequestType:TagArrayWrite) {
+      return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+    } else {
+      error("Invalid RequestType type in checkResourceAvailable");
+      return true;
+    }
+  }
+
+
+
+  // OUT PORTS
+  out_port(w_requestNetwork_out, CPURequestMsg, w_reqToTCCDir);
+  out_port(w_TCCResp_out, ResponseMsg, w_respToTCCDir);
+  out_port(responseNetwork_out, ResponseMsg, responseFromTCC);
+  out_port(w_unblockNetwork_out, UnblockMsg, w_TCCUnblockToTCCDir);
+
+  // IN PORTS
+  // Responses from the TCC directory (only memory writeback acks expected).
+  in_port(TDResponse_in, ResponseMsg, w_respToTCC) {
+    if (TDResponse_in.isReady(clockEdge())) {
+      peek(TDResponse_in, ResponseMsg) {
+        Entry cache_entry := getCacheEntry(in_msg.addr);
+        TBE tbe := TBEs.lookup(in_msg.addr);
+        if (in_msg.Type == CoherenceResponseType:TDSysWBAck) {
+          trigger(Event:WBAck, in_msg.addr, cache_entry, tbe);
+        }
+        else {
+          DPRINTF(RubySlicc, "%s\n", in_msg);
+          error("Error on TDResponse Type");
+        }
+      }
+    }
+  }
+
+  // Response Network: L1 writeback data (NbReqShared distinguishes shared
+  // victims) and stale-writeback notifications.
+  // Fix: the unexpected-type error message was copy-pasted from the
+  // TDResponse port above; this port reads responseToTCC, not w_respToTCC.
+  in_port(responseNetwork_in, ResponseMsg, responseToTCC) {
+    if (responseNetwork_in.isReady(clockEdge())) {
+      peek(responseNetwork_in, ResponseMsg) {
+        Entry cache_entry := getCacheEntry(in_msg.addr);
+        TBE tbe := TBEs.lookup(in_msg.addr);
+        if (in_msg.Type == CoherenceResponseType:CPUData) {
+          if (in_msg.NbReqShared) {
+            trigger(Event:CPUDataShared, in_msg.addr, cache_entry, tbe);
+          } else {
+            trigger(Event:CPUData, in_msg.addr, cache_entry, tbe);
+          }
+        } else if (in_msg.Type == CoherenceResponseType:StaleNotif) {
+          trigger(Event:StaleWB, in_msg.addr, cache_entry, tbe);
+        } else {
+          DPRINTF(RubySlicc, "%s\n", in_msg);
+          error("Error on Response Type");
+        }
+      }
+    }
+  }
+
+  // probe network
+  // Probes forwarded by the TCC directory; ReturnData selects the
+  // data-returning event variant.
+  in_port(probeNetwork_in, TDProbeRequestMsg, w_probeToTCC) {
+    if (probeNetwork_in.isReady(clockEdge())) {
+      peek(probeNetwork_in, TDProbeRequestMsg) {
+        Entry cache_entry := getCacheEntry(in_msg.addr);
+        TBE tbe := TBEs.lookup(in_msg.addr);
+        if (in_msg.Type == ProbeRequestType:PrbInv) {
+          if (in_msg.ReturnData) {
+            trigger(Event:PrbInvData, in_msg.addr, cache_entry, tbe);
+          } else {
+            trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe);
+          }
+        } else if (in_msg.Type == ProbeRequestType:PrbDowngrade) {
+          if (in_msg.ReturnData) {
+            trigger(Event:PrbShrData, in_msg.addr, cache_entry, tbe);
+          } else {
+            error("Don't think I should get any of these");
+          }
+        }
+      }
+    }
+  }
+
+  // Request Network
+  // Core reads and L1 victims.  Victims that need a replacement first
+  // trigger L2_Repl on the evicted address; unknown types are recycled.
+  in_port(requestNetwork_in, CPURequestMsg, w_reqToTCC) {
+    if (requestNetwork_in.isReady(clockEdge())) {
+      peek(requestNetwork_in, CPURequestMsg) {
+        assert(in_msg.Destination.isElement(machineID));
+        Entry cache_entry := getCacheEntry(in_msg.addr);
+        TBE tbe := TBEs.lookup(in_msg.addr);
+        if (in_msg.Type == CoherenceRequestType:RdBlk) {
+          trigger(Event:RdBlk, in_msg.addr, cache_entry, tbe);
+        } else if (in_msg.Type == CoherenceRequestType:RdBlkS) {
+          trigger(Event:RdBlkS, in_msg.addr, cache_entry, tbe);
+        } else if (in_msg.Type == CoherenceRequestType:RdBlkM) {
+          trigger(Event:RdBlkM, in_msg.addr, cache_entry, tbe);
+        } else if (in_msg.Type == CoherenceRequestType:VicClean) {
+          if (presentOrAvail(in_msg.addr)) {
+            if (in_msg.Shared) {
+              trigger(Event:ClVicBlkShared, in_msg.addr, cache_entry, tbe);
+            } else {
+              trigger(Event:ClVicBlk, in_msg.addr, cache_entry, tbe);
+            }
+          } else {
+            Addr victim := L2cache.cacheProbe(in_msg.addr);
+            trigger(Event:L2_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
+          }
+        } else if (in_msg.Type == CoherenceRequestType:VicDirty) {
+          if (presentOrAvail(in_msg.addr)) {
+            if (in_msg.Shared) {
+              trigger(Event:WrVicBlkShared, in_msg.addr, cache_entry, tbe);
+            } else {
+              trigger(Event:WrVicBlk, in_msg.addr, cache_entry, tbe);
+            }
+          } else {
+            Addr victim := L2cache.cacheProbe(in_msg.addr);
+            trigger(Event:L2_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
+          }
+        } else {
+          requestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+        }
+      }
+    }
+  }
+
+ // BEGIN ACTIONS
+
+  // Deallocate the L2 entry (if any) and clear the entry handle.
+  action(i_invL2, "i", desc="invalidate TCC cache block") {
+    if (is_valid(cache_entry)) {
+      L2cache.deallocate(address);
+    }
+    unset_cache_entry();
+  }
+
+  // Hit response granting Modified ownership to the requestor.
+  action(rm_sendResponseM, "rm", desc="send Modified response") {
+    peek(requestNetwork_in, CPURequestMsg) {
+      enqueue(responseNetwork_out, ResponseMsg, l2_response_latency) {
+        out_msg.addr := address;
+        out_msg.Type := CoherenceResponseType:TDSysResp;
+        out_msg.Sender := machineID;
+        out_msg.Destination.add(in_msg.Requestor);
+        out_msg.DataBlk := cache_entry.DataBlk;
+        out_msg.MessageSize := MessageSizeType:Response_Data;
+        out_msg.Dirty := cache_entry.Dirty;
+        out_msg.State := CoherenceState:Modified;
+        DPRINTF(RubySlicc, "%s\n", out_msg);
+      }
+    }
+  }
+
+  // Hit response granting a Shared copy to the requestor.
+  action(rs_sendResponseS, "rs", desc="send Shared response") {
+    peek(requestNetwork_in, CPURequestMsg) {
+      enqueue(responseNetwork_out, ResponseMsg, l2_response_latency) {
+        out_msg.addr := address;
+        out_msg.Type := CoherenceResponseType:TDSysResp;
+        out_msg.Sender := machineID;
+        out_msg.Destination.add(in_msg.Requestor);
+        out_msg.DataBlk := cache_entry.DataBlk;
+        out_msg.MessageSize := MessageSizeType:Response_Data;
+        out_msg.Dirty := cache_entry.Dirty;
+        out_msg.State := CoherenceState:Shared;
+        DPRINTF(RubySlicc, "%s\n", out_msg);
+      }
+    }
+  }
+
+
+  // L2 miss: forward the core's request unchanged to the TCC directory.
+  action(r_requestToTD, "r", desc="Miss in L2, pass on") {
+    peek(requestNetwork_in, CPURequestMsg) {
+      enqueue(w_requestNetwork_out, CPURequestMsg, l2_request_latency) {
+        out_msg.addr := address;
+        out_msg.Type := in_msg.Type;
+        out_msg.Requestor := in_msg.Requestor;
+        out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+                                TCC_select_low_bit, TCC_select_num_bits));
+        out_msg.Shared := false; // unneeded for this request
+        out_msg.MessageSize := in_msg.MessageSize;
+        DPRINTF(RubySlicc, "%s\n", out_msg);
+      }
+    }
+  }
+
+  // Allocate a TBE; snapshot cache data only when the line is present.
+  // From defaults to this machine's own ID (meaning "no WB expected yet").
+  action(t_allocateTBE, "t", desc="allocate TBE Entry") {
+    TBEs.allocate(address);
+    set_tbe(TBEs.lookup(address));
+    if (is_valid(cache_entry)) {
+      tbe.DataBlk := cache_entry.DataBlk; // Data only for WBs
+      tbe.Dirty := cache_entry.Dirty;
+    }
+    tbe.From := machineID;
+  }
+
+  action(dt_deallocateTBE, "dt", desc="deallocate TBE Entry") {
+    TBEs.deallocate(address);
+    unset_tbe();
+  }
+
+  // Victimize a clean L2 line to the TCC directory (data included).
+  action(vc_vicClean, "vc", desc="Victimize Clean L2 data") {
+    enqueue(w_requestNetwork_out, CPURequestMsg, l2_request_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceRequestType:VicClean;
+      out_msg.Requestor := machineID;
+      out_msg.DataBlk := cache_entry.DataBlk;
+      out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+                              TCC_select_low_bit, TCC_select_num_bits));
+      out_msg.MessageSize := MessageSizeType:Response_Data;
+    }
+  }
+
+  // Victimize a dirty L2 line to the TCC directory (data included).
+  action(vd_vicDirty, "vd", desc="Victimize dirty L2 data") {
+    enqueue(w_requestNetwork_out, CPURequestMsg, l2_request_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceRequestType:VicDirty;
+      out_msg.Requestor := machineID;
+      out_msg.DataBlk := cache_entry.DataBlk;
+      out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+                              TCC_select_low_bit, TCC_select_num_bits));
+      out_msg.MessageSize := MessageSizeType:Response_Data;
+    }
+  }
+
+  // Ack an L1 victim so the L1 can send its writeback data.
+  action(w_sendResponseWBAck, "w", desc="send WB Ack") {
+    peek(requestNetwork_in, CPURequestMsg) {
+      enqueue(responseNetwork_out, ResponseMsg, l2_response_latency) {
+        out_msg.addr := address;
+        out_msg.Type := CoherenceResponseType:TDSysWBAck;
+        out_msg.Destination.add(in_msg.Requestor);
+        out_msg.Sender := machineID;
+        out_msg.MessageSize := MessageSizeType:Writeback_Control;
+      }
+    }
+  }
+
+  // Probe ack, no data, block not present (Ntsl set).
+  action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") {
+    enqueue(w_TCCResp_out, ResponseMsg, l2_request_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceResponseType:CPUPrbResp; // TCC and CPUs respond in same way to probes
+      out_msg.Sender := machineID;
+      // will this always be ok? probably not for multisocket
+      out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+                              TCC_select_low_bit, TCC_select_num_bits));
+      out_msg.Dirty := false;
+      out_msg.Hit := false;
+      out_msg.Ntsl := true;
+      out_msg.State := CoherenceState:NA;
+      out_msg.MessageSize := MessageSizeType:Response_Control;
+    }
+  }
+
+  // Probe ack, no data, but the block was present (Hit set).
+  action(ph_sendProbeResponseHit, "ph", desc="send probe ack, no data") {
+    enqueue(w_TCCResp_out, ResponseMsg, l2_request_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceResponseType:CPUPrbResp; // TCC and CPUs respond in same way to probes
+      out_msg.Sender := machineID;
+      // will this always be ok? probably not for multisocket
+      out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+                              TCC_select_low_bit, TCC_select_num_bits));
+      out_msg.Dirty := false;
+      out_msg.Hit := true;
+      out_msg.Ntsl := false;
+      out_msg.State := CoherenceState:NA;
+      out_msg.MessageSize := MessageSizeType:Response_Control;
+    }
+  }
+
+  // Probe ack, no data, clean miss (neither Hit nor Ntsl).
+  action(pm_sendProbeResponseMiss, "pm", desc="send probe ack, no data") {
+    enqueue(w_TCCResp_out, ResponseMsg, l2_request_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceResponseType:CPUPrbResp; // TCC and CPUs respond in same way to probes
+      out_msg.Sender := machineID;
+      // will this always be ok? probably not for multisocket
+      out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+                              TCC_select_low_bit, TCC_select_num_bits));
+      out_msg.Dirty := false;
+      out_msg.Hit := false;
+      out_msg.Ntsl := false;
+      out_msg.State := CoherenceState:NA;
+      out_msg.MessageSize := MessageSizeType:Response_Control;
+    }
+  }
+
+  // Probe ack with data taken from the cache entry.
+  action(pd_sendProbeResponseData, "pd", desc="send probe ack, with data") {
+    enqueue(w_TCCResp_out, ResponseMsg, l2_request_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceResponseType:CPUPrbResp; // TCC and CPUs respond in same way to probes
+      out_msg.Sender := machineID;
+      // will this always be ok? probably not for multisocket
+      out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+                              TCC_select_low_bit, TCC_select_num_bits));
+      out_msg.DataBlk := cache_entry.DataBlk;
+      //assert(cache_entry.Dirty); Not needed in TCC where TCC can supply clean data
+      out_msg.Dirty := cache_entry.Dirty;
+      out_msg.Hit := true;
+      out_msg.State := CoherenceState:NA;
+      out_msg.MessageSize := MessageSizeType:Response_Data;
+    }
+  }
+
+  // Probe ack with data taken from the TBE (line already victimized).
+  action(pdt_sendProbeResponseDataFromTBE, "pdt", desc="send probe ack with data") {
+    enqueue(w_TCCResp_out, ResponseMsg, l2_request_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceResponseType:CPUPrbResp;
+      out_msg.Sender := machineID;
+      out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+                              TCC_select_low_bit, TCC_select_num_bits));
+      out_msg.DataBlk := tbe.DataBlk;
+      //assert(tbe.Dirty);
+      out_msg.Dirty := tbe.Dirty;
+      out_msg.Hit := true;
+      out_msg.MessageSize := MessageSizeType:Response_Data;
+      out_msg.State := CoherenceState:NA;
+      DPRINTF(RubySlicc, "%s\n", out_msg);
+    }
+  }
+
+  // Cancel an in-flight writeback at the directory (victim was invalidated).
+  action(mc_cancelMemWriteback, "mc", desc="send writeback cancel to memory") {
+    enqueue(w_requestNetwork_out, CPURequestMsg, l2_request_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceRequestType:WrCancel;
+      out_msg.Requestor := machineID;
+      out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+                              TCC_select_low_bit, TCC_select_num_bits));
+      out_msg.MessageSize := MessageSizeType:Request_Control;
+    }
+  }
+
+  action(a_allocateBlock, "a", desc="allocate TCC block") {
+    if (is_invalid(cache_entry)) {
+      set_cache_entry(L2cache.allocate(address, new Entry));
+    }
+  }
+
+  // Install L1 writeback data; Dirty is only ever set, never cleared here,
+  // so an already-dirty line cannot be silently marked clean.
+  action(d_writeData, "d", desc="write data to TCC") {
+    peek(responseNetwork_in, ResponseMsg) {
+      if (in_msg.Dirty) {
+        cache_entry.Dirty := in_msg.Dirty;
+      }
+      cache_entry.DataBlk := in_msg.DataBlk;
+      DPRINTF(RubySlicc, "Writing to TCC: %s\n", in_msg);
+    }
+  }
+
+  // Copy victim data carried on the request itself; mark line dirty.
+  action(rd_copyDataFromRequest, "rd", desc="write data to TCC") {
+    peek(requestNetwork_in, CPURequestMsg) {
+      cache_entry.DataBlk := in_msg.DataBlk;
+      cache_entry.Dirty := true;
+    }
+  }
+
+  // Record which L1 we expect the writeback data from.
+  action(f_setFrom, "f", desc="set who WB is expected to come from") {
+    peek(requestNetwork_in, CPURequestMsg) {
+      tbe.From := in_msg.Requestor;
+    }
+  }
+
+  // Reset From to our own ID, the sentinel for "no WB outstanding".
+  action(rf_resetFrom, "rf", desc="reset From") {
+    tbe.From := machineID;
+  }
+
+  // Write the TBE's victim data back to the TCC directory; NbReqShared
+  // reports whether a shared probe hit the victim while in flight.
+  action(wb_data, "wb", desc="write back data") {
+    enqueue(w_TCCResp_out, ResponseMsg, l2_request_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceResponseType:CPUData;
+      out_msg.Sender := machineID;
+      out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+                              TCC_select_low_bit, TCC_select_num_bits));
+      out_msg.DataBlk := tbe.DataBlk;
+      out_msg.Dirty := tbe.Dirty;
+      if (tbe.Shared) {
+        out_msg.NbReqShared := true;
+      } else {
+        out_msg.NbReqShared := false;
+      }
+      out_msg.State := CoherenceState:Shared; // faux info
+      out_msg.MessageSize := MessageSizeType:Writeback_Data;
+      DPRINTF(RubySlicc, "%s\n", out_msg);
+    }
+  }
+
+  // Stash incoming L1 writeback data in the TBE instead of the cache.
+  action(wt_writeDataToTBE, "wt", desc="write WB data to TBE") {
+    peek(responseNetwork_in, ResponseMsg) {
+      tbe.DataBlk := in_msg.DataBlk;
+      tbe.Dirty := in_msg.Dirty;
+    }
+  }
+
+  // Unblock variants tell the TCC directory how this cache ended up:
+  // owner (E/M/O), sharer (S), or not valid (I).
+  action(uo_sendUnblockOwner, "uo", desc="state changed to E, M, or O, unblock") {
+    enqueue(w_unblockNetwork_out, UnblockMsg, l2_request_latency) {
+      out_msg.addr := address;
+      out_msg.Sender := machineID;
+      out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+                              TCC_select_low_bit, TCC_select_num_bits));
+      out_msg.MessageSize := MessageSizeType:Unblock_Control;
+      out_msg.currentOwner := true;
+      out_msg.valid := true;
+      DPRINTF(RubySlicc, "%s\n", out_msg);
+    }
+  }
+
+  action(us_sendUnblockSharer, "us", desc="state changed to S , unblock") {
+    enqueue(w_unblockNetwork_out, UnblockMsg, l2_request_latency) {
+      out_msg.addr := address;
+      out_msg.Sender := machineID;
+      out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+                              TCC_select_low_bit, TCC_select_num_bits));
+      out_msg.MessageSize := MessageSizeType:Unblock_Control;
+      out_msg.currentOwner := false;
+      out_msg.valid := true;
+      DPRINTF(RubySlicc, "%s\n", out_msg);
+    }
+  }
+
+  action(un_sendUnblockNotValid, "un", desc="state changed toI, unblock") {
+    enqueue(w_unblockNetwork_out, UnblockMsg, l2_request_latency) {
+      out_msg.addr := address;
+      out_msg.Sender := machineID;
+      out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+                              TCC_select_low_bit, TCC_select_num_bits));
+      out_msg.MessageSize := MessageSizeType:Unblock_Control;
+      out_msg.currentOwner := false;
+      out_msg.valid := false;
+      DPRINTF(RubySlicc, "%s\n", out_msg);
+    }
+  }
+
+ action(ut_updateTag, "ut", desc="update Tag (i.e. set MRU)") {
+ L2cache.setMRU(address);
+ }
+
+ action(p_popRequestQueue, "p", desc="pop request queue") {
+ requestNetwork_in.dequeue(clockEdge());
+ }
+
+ action(pr_popResponseQueue, "pr", desc="pop response queue") {
+ responseNetwork_in.dequeue(clockEdge());
+ }
+
+ action(pn_popTDResponseQueue, "pn", desc="pop TD response queue") {
+ TDResponse_in.dequeue(clockEdge());
+ }
+
+ action(pp_popProbeQueue, "pp", desc="pop probe queue") {
+ probeNetwork_in.dequeue(clockEdge());
+ }
+
+ // Recycle (rather than dequeue) so the stalled request retries after
+ // recycle_latency without blocking progress on other addresses.
+ action(zz_recycleRequestQueue, "\z", desc="recycle request queue") {
+ requestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+ }
+
+
+ // END ACTIONS
+
+ // BEGIN TRANSITIONS
+
+ // transitions from base
+
+ transition({I, I_C}, {RdBlk, RdBlkS, RdBlkM, CtoD}){TagArrayRead} {
+ // TCCdir already knows that the block is not here. This is to allocate and get the block.
+ r_requestToTD;
+ p_popRequestQueue;
+ }
+
+// check
+ transition({M, O}, RdBlk, O){TagArrayRead, TagArrayWrite} {
+ rs_sendResponseS;
+ ut_updateTag;
+ // detect 2nd chancing
+ p_popRequestQueue;
+ }
+
+//check
+ transition({E, S}, RdBlk, S){TagArrayRead, TagArrayWrite} {
+ rs_sendResponseS;
+ ut_updateTag;
+ // detect 2nd chancing
+ p_popRequestQueue;
+ }
+
+// check
+ transition({M, O}, RdBlkS, O){TagArrayRead, TagArrayWrite} {
+ rs_sendResponseS;
+ ut_updateTag;
+ // detect 2nd chance sharing
+ p_popRequestQueue;
+ }
+
+//check
+ transition({E, S}, RdBlkS, S){TagArrayRead, TagArrayWrite} {
+ rs_sendResponseS;
+ ut_updateTag;
+ // detect 2nd chance sharing
+ p_popRequestQueue;
+ }
+
+// check
+ transition(M, RdBlkM, I){TagArrayRead, TagArrayWrite} {
+ rm_sendResponseM;
+ i_invL2;
+ p_popRequestQueue;
+ }
+
+ //check
+ transition(E, RdBlkM, I){TagArrayRead, TagArrayWrite} {
+ rm_sendResponseM;
+ i_invL2;
+ p_popRequestQueue;
+ }
+
+// check
+ transition({I}, WrVicBlk, I_M){TagArrayRead} {
+ a_allocateBlock;
+ t_allocateTBE;
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+ transition(I_C, {WrVicBlk, WrVicBlkShared, ClVicBlk, ClVicBlkShared}) {
+ zz_recycleRequestQueue;
+ }
+
+//check
+ transition({I}, WrVicBlkShared, I_O) {TagArrayRead}{
+ a_allocateBlock;
+ t_allocateTBE;
+ f_setFrom;
+// rd_copyDataFromRequest;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+//check
+ transition(S, WrVicBlkShared, S_O){TagArrayRead} {
+ t_allocateTBE;
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+// a stale writeback
+ transition(S, WrVicBlk, S_S){TagArrayRead} {
+ t_allocateTBE;
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+// a stale writeback
+ transition(E, WrVicBlk, E_E){TagArrayRead} {
+ t_allocateTBE;
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+// a stale writeback
+ transition(E, WrVicBlkShared, E_E){TagArrayRead} {
+ t_allocateTBE;
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+// a stale writeback
+ transition(O, WrVicBlk, O_O){TagArrayRead} {
+ t_allocateTBE;
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+// a stale writeback
+ transition(O, WrVicBlkShared, O_O){TagArrayRead} {
+ t_allocateTBE;
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+// a stale writeback
+ transition(M, WrVicBlk, M_M){TagArrayRead} {
+ t_allocateTBE;
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+// a stale writeback
+ transition(M, WrVicBlkShared, M_O){TagArrayRead} {
+ t_allocateTBE;
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+//check
+ transition({I}, ClVicBlk, I_E){TagArrayRead} {
+ t_allocateTBE;
+ f_setFrom;
+ a_allocateBlock;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+ transition({I}, ClVicBlkShared, I_S){TagArrayRead} {
+ t_allocateTBE;
+ f_setFrom;
+ a_allocateBlock;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+//check
+ transition(S, ClVicBlkShared, S_S){TagArrayRead} {
+ t_allocateTBE;
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+// a stale writeback
+ transition(E, ClVicBlk, E_E){TagArrayRead} {
+ t_allocateTBE;
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+// a stale writeback
+ transition(E, ClVicBlkShared, E_S){TagArrayRead} {
+ t_allocateTBE;
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+// a stale writeback
+ transition(O, ClVicBlk, O_O){TagArrayRead} {
+ t_allocateTBE;
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+// check. Original L3 had it going from O to O_S. Something can go from O to S only on writeback.
+ transition(O, ClVicBlkShared, O_O){TagArrayRead} {
+ t_allocateTBE;
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+// a stale writeback
+ transition(M, ClVicBlk, M_E){TagArrayRead} {
+ t_allocateTBE;
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+// a stale writeback
+ transition(M, ClVicBlkShared, M_S){TagArrayRead} {
+ t_allocateTBE;
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+
+ transition({MO_I}, {RdBlk, RdBlkS, RdBlkM, CtoD}) {
+ a_allocateBlock;
+ t_allocateTBE;
+ f_setFrom;
+ r_requestToTD;
+ p_popRequestQueue;
+ }
+
+ transition(MO_I, {WrVicBlkShared, WrVicBlk, ClVicBlk, ClVicBlkShared}, MOD_I) {
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+ // d_writeData touches the data array, so declare DataArrayWrite for
+ // correct resource/stat accounting — consistent with the sibling
+ // I_M/CPUDataShared and I_O transitions below.
+ transition(I_M, CPUData, M){TagArrayWrite, DataArrayWrite} {
+ uo_sendUnblockOwner;
+ dt_deallocateTBE;
+ d_writeData;
+ pr_popResponseQueue;
+ }
+
+ transition(I_M, CPUDataShared, O){TagArrayWrite, DataArrayWrite} {
+ uo_sendUnblockOwner;
+ dt_deallocateTBE;
+ d_writeData;
+ pr_popResponseQueue;
+ }
+
+ transition(I_O, {CPUData, CPUDataShared}, O){TagArrayWrite, DataArrayWrite} {
+ uo_sendUnblockOwner;
+ dt_deallocateTBE;
+ d_writeData;
+ pr_popResponseQueue;
+ }
+
+ transition(I_E, CPUData, E){TagArrayWrite, DataArrayWrite} {
+ uo_sendUnblockOwner;
+ dt_deallocateTBE;
+ d_writeData;
+ pr_popResponseQueue;
+ }
+
+ transition(I_E, CPUDataShared, S){TagArrayWrite, DataArrayWrite} {
+ us_sendUnblockSharer;
+ dt_deallocateTBE;
+ d_writeData;
+ pr_popResponseQueue;
+ }
+
+ transition(I_S, {CPUData, CPUDataShared}, S){TagArrayWrite, DataArrayWrite} {
+ us_sendUnblockSharer;
+ dt_deallocateTBE;
+ d_writeData;
+ pr_popResponseQueue;
+ }
+
+ transition(S_M, CPUDataShared, O){TagArrayWrite, DataArrayWrite} {
+ uo_sendUnblockOwner;
+ dt_deallocateTBE;
+ d_writeData;
+ ut_updateTag; // update tag on writeback hits.
+ pr_popResponseQueue;
+ }
+
+ transition(S_O, {CPUData, CPUDataShared}, O){TagArrayWrite, DataArrayWrite} {
+ uo_sendUnblockOwner;
+ dt_deallocateTBE;
+ d_writeData;
+ ut_updateTag; // update tag on writeback hits.
+ pr_popResponseQueue;
+ }
+
+ transition(S_E, CPUDataShared, S){TagArrayWrite, DataArrayWrite} {
+ us_sendUnblockSharer;
+ dt_deallocateTBE;
+ d_writeData;
+ ut_updateTag; // update tag on writeback hits.
+ pr_popResponseQueue;
+ }
+
+ transition(S_S, {CPUData, CPUDataShared}, S){TagArrayWrite, DataArrayWrite} {
+ us_sendUnblockSharer;
+ dt_deallocateTBE;
+ d_writeData;
+ ut_updateTag; // update tag on writeback hits.
+ pr_popResponseQueue;
+ }
+
+ transition(O_E, CPUDataShared, O){TagArrayWrite, DataArrayWrite} {
+ uo_sendUnblockOwner;
+ dt_deallocateTBE;
+ d_writeData;
+ ut_updateTag; // update tag on writeback hits.
+ pr_popResponseQueue;
+ }
+
+ transition(O_O, {CPUData, CPUDataShared}, O){TagArrayWrite, DataArrayWrite} {
+ uo_sendUnblockOwner;
+ dt_deallocateTBE;
+ d_writeData;
+ ut_updateTag; // update tag on writeback hits.
+ pr_popResponseQueue;
+ }
+
+ transition({D_I}, {CPUData, CPUDataShared}, I){TagArrayWrite} {
+ un_sendUnblockNotValid;
+ dt_deallocateTBE;
+ pr_popResponseQueue;
+ }
+
+ transition(MOD_I, {CPUData, CPUDataShared}, MO_I) {
+ un_sendUnblockNotValid;
+ rf_resetFrom;
+ pr_popResponseQueue;
+ }
+
+ transition({O,S,I}, CPUData) {
+ pr_popResponseQueue;
+ }
+
+ transition({M, O}, L2_Repl, MO_I){TagArrayRead, DataArrayRead} {
+ t_allocateTBE;
+ vd_vicDirty;
+ i_invL2;
+ }
+
+ // Clean victim on replacement from E or S: no dirty data to write back.
+ transition({E, S}, L2_Repl, ES_I){TagArrayRead, DataArrayRead} {
+ t_allocateTBE;
+ vc_vicClean;
+ i_invL2;
+ }
+
+ transition({I_M, I_O, S_M, S_O, E_M, E_O}, L2_Repl) {
+ zz_recycleRequestQueue;
+ }
+
+ transition({O_M, O_O, O_E, M_M, M_O, M_E, M_S}, L2_Repl) {
+ zz_recycleRequestQueue;
+ }
+
+ transition({I_E, I_S, S_E, S_S, E_E, E_S}, L2_Repl) {
+ zz_recycleRequestQueue;
+ }
+
+ transition({M, O}, PrbInvData, I){TagArrayRead, TagArrayWrite} {
+ pd_sendProbeResponseData;
+ i_invL2;
+ pp_popProbeQueue;
+ }
+
+ transition(I, PrbInvData){TagArrayRead, TagArrayWrite} {
+ pi_sendProbeResponseInv;
+ pp_popProbeQueue;
+ }
+
+ transition({E, S}, PrbInvData, I){TagArrayRead, TagArrayWrite} {
+ pd_sendProbeResponseData;
+ i_invL2;
+ pp_popProbeQueue;
+ }
+
+ transition({M, O, E, S, I}, PrbInv, I){TagArrayRead, TagArrayWrite} {
+ pi_sendProbeResponseInv;
+ i_invL2; // nothing will happen in I
+ pp_popProbeQueue;
+ }
+
+ transition({M, O}, PrbShrData, O){TagArrayRead, TagArrayWrite} {
+ pd_sendProbeResponseData;
+ pp_popProbeQueue;
+ }
+
+ transition({E, S}, PrbShrData, S){TagArrayRead, TagArrayWrite} {
+ pd_sendProbeResponseData;
+ pp_popProbeQueue;
+ }
+
+ transition(I, PrbShrData){TagArrayRead} {
+ pm_sendProbeResponseMiss;
+ pp_popProbeQueue;
+ }
+
+ transition(MO_I, PrbInvData, I_C) {
+ pdt_sendProbeResponseDataFromTBE;
+ pp_popProbeQueue;
+ }
+
+ transition(ES_I, PrbInvData, I_C) {
+ pi_sendProbeResponseInv;
+ pp_popProbeQueue;
+ }
+
+ transition({ES_I,MO_I}, PrbInv, I_C) {
+ pi_sendProbeResponseInv;
+ pp_popProbeQueue;
+ }
+
+ transition({ES_I, MO_I}, PrbShrData) {
+ pdt_sendProbeResponseDataFromTBE;
+ pp_popProbeQueue;
+ }
+
+ transition(I_C, {PrbInvData, PrbInv}) {
+ pi_sendProbeResponseInv;
+ pp_popProbeQueue;
+ }
+
+ transition(I_C, PrbShrData) {
+ pm_sendProbeResponseMiss;
+ pp_popProbeQueue;
+ }
+
+ transition(MOD_I, WBAck, D_I) {
+ pn_popTDResponseQueue;
+ }
+
+ transition(MO_I, WBAck, I){TagArrayWrite} {
+ dt_deallocateTBE;
+ pn_popTDResponseQueue;
+ }
+
+ // this can only be a spurious CPUData from a shared block.
+ transition(MO_I, CPUData) {
+ pr_popResponseQueue;
+ }
+
+ transition(ES_I, WBAck, I){TagArrayWrite} {
+ dt_deallocateTBE;
+ pn_popTDResponseQueue;
+ }
+
+ transition(I_C, {WBAck}, I){TagArrayWrite} {
+ dt_deallocateTBE;
+ pn_popTDResponseQueue;
+ }
+
+ transition({I_M, I_O, I_E, I_S}, StaleWB, I){TagArrayWrite} {
+ un_sendUnblockNotValid;
+ dt_deallocateTBE;
+ i_invL2;
+ pr_popResponseQueue;
+ }
+
+ transition({S_S, S_O, S_M, S_E}, StaleWB, S){TagArrayWrite} {
+ us_sendUnblockSharer;
+ dt_deallocateTBE;
+ pr_popResponseQueue;
+ }
+
+ transition({E_M, E_O, E_E, E_S}, StaleWB, E){TagArrayWrite} {
+ uo_sendUnblockOwner;
+ dt_deallocateTBE;
+ pr_popResponseQueue;
+ }
+
+ transition({O_M, O_O, O_E}, StaleWB, O){TagArrayWrite} {
+ uo_sendUnblockOwner;
+ dt_deallocateTBE;
+ pr_popResponseQueue;
+ }
+
+ transition({M_M, M_O, M_E, M_S}, StaleWB, M){TagArrayWrite} {
+ uo_sendUnblockOwner;
+ dt_deallocateTBE;
+ pr_popResponseQueue;
+ }
+
+ transition(D_I, StaleWB, I) {TagArrayWrite}{
+ un_sendUnblockNotValid;
+ dt_deallocateTBE;
+ pr_popResponseQueue;
+ }
+
+ transition(MOD_I, StaleWB, MO_I) {
+ un_sendUnblockNotValid;
+ rf_resetFrom;
+ pr_popResponseQueue;
+ }
+
+}
diff --git a/src/mem/protocol/GPU_RfO-TCCdir.sm b/src/mem/protocol/GPU_RfO-TCCdir.sm
new file mode 100644
index 000000000..8f58d6ebb
--- /dev/null
+++ b/src/mem/protocol/GPU_RfO-TCCdir.sm
@@ -0,0 +1,2672 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Mithuna Thottethodi
+ */
+
+machine(MachineType:TCCdir, "AMD read-for-ownership directory for TCC (aka GPU L2)")
+: CacheMemory * directory;
+ // Convention: wire buffers are prefixed with "w_" for clarity
+ WireBuffer * w_reqToTCCDir;
+ WireBuffer * w_respToTCCDir;
+ WireBuffer * w_TCCUnblockToTCCDir;
+ WireBuffer * w_reqToTCC;
+ WireBuffer * w_probeToTCC;
+ WireBuffer * w_respToTCC;
+ int TCC_select_num_bits;
+ Cycles response_latency := 5;
+ Cycles directory_latency := 6;
+ Cycles issue_latency := 120;
+
+ // From the TCPs or SQCs
+ MessageBuffer * requestFromTCP, network="From", virtual_network="1", vnet_type="request";
+ MessageBuffer * responseFromTCP, network="From", virtual_network="3", vnet_type="response";
+ MessageBuffer * unblockFromTCP, network="From", virtual_network="5", vnet_type="unblock";
+
+ // To the Cores. TCC deals only with TCPs/SQCs. CP cores do not communicate directly with TCC.
+ MessageBuffer * probeToCore, network="To", virtual_network="1", vnet_type="request";
+ MessageBuffer * responseToCore, network="To", virtual_network="3", vnet_type="response";
+
+ // From the NB
+ MessageBuffer * probeFromNB, network="From", virtual_network="0", vnet_type="request";
+ MessageBuffer * responseFromNB, network="From", virtual_network="2", vnet_type="response";
+ // To the NB
+ MessageBuffer * requestToNB, network="To", virtual_network="0", vnet_type="request";
+ MessageBuffer * responseToNB, network="To", virtual_network="2", vnet_type="response";
+ MessageBuffer * unblockToNB, network="To", virtual_network="4", vnet_type="unblock";
+
+ MessageBuffer * triggerQueue, random="false";
+{
+ // STATES
+ state_declaration(State, desc="Directory states", default="TCCdir_State_I") {
+ // Base states
+ I, AccessPermission:Invalid, desc="Invalid";
+ S, AccessPermission:Invalid, desc="Shared";
+ E, AccessPermission:Invalid, desc="Exclusive";
+ O, AccessPermission:Invalid, desc="Owner";
+ M, AccessPermission:Invalid, desc="Modified";
+
+ CP_I, AccessPermission:Invalid, desc="Blocked, must send data after acks are in, going to invalid";
+ B_I, AccessPermission:Invalid, desc="Blocked, need not send data after acks are in, going to invalid";
+ CP_O, AccessPermission:Invalid, desc="Blocked, must send data after acks are in, going to owned";
+ CP_S, AccessPermission:Invalid, desc="Blocked, must send data after acks are in, going to shared";
+ CP_OM, AccessPermission:Invalid, desc="Blocked, must send data after acks are in, going to O_M";
+ CP_SM, AccessPermission:Invalid, desc="Blocked, must send data after acks are in, going to S_M";
+ CP_ISM, AccessPermission:Invalid, desc="Blocked, must send data after acks are in, going to I_M";
+ // NOTE(review): desc duplicates CP_ISM's target ("I_M"); presumably the
+ // intended target is I_O — confirm against the transition table.
+ CP_IOM, AccessPermission:Invalid, desc="Blocked, must send data after acks are in, going to I_M";
+ CP_OSIW, AccessPermission:Invalid, desc="Blocked, must send data after acks+CancelWB are in, going to I_C";
+
+
+ // Transient states and busy states used for handling side (TCC-facing) interactions
+ BW_S, AccessPermission:Invalid, desc="Blocked, Awaiting TCC unblock";
+ BW_E, AccessPermission:Invalid, desc="Blocked, Awaiting TCC unblock";
+ BW_O, AccessPermission:Invalid, desc="Blocked, Awaiting TCC unblock";
+ BW_M, AccessPermission:Invalid, desc="Blocked, Awaiting TCC unblock";
+
+ // Transient states and busy states used for handling upward (TCP-facing) interactions
+ I_M, AccessPermission:Invalid, desc="Invalid, issued RdBlkM, have not seen response yet";
+ I_ES, AccessPermission:Invalid, desc="Invalid, issued RdBlk, have not seen response yet";
+ I_S, AccessPermission:Invalid, desc="Invalid, issued RdBlkS, have not seen response yet";
+ BBS_S, AccessPermission:Invalid, desc="Blocked, going from S to S";
+ BBO_O, AccessPermission:Invalid, desc="Blocked, going from O to O";
+ BBM_M, AccessPermission:Invalid, desc="Blocked, going from M to M, waiting for data to forward";
+ BBM_O, AccessPermission:Invalid, desc="Blocked, going from M to O, waiting for data to forward";
+ BB_M, AccessPermission:Invalid, desc="Blocked, going from M to M, waiting for unblock";
+ BB_O, AccessPermission:Invalid, desc="Blocked, going from M to O, waiting for unblock";
+ BB_OO, AccessPermission:Invalid, desc="Blocked, going from O to O (adding sharers), waiting for unblock";
+ BB_S, AccessPermission:Invalid, desc="Blocked, going to S, waiting for (possible multiple) unblock(s)";
+ BBS_M, AccessPermission:Invalid, desc="Blocked, going from S or O to M";
+ BBO_M, AccessPermission:Invalid, desc="Blocked, going from S or O to M";
+ BBS_UM, AccessPermission:Invalid, desc="Blocked, going from S or O to M via upgrade";
+ BBO_UM, AccessPermission:Invalid, desc="Blocked, going from S or O to M via upgrade";
+ S_M, AccessPermission:Invalid, desc="Shared, issued CtoD, have not seen response yet";
+ O_M, AccessPermission:Invalid, desc="Shared, issued CtoD, have not seen response yet";
+
+ // States blocked on a core unblock before settling in the final state
+ BBB_S, AccessPermission:Invalid, desc="Blocked, going to S after core unblock";
+ BBB_M, AccessPermission:Invalid, desc="Blocked, going to M after core unblock";
+ BBB_E, AccessPermission:Invalid, desc="Blocked, going to E after core unblock";
+
+ VES_I, AccessPermission:Invalid, desc="TCC replacement, waiting for clean WB ack";
+ VM_I, AccessPermission:Invalid, desc="TCC replacement, waiting for dirty WB ack";
+ VO_I, AccessPermission:Invalid, desc="TCC replacement, waiting for dirty WB ack";
+ VO_S, AccessPermission:Invalid, desc="TCC owner replacement, waiting for dirty WB ack";
+
+ ES_I, AccessPermission:Invalid, desc="L1 replacement, waiting for clean WB ack";
+ MO_I, AccessPermission:Invalid, desc="L1 replacement, waiting for dirty WB ack";
+
+ I_C, AccessPermission:Invalid, desc="Invalid, waiting for WBAck from NB for canceled WB";
+ I_W, AccessPermission:Invalid, desc="Invalid, waiting for WBAck from NB; canceled WB raced with directory invalidation";
+
+ // Recall States
+ BRWD_I, AccessPermission:Invalid, desc="Recalling, waiting for WBAck and Probe Data responses";
+ // NOTE(review): only recall state with Read_Write permission — confirm intentional.
+ BRW_I, AccessPermission:Read_Write, desc="Recalling, waiting for WBAck";
+ BRD_I, AccessPermission:Invalid, desc="Recalling, waiting for Probe Data responses";
+
+ }
+
+ // Resource/stat categories reported from transitions to recordStats.
+ enumeration(RequestType, desc="To communicate stats from transitions to recordStats") {
+ DataArrayRead, desc="Read the data array";
+ DataArrayWrite, desc="Write the data array";
+ TagArrayRead, desc="Read the tag array";
+ TagArrayWrite, desc="Write the tag array";
+ }
+
+
+
+ // EVENTS
+ enumeration(Event, desc="TCC Directory Events") {
+ // Upward facing events (TCCdir w.r.t. TCP/SQC and TCC behaves like NBdir behaves with TCP/SQC and L3)
+
+ // Directory Recall
+ Recall, desc="directory cache is full";
+ // CPU requests
+ CPUWrite, desc="Initial req from core, sent to TCC";
+ NoCPUWrite, desc="Initial req from core, but non-exclusive clean data; can be discarded";
+ CPUWriteCancel, desc="Initial req from core, sent to TCC";
+
+ // Requests from the TCPs
+ RdBlk, desc="RdBlk event";
+ RdBlkM, desc="RdBlkM event";
+ RdBlkS, desc="RdBlkS event";
+ CtoD, desc="Change to Dirty request";
+
+ // TCC writebacks
+ VicDirty, desc="...";
+ VicDirtyLast, desc="...";
+ VicClean, desc="...";
+ NoVic, desc="...";
+ StaleVic, desc="...";
+ CancelWB, desc="TCC got invalidating probe, canceled WB";
+
+ // Probe Responses from TCP/SQCs
+ CPUPrbResp, desc="Probe response from TCP/SQC";
+ TCCPrbResp, desc="Probe response from TCC";
+
+ ProbeAcksComplete, desc="All acks received";
+ ProbeAcksCompleteReissue, desc="All acks received, changing CtoD to reissue";
+
+ CoreUnblock, desc="unblock from TCP/SQC";
+ LastCoreUnblock, desc="Last unblock from TCP/SQC";
+ TCCUnblock, desc="unblock from TCC (current owner)";
+ TCCUnblock_Sharer, desc="unblock from TCC (a sharer, not owner)";
+ TCCUnblock_NotValid,desc="unblock from TCC (not valid...caused by stale writebacks)";
+
+ // Downward facing events
+
+ // NB initiated
+ NB_AckS, desc="NB Ack to TCC Request";
+ NB_AckE, desc="NB Ack to TCC Request";
+ NB_AckM, desc="NB Ack to TCC Request";
+ NB_AckCtoD, desc="NB Ack to TCC Request";
+ NB_AckWB, desc="NB Ack for clean WB";
+
+
+ // Incoming Probes from NB
+ PrbInvData, desc="Invalidating probe, return dirty data";
+ PrbInv, desc="Invalidating probe, no need to return data";
+ PrbShrData, desc="Downgrading probe, return data";
+ }
+
+
+ // TYPES
+
+ // Entry for directory
+ structure(Entry, desc="...", interface='AbstractCacheEntry') {
+ State CacheState, desc="Cache state (Cache of directory entries)";
+ DataBlock DataBlk, desc="data for the block";
+ NetDest Sharers, desc="Sharers for this block";
+ NetDest Owner, desc="Owner of this block";
+ NetDest MergedSharers, desc="Read sharers who are merged on a request";
+ int WaitingUnblocks, desc="Number of acks we're waiting for";
+ }
+
+ structure(TBE, desc="...") {
+ State TBEState, desc="Transient state";
+ DataBlock DataBlk, desc="DataBlk";
+ bool Dirty, desc="Is the data dirty?";
+ MachineID Requestor, desc="requestor";
+ int NumPendingAcks, desc="num acks expected";
+ MachineID OriginalRequestor, desc="Original Requestor";
+ MachineID UntransferredOwner, desc = "Untransferred owner for an upgrade transaction";
+ bool UntransferredOwnerExists, desc = "1 if Untransferred owner exists for an upgrade transaction";
+ bool Cached, desc="data hit in Cache";
+ bool Shared, desc="victim hit by shared probe";
+ bool Upgrade, desc="An upgrade request in progress";
+ bool CtoD, desc="Saved sysack info";
+ CoherenceState CohState, desc="Saved sysack info";
+ MessageSizeType MessageSize, desc="Saved sysack info";
+ MachineID Sender, desc="sender";
+ }
+
+ structure(TBETable, external = "yes") {
+ TBE lookup(Addr);
+ void allocate(Addr);
+ void deallocate(Addr);
+ bool isPresent(Addr);
+ }
+
+ // ** OBJECTS **
+ TBETable TBEs, template="<TCCdir_TBE>", constructor="m_number_of_TBEs";
+ int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()";
+ NetDest TCC_dir_subtree;
+ NetDest temp;
+
+ Tick clockEdge();
+ Tick cyclesToTicks(Cycles c);
+
+ void set_cache_entry(AbstractCacheEntry b);
+ void unset_cache_entry();
+ void set_tbe(TBE b);
+ void unset_tbe();
+
+
+ bool presentOrAvail(Addr addr) {
+ return directory.isTagPresent(addr) || directory.cacheAvail(addr);
+ }
+
+ Entry getCacheEntry(Addr addr), return_by_pointer="yes" {
+ return static_cast(Entry, "pointer", directory.lookup(addr));
+ }
+
+ // Return the data block for addr. Only a TBE can hold data at this
+ // directory; a lookup with no in-flight TBE is a protocol error.
+ DataBlock getDataBlock(Addr addr), return_by_ref="yes" {
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ return tbe.DataBlk;
+ } else {
+ // Unreachable after assert(false); kept only to satisfy the SLICC
+ // compiler's requirement that all paths return a value.
+ assert(false);
+ return getCacheEntry(addr).DataBlk;
+ }
+ }
+
+ State getState(TBE tbe, Entry cache_entry, Addr addr) {
+ if(is_valid(tbe)) {
+ return tbe.TBEState;
+ } else if (is_valid(cache_entry)) {
+ return cache_entry.CacheState;
+ }
+ return State:I;
+ }
+
+ void setAccessPermission(Entry cache_entry, Addr addr, State state) {
+ if (is_valid(cache_entry)) {
+ cache_entry.changePermission(TCCdir_State_to_permission(state));
+ }
+ }
+
+ AccessPermission getAccessPermission(Addr addr) {
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ return TCCdir_State_to_permission(tbe.TBEState);
+ }
+
+ Entry cache_entry := getCacheEntry(addr);
+ if(is_valid(cache_entry)) {
+ return TCCdir_State_to_permission(cache_entry.CacheState);
+ }
+
+ return AccessPermission:NotPresent;
+ }
+
+ void functionalRead(Addr addr, Packet *pkt) {
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ testAndRead(addr, tbe.DataBlk, pkt);
+ } else {
+ functionalMemoryRead(pkt);
+ }
+ }
+
+ int functionalWrite(Addr addr, Packet *pkt) {
+ int num_functional_writes := 0;
+
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ num_functional_writes := num_functional_writes +
+ testAndWrite(addr, tbe.DataBlk, pkt);
+ }
+
+ num_functional_writes := num_functional_writes + functionalMemoryWrite(pkt);
+ return num_functional_writes;
+ }
+
+ // Install the new state in the TBE and/or directory entry, and assert
+ // the per-state sharer/owner invariants this protocol maintains.
+ void setState(TBE tbe, Entry cache_entry, Addr addr, State state) {
+ if (is_valid(tbe)) {
+ tbe.TBEState := state;
+ }
+
+ if (is_valid(cache_entry)) {
+ cache_entry.CacheState := state;
+
+ // S: no owner (sharers only).
+ if (state == State:S) {
+ assert(cache_entry.Owner.count() == 0);
+ }
+
+ // O: exactly one owner, and the owner is not also listed as a sharer.
+ if (state == State:O) {
+ assert(cache_entry.Owner.count() == 1);
+ assert(cache_entry.Sharers.isSuperset(cache_entry.Owner) == false);
+ }
+
+ // M: sole owner, no sharers.
+ if (state == State:M) {
+ assert(cache_entry.Owner.count() == 1);
+ assert(cache_entry.Sharers.count() == 0);
+ }
+
+ // E: tracked as a single sharer with no owner in this protocol.
+ if (state == State:E) {
+ assert(cache_entry.Owner.count() == 0);
+ assert(cache_entry.Sharers.count() == 1);
+ }
+ }
+ }
+
+
+
+ void recordRequestType(RequestType request_type, Addr addr) {
+ if (request_type == RequestType:DataArrayRead) {
+ directory.recordRequestType(CacheRequestType:DataArrayRead, addr);
+ } else if (request_type == RequestType:DataArrayWrite) {
+ directory.recordRequestType(CacheRequestType:DataArrayWrite, addr);
+ } else if (request_type == RequestType:TagArrayRead) {
+ directory.recordRequestType(CacheRequestType:TagArrayRead, addr);
+ } else if (request_type == RequestType:TagArrayWrite) {
+ directory.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+ }
+ }
+
+ bool checkResourceAvailable(RequestType request_type, Addr addr) {
+ if (request_type == RequestType:DataArrayRead) {
+ return directory.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:DataArrayWrite) {
+ return directory.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:TagArrayRead) {
+ return directory.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else if (request_type == RequestType:TagArrayWrite) {
+ return directory.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else {
+ error("Invalid RequestType type in checkResourceAvailable");
+ return true;
+ }
+ }
+
+ // ** OUT_PORTS **
+
+ // Three classes of ports
+ // Class 1: downward facing network links to NB
+ out_port(requestToNB_out, CPURequestMsg, requestToNB);
+ out_port(responseToNB_out, ResponseMsg, responseToNB);
+ out_port(unblockToNB_out, UnblockMsg, unblockToNB);
+
+
+ // Class 2: upward facing ports to GPU cores
+ out_port(probeToCore_out, TDProbeRequestMsg, probeToCore);
+ out_port(responseToCore_out, ResponseMsg, responseToCore);
+
+ // Class 3: sideward facing ports (on "wirebuffer" links) to TCC
+ out_port(w_requestTCC_out, CPURequestMsg, w_reqToTCC);
+ out_port(w_probeTCC_out, NBProbeRequestMsg, w_probeToTCC);
+ out_port(w_respTCC_out, ResponseMsg, w_respToTCC);
+
+
+ // local trigger port
+ out_port(triggerQueue_out, TriggerMsg, triggerQueue);
+
+ //
+ // request queue going to NB
+ //
+
+ // ** IN_PORTS **
+
+ // Trigger Queue
+ in_port(triggerQueue_in, TriggerMsg, triggerQueue, rank=8) {
+ if (triggerQueue_in.isReady(clockEdge())) {
+ peek(triggerQueue_in, TriggerMsg) {
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ assert(is_valid(tbe));
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ if ((in_msg.Type == TriggerType:AcksComplete) && (tbe.Upgrade == false)) {
+ trigger(Event:ProbeAcksComplete, in_msg.addr, cache_entry, tbe);
+ } else if ((in_msg.Type == TriggerType:AcksComplete) && (tbe.Upgrade == true)) {
+ trigger(Event:ProbeAcksCompleteReissue, in_msg.addr, cache_entry, tbe);
+ }
+ }
+ }
+ }
+
+ // Unblock Networks (TCCdir can receive unblocks from TCC, TCPs)
+ // Port on first (of three) wire buffers from TCC
+ in_port(w_TCCUnblock_in, UnblockMsg, w_TCCUnblockToTCCDir, rank=7) {
+ if (w_TCCUnblock_in.isReady(clockEdge())) {
+ peek(w_TCCUnblock_in, UnblockMsg) {
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ if (in_msg.currentOwner) {
+ trigger(Event:TCCUnblock, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.valid) {
+ trigger(Event:TCCUnblock_Sharer, in_msg.addr, cache_entry, tbe);
+ } else {
+ trigger(Event:TCCUnblock_NotValid, in_msg.addr, cache_entry, tbe);
+ }
+ }
+ }
+ }
+
+ // Unblocks arriving from the TCP/SQC cores; fires LastCoreUnblock when
+ // only one unblock remains outstanding for the entry.
+ in_port(unblockNetwork_in, UnblockMsg, unblockFromTCP, rank=6) {
+ if (unblockNetwork_in.isReady(clockEdge())) {
+ peek(unblockNetwork_in, UnblockMsg) {
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ // NOTE(review): WaitingUnblocks is read without an is_valid(cache_entry)
+ // guard — relies on an entry always existing when an unblock arrives;
+ // confirm this invariant holds.
+ if(cache_entry.WaitingUnblocks == 1) {
+ trigger(Event:LastCoreUnblock, in_msg.addr, cache_entry, tbe);
+ }
+ else {
+ trigger(Event:CoreUnblock, in_msg.addr, cache_entry, tbe);
+ }
+ }
+ }
+ }
+
+
+ //Responses from TCC, and Cores
+ // Port on second (of three) wire buffers from TCC
+ in_port(w_TCCResponse_in, ResponseMsg, w_respToTCCDir, rank=5) {
+ if (w_TCCResponse_in.isReady(clockEdge())) {
+ peek(w_TCCResponse_in, ResponseMsg) {
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ if (in_msg.Type == CoherenceResponseType:CPUPrbResp) {
+ trigger(Event:TCCPrbResp, in_msg.addr, cache_entry, tbe);
+ }
+ }
+ }
+ }
+
+ in_port(responseNetwork_in, ResponseMsg, responseFromTCP, rank=4) {
+ if (responseNetwork_in.isReady(clockEdge())) {
+ peek(responseNetwork_in, ResponseMsg) {
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ if (in_msg.Type == CoherenceResponseType:CPUPrbResp) {
+ trigger(Event:CPUPrbResp, in_msg.addr, cache_entry, tbe);
+ }
+ }
+ }
+ }
+
+
+ // Port on third (of three) wire buffers from TCC
+ // Port on third (of three) wire buffers from TCC: classifies TCC-initiated
+ // writebacks/cancels into VicDirty(Last)/VicClean/NoVic/StaleVic/CancelWB.
+ in_port(w_TCCRequest_in, CPURequestMsg, w_reqToTCCDir, rank=3) {
+ if(w_TCCRequest_in.isReady(clockEdge())) {
+ peek(w_TCCRequest_in, CPURequestMsg) {
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ if (in_msg.Type == CoherenceRequestType:WrCancel) {
+ trigger(Event:CancelWB, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:VicDirty) {
+ if (is_valid(cache_entry) && cache_entry.Owner.isElement(in_msg.Requestor)) {
+ // if modified, or owner with no other sharers
+ if ((cache_entry.CacheState == State:M) || (cache_entry.Sharers.count() == 0)) {
+ assert(cache_entry.Owner.count()==1);
+ trigger(Event:VicDirtyLast, in_msg.addr, cache_entry, tbe);
+ } else {
+ trigger(Event:VicDirty, in_msg.addr, cache_entry, tbe);
+ }
+ } else {
+ // Requestor is not the tracked owner: stale writeback.
+ trigger(Event:StaleVic, in_msg.addr, cache_entry, tbe);
+ }
+ } else {
+ if (in_msg.Type == CoherenceRequestType:VicClean) {
+ if (is_valid(cache_entry) && cache_entry.Sharers.isElement(in_msg.Requestor)) {
+ if (cache_entry.Sharers.count() == 1) {
+ // Last copy, victimize to L3
+ trigger(Event:VicClean, in_msg.addr, cache_entry, tbe);
+ } else {
+ // Either not the last copy or stall. No need to victimize
+ // remove sharer from sharer list
+ assert(cache_entry.Sharers.count() > 1);
+ trigger(Event:NoVic, in_msg.addr, cache_entry, tbe);
+ }
+ } else {
+ trigger(Event:StaleVic, in_msg.addr, cache_entry, tbe);
+ }
+ }
+ }
+ }
+ }
+ }
+
  // Responses from the NB (northbridge) directory: system acks for our own
  // requests (with coherence state) and writeback acks. block_on="addr"
  // serializes handling per address.
  in_port(responseFromNB_in, ResponseMsg, responseFromNB, rank=2) {
    if (responseFromNB_in.isReady(clockEdge())) {
      peek(responseFromNB_in, ResponseMsg, block_on="addr") {

        TBE tbe := TBEs.lookup(in_msg.addr);
        Entry cache_entry := getCacheEntry(in_msg.addr);
        if (in_msg.Type == CoherenceResponseType:NBSysResp) {
          // Map the granted coherence state onto the matching ack event.
          if (in_msg.State == CoherenceState:Modified) {
            if (in_msg.CtoD) {
              trigger(Event:NB_AckCtoD, in_msg.addr, cache_entry, tbe);
            } else {
              trigger(Event:NB_AckM, in_msg.addr, cache_entry, tbe);
            }
          } else if (in_msg.State == CoherenceState:Shared) {
            trigger(Event:NB_AckS, in_msg.addr, cache_entry, tbe);
          } else if (in_msg.State == CoherenceState:Exclusive) {
            trigger(Event:NB_AckE, in_msg.addr, cache_entry, tbe);
          }
        } else if (in_msg.Type == CoherenceResponseType:NBSysWBAck) {
          trigger(Event:NB_AckWB, in_msg.addr, cache_entry, tbe);
        } else {
          error("Unexpected Response Message to Core");
        }
      }
    }
  }
+
  // Finally handling incoming requests (from TCP) and probes (from NB).

  // Probes from the NB directory: invalidations (with or without data
  // return) and downgrades (which always return data).
  in_port(probeNetwork_in, NBProbeRequestMsg, probeFromNB, rank=1) {
    if (probeNetwork_in.isReady(clockEdge())) {
      peek(probeNetwork_in, NBProbeRequestMsg) {
        DPRINTF(RubySlicc, "%s\n", in_msg);
        DPRINTF(RubySlicc, "machineID: %s\n", machineID);
        Entry cache_entry := getCacheEntry(in_msg.addr);
        TBE tbe := TBEs.lookup(in_msg.addr);

        if (in_msg.Type == ProbeRequestType:PrbInv) {
          if (in_msg.ReturnData) {
            trigger(Event:PrbInvData, in_msg.addr, cache_entry, tbe);
          } else {
            trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe);
          }
        } else if (in_msg.Type == ProbeRequestType:PrbDowngrade) {
          // Downgrade probes must return data in this protocol.
          assert(in_msg.ReturnData);
          trigger(Event:PrbShrData, in_msg.addr, cache_entry, tbe);
        }
      }
    }
  }
+
+
  // Requests from the GPU cores (TCP/SQC). Every request needs a directory
  // entry; if none is present or available, a victim is recalled first.
  in_port(coreRequestNetwork_in, CPURequestMsg, requestFromTCP, rank=0) {
    if (coreRequestNetwork_in.isReady(clockEdge())) {
      peek(coreRequestNetwork_in, CPURequestMsg) {
        TBE tbe := TBEs.lookup(in_msg.addr);
        Entry cache_entry := getCacheEntry(in_msg.addr);
        if (presentOrAvail(in_msg.addr)) {
          if (in_msg.Type == CoherenceRequestType:VicDirty) {
            trigger(Event:CPUWrite, in_msg.addr, cache_entry, tbe);
          } else if (in_msg.Type == CoherenceRequestType:VicClean) {
            if (is_valid(cache_entry) && cache_entry.Owner.isElement(in_msg.Requestor)) {
              trigger(Event:CPUWrite, in_msg.addr, cache_entry, tbe);
            } else if(is_valid(cache_entry) && (cache_entry.Sharers.count() + cache_entry.Owner.count() ) >1) {
              // Other copies remain; no writeback to NB needed.
              trigger(Event:NoCPUWrite, in_msg.addr, cache_entry, tbe);
            } else {
              trigger(Event:CPUWrite, in_msg.addr, cache_entry, tbe);
            }
          } else if (in_msg.Type == CoherenceRequestType:RdBlk) {
            trigger(Event:RdBlk, in_msg.addr, cache_entry, tbe);
          } else if (in_msg.Type == CoherenceRequestType:RdBlkS) {
            trigger(Event:RdBlkS, in_msg.addr, cache_entry, tbe);
          } else if (in_msg.Type == CoherenceRequestType:RdBlkM) {
            trigger(Event:RdBlkM, in_msg.addr, cache_entry, tbe);
          } else if (in_msg.Type == CoherenceRequestType:WrCancel) {
            trigger(Event:CPUWriteCancel, in_msg.addr, cache_entry, tbe);
          }
        } else {
          // All requests require a directory entry
          Addr victim := directory.cacheProbe(in_msg.addr);
          trigger(Event:Recall, victim, getCacheEntry(victim), TBEs.lookup(victim));
        }
      }
    }
  }
+
+
+
+
+ // Actions
+
+ //Downward facing actions
+
  // Clear the owner NetDest of the directory entry.
  action(c_clearOwner, "c", desc="Clear the owner field") {
    cache_entry.Owner.clear();
  }

  // Remove the sender of the pending unblock message from the sharer list.
  action(rS_removeRequesterFromSharers, "rS", desc="Remove unblocker from sharer list") {
    peek(unblockNetwork_in, UnblockMsg) {
      cache_entry.Sharers.remove(in_msg.Sender);
    }
  }

  // Remove the TCC (requestor of the pending TCC request) from the sharers.
  action(rT_removeTCCFromSharers, "rT", desc="Remove TCC from sharer list") {
    peek(w_TCCRequest_in, CPURequestMsg) {
      cache_entry.Sharers.remove(in_msg.Requestor);
    }
  }

  // Remove the requesting core (from the core request network) from sharers.
  action(rO_removeOriginalRequestorFromSharers, "rO", desc="Remove replacing core from sharer list") {
    peek(coreRequestNetwork_in, CPURequestMsg) {
      cache_entry.Sharers.remove(in_msg.Requestor);
    }
  }

  // Same effect as rO; kept as a separate action for transition readability.
  action(rC_removeCoreFromSharers, "rC", desc="Remove replacing core from sharer list") {
    peek(coreRequestNetwork_in, CPURequestMsg) {
      cache_entry.Sharers.remove(in_msg.Requestor);
    }
  }

  action(rCo_removeCoreFromOwner, "rCo", desc="Remove replacing core from sharer list") {
    // Note that under some cases this action will try to remove a stale owner
    peek(coreRequestNetwork_in, CPURequestMsg) {
      cache_entry.Owner.remove(in_msg.Requestor);
    }
  }

  // Remove the sender of the pending probe response from the sharer list.
  action(rR_removeResponderFromSharers, "rR", desc="Remove responder from sharer list") {
    peek(responseNetwork_in, ResponseMsg) {
      cache_entry.Sharers.remove(in_msg.Sender);
    }
  }
+
  // Release a core blocked on a writeback with a data-less nack
  // (TDSysWBNack) — no data transfer actually takes place.
  action(nC_sendNullWBAckToCore, "nC", desc = "send a null WB Ack to release core") {
    peek(coreRequestNetwork_in, CPURequestMsg) {
      enqueue(responseToCore_out, ResponseMsg, 1) {
        out_msg.addr := address;
        out_msg.Type := CoherenceResponseType:TDSysWBNack;
        out_msg.Sender := machineID;
        out_msg.Destination.add(in_msg.Requestor);
        out_msg.MessageSize := in_msg.MessageSize;
      }
    }
  }

  // Release the TCC blocked on a writeback with a data-less ack (TDSysWBAck).
  action(nT_sendNullWBAckToTCC, "nT", desc = "send a null WB Ack to release TCC") {
    peek(w_TCCRequest_in, CPURequestMsg) {
      enqueue(w_respTCC_out, ResponseMsg, 1) {
        out_msg.addr := address;
        out_msg.Type := CoherenceResponseType:TDSysWBAck;
        out_msg.Sender := machineID;
        out_msg.Destination.add(in_msg.Requestor);
        out_msg.MessageSize := in_msg.MessageSize;
      }
    }
  }
+
  // Promote the single (exclusive) sharer to owner and empty the sharer list.
  action(eto_moveExSharerToOwner, "eto", desc="move the current exclusive sharer to owner") {
    assert(cache_entry.Sharers.count() == 1);
    assert(cache_entry.Owner.count() == 0);
    cache_entry.Owner := cache_entry.Sharers;
    cache_entry.Sharers.clear();
    APPEND_TRANSITION_COMMENT(" new owner ");
    APPEND_TRANSITION_COMMENT(cache_entry.Owner);
  }

  // Add the TCC (sender of the pending TCC unblock) to the sharer list.
  action(aT_addTCCToSharers, "aT", desc="Add TCC to sharer list") {
    peek(w_TCCUnblock_in, UnblockMsg) {
      cache_entry.Sharers.add(in_msg.Sender);
    }
  }

  // Add the core unblock sender to the sharer list.
  action(as_addToSharers, "as", desc="Add unblocker to sharer list") {
    peek(unblockNetwork_in, UnblockMsg) {
      cache_entry.Sharers.add(in_msg.Sender);
    }
  }

  // Demote the current owner to an ordinary sharer.
  action(c_moveOwnerToSharer, "cc", desc="Move owner to sharers") {
    cache_entry.Sharers.addNetDest(cache_entry.Owner);
    cache_entry.Owner.clear();
  }

  action(cc_clearSharers, "\c", desc="Clear the sharers field") {
    cache_entry.Sharers.clear();
  }

  // Make the core unblock sender the sole owner.
  action(e_ownerIsUnblocker, "e", desc="The owner is now the unblocker") {
    peek(unblockNetwork_in, UnblockMsg) {
      cache_entry.Owner.clear();
      cache_entry.Owner.add(in_msg.Sender);
      APPEND_TRANSITION_COMMENT(" tcp_ub owner ");
      APPEND_TRANSITION_COMMENT(cache_entry.Owner);
    }
  }

  // Make the TCC unblock sender the sole owner.
  action(eT_ownerIsUnblocker, "eT", desc="TCC (unblocker) is now owner") {
    peek(w_TCCUnblock_in, UnblockMsg) {
      cache_entry.Owner.clear();
      cache_entry.Owner.add(in_msg.Sender);
      APPEND_TRANSITION_COMMENT(" tcc_ub owner ");
      APPEND_TRANSITION_COMMENT(cache_entry.Owner);
    }
  }
+
  // Copy a TCC probe response into the TBE. Once the TBE holds dirty data,
  // later (clean) responses must not overwrite it.
  action(ctr_copyTCCResponseToTBE, "ctr", desc="Copy TCC probe response data to TBE") {
    peek(w_TCCResponse_in, ResponseMsg) {
      // Overwrite data if tbe does not hold dirty data. Stop once it is dirty.
      if(tbe.Dirty == false) {
        tbe.DataBlk := in_msg.DataBlk;
        tbe.Dirty := in_msg.Dirty;
        tbe.Sender := in_msg.Sender;
      }
      DPRINTF(RubySlicc, "%s\n", (tbe.DataBlk));
    }
  }

  // Copy a core probe response into the TBE; same dirty-sticky policy as ctr.
  action(ccr_copyCoreResponseToTBE, "ccr", desc="Copy core probe response data to TBE") {
    peek(responseNetwork_in, ResponseMsg) {
      // Overwrite data if tbe does not hold dirty data. Stop once it is dirty.
      if(tbe.Dirty == false) {
        tbe.DataBlk := in_msg.DataBlk;
        tbe.Dirty := in_msg.Dirty;

        // Only take the responder as Sender if none was recorded yet
        // (Sender is initialized to machineID at TBE allocation).
        if(tbe.Sender == machineID) {
          tbe.Sender := in_msg.Sender;
        }
      }
      DPRINTF(RubySlicc, "%s\n", (tbe.DataBlk));
    }
  }

  action(cd_clearDirtyBitTBE, "cd", desc="Clear Dirty bit in TBE") {
    tbe.Dirty := false;
  }
+
  // Issue a plain read-block request to the NB directory.
  action(n_issueRdBlk, "n-", desc="Issue RdBlk") {
    enqueue(requestToNB_out, CPURequestMsg, issue_latency) {
      out_msg.addr := address;
      out_msg.Type := CoherenceRequestType:RdBlk;
      out_msg.Requestor := machineID;
      out_msg.Destination.add(map_Address_to_Directory(address));
      out_msg.MessageSize := MessageSizeType:Request_Control;
    }
  }

  // Issue a read-block-shared request to the NB directory.
  action(nS_issueRdBlkS, "nS", desc="Issue RdBlkS") {
    enqueue(requestToNB_out, CPURequestMsg, issue_latency) {
      out_msg.addr := address;
      out_msg.Type := CoherenceRequestType:RdBlkS;
      out_msg.Requestor := machineID;
      out_msg.Destination.add(map_Address_to_Directory(address));
      out_msg.MessageSize := MessageSizeType:Request_Control;
    }
  }

  // Issue a read-block-modified (write intent) request to the NB directory.
  action(nM_issueRdBlkM, "nM", desc="Issue RdBlkM") {
    enqueue(requestToNB_out, CPURequestMsg, issue_latency) {
      out_msg.addr := address;
      out_msg.Type := CoherenceRequestType:RdBlkM;
      out_msg.Requestor := machineID;
      out_msg.Destination.add(map_Address_to_Directory(address));
      out_msg.MessageSize := MessageSizeType:Request_Control;
    }
  }
+
  // Record in the TBE that the in-flight transaction is an upgrade.
  action(rU_rememberUpgrade, "rU", desc="Remember that this was an upgrade") {
    tbe.Upgrade := true;
  }

  // Record an owner (from a core response) that declined to transfer
  // ownership, so it can be handled at transaction completion.
  action(ruo_rememberUntransferredOwner, "ruo", desc="Remember the untransferred owner") {
    peek(responseNetwork_in, ResponseMsg) {
      if(in_msg.UntransferredOwner == true) {
        tbe.UntransferredOwner := in_msg.Sender;
        tbe.UntransferredOwnerExists := true;
      }
      DPRINTF(RubySlicc, "%s\n", (in_msg));
    }
  }

  // Same as ruo but for responses arriving from the TCC.
  action(ruoT_rememberUntransferredOwnerTCC, "ruoT", desc="Remember the untransferred owner") {
    peek(w_TCCResponse_in, ResponseMsg) {
      if(in_msg.UntransferredOwner == true) {
        tbe.UntransferredOwner := in_msg.Sender;
        tbe.UntransferredOwnerExists := true;
      }
      DPRINTF(RubySlicc, "%s\n", (in_msg));
    }
  }
+
  // Victimize dirty (M/O) data to the NB directory; Shared reflects whether
  // other copies may exist (owned state O).
  action(vd_victim, "vd", desc="Victimize M/O Data") {
    enqueue(requestToNB_out, CPURequestMsg, issue_latency) {
      out_msg.addr := address;
      out_msg.Requestor := machineID;
      out_msg.Destination.add(map_Address_to_Directory(address));
      out_msg.MessageSize := MessageSizeType:Request_Control;
      out_msg.Type := CoherenceRequestType:VicDirty;
      if (cache_entry.CacheState == State:O) {
        out_msg.Shared := true;
      } else {
        out_msg.Shared := false;
      }
      out_msg.Dirty := true;
    }
  }

  // Victimize clean (E/S) data to the NB directory.
  action(vc_victim, "vc", desc="Victimize E/S Data") {
    enqueue(requestToNB_out, CPURequestMsg, issue_latency) {
      out_msg.addr := address;
      out_msg.Requestor := machineID;
      out_msg.Destination.add(map_Address_to_Directory(address));
      out_msg.MessageSize := MessageSizeType:Request_Control;
      out_msg.Type := CoherenceRequestType:VicClean;
      if (cache_entry.CacheState == State:S) {
        out_msg.Shared := true;
      } else {
        out_msg.Shared := false;
      }
      out_msg.Dirty := false;
    }
  }
+
+
  // Forward the pending core request, unchanged, to the TCC bank
  // responsible for this address (selected via mapAddressToRange).
  action(sT_sendRequestToTCC, "sT", desc="send request to TCC") {
    peek(coreRequestNetwork_in, CPURequestMsg) {
      enqueue(w_requestTCC_out, CPURequestMsg, 1) {
        out_msg.addr := address;
        out_msg.Type := in_msg.Type;
        out_msg.Requestor := in_msg.Requestor;
        out_msg.DataBlk := in_msg.DataBlk;
        out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
                                TCC_select_low_bit, TCC_select_num_bits));
        out_msg.Shared := in_msg.Shared;
        out_msg.MessageSize := in_msg.MessageSize;
      }
      APPEND_TRANSITION_COMMENT(" requestor ");
      APPEND_TRANSITION_COMMENT(in_msg.Requestor);

    }
  }
+
+
  // Downgrade-probe all cores holding the block (sharers + owner),
  // excluding the TCC (which is probed separately), requesting data back.
  // Sets tbe.NumPendingAcks to the number of probes sent.
  action(sc_probeShrCoreData, "sc", desc="probe shared cores, return data") {
    MachineID tcc := mapAddressToRange(address,MachineType:TCC,
                                       TCC_select_low_bit, TCC_select_num_bits);

    // 'temp' is a controller-scope NetDest scratch variable (declared above
    // this chunk).
    temp := cache_entry.Sharers;
    temp.addNetDest(cache_entry.Owner);
    if (temp.isElement(tcc)) {
      temp.remove(tcc);
    }
    if (temp.count() > 0) {
      enqueue(probeToCore_out, TDProbeRequestMsg, response_latency) {
        out_msg.addr := address;
        out_msg.Type := ProbeRequestType:PrbDowngrade;
        out_msg.ReturnData := true;
        out_msg.MessageSize := MessageSizeType:Control;
        out_msg.Destination := temp;
        tbe.NumPendingAcks := temp.count();
        // In M there is exactly one holder, so exactly one probe.
        if(cache_entry.CacheState == State:M) {
          assert(tbe.NumPendingAcks == 1);
        }
        DPRINTF(RubySlicc, "%s\n", (out_msg));
      }
    }
  }
+
  // Downgrade-probe the TCC (L2) if it holds the block, requesting data.
  // Increments the pending-ack count by one.
  action(ls2_probeShrL2Data, "ls2", desc="local probe downgrade L2, return data") {
    MachineID tcc := mapAddressToRange(address,MachineType:TCC,
                                       TCC_select_low_bit, TCC_select_num_bits);
    if ((cache_entry.Sharers.isElement(tcc)) || (cache_entry.Owner.isElement(tcc))) {
      enqueue(w_probeTCC_out, TDProbeRequestMsg, 1) {
        out_msg.addr := address;
        out_msg.Type := ProbeRequestType:PrbDowngrade;
        out_msg.ReturnData := true;
        out_msg.MessageSize := MessageSizeType:Control;
        out_msg.Destination.add(tcc);
        tbe.NumPendingAcks := tbe.NumPendingAcks + 1;
        DPRINTF(RubySlicc, "%s\n", out_msg);

      }
    }
  }

  // NOTE(review): body is identical to ls2_probeShrL2Data; the two actions
  // exist so local and non-local transitions read distinctly.
  action(s2_probeShrL2Data, "s2", desc="probe shared L2, return data") {
    MachineID tcc := mapAddressToRange(address,MachineType:TCC,
                                       TCC_select_low_bit, TCC_select_num_bits);
    if ((cache_entry.Sharers.isElement(tcc)) || (cache_entry.Owner.isElement(tcc))) {
      enqueue(w_probeTCC_out, TDProbeRequestMsg, 1) {
        out_msg.addr := address;
        out_msg.Type := ProbeRequestType:PrbDowngrade;
        out_msg.ReturnData := true;
        out_msg.MessageSize := MessageSizeType:Control;
        out_msg.Destination.add(tcc);
        tbe.NumPendingAcks := tbe.NumPendingAcks + 1;
        DPRINTF(RubySlicc, "%s\n", out_msg);

      }
    }
  }
+
  // Invalidate-probe all cores holding the block except the TCC and the
  // local requestor itself, requesting data back. Sets NumPendingAcks to
  // the number of probe targets.
  action(ldc_probeInvCoreData, "ldc", desc="local probe to inv cores, return data") {
    MachineID tcc := mapAddressToRange(address,MachineType:TCC,
                                       TCC_select_low_bit, TCC_select_num_bits);
    peek(coreRequestNetwork_in, CPURequestMsg) {
      NetDest dest:= cache_entry.Sharers;
      dest.addNetDest(cache_entry.Owner);
      if(dest.isElement(tcc)){
        dest.remove(tcc);
      }
      // The requesting core must not be probed for its own request.
      dest.remove(in_msg.Requestor);
      tbe.NumPendingAcks := dest.count();
      if (dest.count()>0){
        enqueue(probeToCore_out, TDProbeRequestMsg, response_latency) {
          out_msg.addr := address;
          out_msg.Type := ProbeRequestType:PrbInv;
          out_msg.ReturnData := true;
          out_msg.MessageSize := MessageSizeType:Control;

          out_msg.Destination.addNetDest(dest);
          if(cache_entry.CacheState == State:M) {
            assert(tbe.NumPendingAcks == 1);
          }

          DPRINTF(RubySlicc, "%s\n", (out_msg));
        }
      }
    }
  }

  // Invalidate-probe the TCC (L2) if it holds the block, requesting data;
  // increments the pending-ack count by one.
  action(ld2_probeInvL2Data, "ld2", desc="local probe inv L2, return data") {
    MachineID tcc := mapAddressToRange(address,MachineType:TCC,
                                       TCC_select_low_bit, TCC_select_num_bits);
    if ((cache_entry.Sharers.isElement(tcc)) || (cache_entry.Owner.isElement(tcc))) {
      enqueue(w_probeTCC_out, TDProbeRequestMsg, 1) {
        out_msg.addr := address;
        out_msg.Type := ProbeRequestType:PrbInv;
        out_msg.ReturnData := true;
        out_msg.MessageSize := MessageSizeType:Control;
        out_msg.Destination.add(tcc);
        tbe.NumPendingAcks := tbe.NumPendingAcks + 1;
        DPRINTF(RubySlicc, "%s\n", out_msg);

      }
    }
  }
+
  // Invalidate-probe every holder (sharers + owner), requesting data. The
  // TCC is then removed from the destination set (it is probed via d2) and
  // the pending-ack count is adjusted down accordingly.
  action(dc_probeInvCoreData, "dc", desc="probe inv cores + TCC, return data") {
    MachineID tcc := mapAddressToRange(address,MachineType:TCC,
                                       TCC_select_low_bit, TCC_select_num_bits);
    enqueue(probeToCore_out, TDProbeRequestMsg, response_latency) {
      out_msg.addr := address;
      out_msg.Type := ProbeRequestType:PrbInv;
      out_msg.ReturnData := true;
      out_msg.MessageSize := MessageSizeType:Control;

      out_msg.Destination.addNetDest(cache_entry.Sharers);
      out_msg.Destination.addNetDest(cache_entry.Owner);
      tbe.NumPendingAcks := cache_entry.Sharers.count() + cache_entry.Owner.count();
      if(cache_entry.CacheState == State:M) {
        assert(tbe.NumPendingAcks == 1);
      }
      if (out_msg.Destination.isElement(tcc)) {
        out_msg.Destination.remove(tcc);
        tbe.NumPendingAcks := tbe.NumPendingAcks - 1;
      }

      DPRINTF(RubySlicc, "%s\n", (out_msg));
    }
  }

  // Invalidate-probe the TCC (L2) if it holds the block, requesting data;
  // increments the pending-ack count by one.
  action(d2_probeInvL2Data, "d2", desc="probe inv L2, return data") {
    MachineID tcc := mapAddressToRange(address,MachineType:TCC,
                                       TCC_select_low_bit, TCC_select_num_bits);
    if ((cache_entry.Sharers.isElement(tcc)) || (cache_entry.Owner.isElement(tcc))) {
      enqueue(w_probeTCC_out, TDProbeRequestMsg, 1) {
        out_msg.addr := address;
        out_msg.Type := ProbeRequestType:PrbInv;
        out_msg.ReturnData := true;
        out_msg.MessageSize := MessageSizeType:Control;
        out_msg.Destination.add(tcc);
        tbe.NumPendingAcks := tbe.NumPendingAcks + 1;
        DPRINTF(RubySlicc, "%s\n", out_msg);

      }
    }
  }
+
  // Data-less invalidate of all TCP/SQC holders below this directory,
  // excluding the local requestor (localCtoD upgrade path).
  // TCC_dir_subtree is a controller-scope NetDest scratch variable.
  action(lpc_probeInvCore, "lpc", desc="local probe inv cores, no data") {
    peek(coreRequestNetwork_in, CPURequestMsg) {
      // Start from every TCP and SQC, then intersect with actual holders.
      TCC_dir_subtree.broadcast(MachineType:TCP);
      TCC_dir_subtree.broadcast(MachineType:SQC);

      temp := cache_entry.Sharers;
      temp := temp.OR(cache_entry.Owner);
      TCC_dir_subtree := TCC_dir_subtree.AND(temp);
      tbe.NumPendingAcks := TCC_dir_subtree.count();
      if(cache_entry.CacheState == State:M) {
        assert(tbe.NumPendingAcks == 1);
      }
      // Do not probe the requestor for its own upgrade.
      if(TCC_dir_subtree.isElement(in_msg.Requestor)) {
        TCC_dir_subtree.remove(in_msg.Requestor);
        tbe.NumPendingAcks := tbe.NumPendingAcks - 1;
      }

      if(TCC_dir_subtree.count() > 0) {
        enqueue(probeToCore_out, TDProbeRequestMsg, response_latency) {
          out_msg.addr := address;
          out_msg.Type := ProbeRequestType:PrbInv;
          out_msg.ReturnData := false;
          out_msg.MessageSize := MessageSizeType:Control;
          out_msg.localCtoD := true;

          out_msg.Destination.addNetDest(TCC_dir_subtree);

          DPRINTF(RubySlicc, "%s\n", (out_msg));
        }
      }
    }
  }

  // Data-less invalidate of all TCP/SQC holders (no requestor exclusion).
  action(ipc_probeInvCore, "ipc", desc="probe inv cores, no data") {
    TCC_dir_subtree.broadcast(MachineType:TCP);
    TCC_dir_subtree.broadcast(MachineType:SQC);

    temp := cache_entry.Sharers;
    temp := temp.OR(cache_entry.Owner);
    TCC_dir_subtree := TCC_dir_subtree.AND(temp);
    tbe.NumPendingAcks := TCC_dir_subtree.count();
    if(TCC_dir_subtree.count() > 0) {

      enqueue(probeToCore_out, TDProbeRequestMsg, response_latency) {
        out_msg.addr := address;
        out_msg.Type := ProbeRequestType:PrbInv;
        out_msg.ReturnData := false;
        out_msg.MessageSize := MessageSizeType:Control;

        out_msg.Destination.addNetDest(TCC_dir_subtree);
        if(cache_entry.CacheState == State:M) {
          assert(tbe.NumPendingAcks == 1);
        }

        DPRINTF(RubySlicc, "%s\n", (out_msg));
      }
    }
  }

  // Data-less invalidate of the TCC (L2) if it holds the block.
  action(i2_probeInvL2, "i2", desc="probe inv L2, no data") {
    MachineID tcc := mapAddressToRange(address,MachineType:TCC,
                                       TCC_select_low_bit, TCC_select_num_bits);
    if ((cache_entry.Sharers.isElement(tcc)) || (cache_entry.Owner.isElement(tcc))) {
      enqueue(w_probeTCC_out, TDProbeRequestMsg, 1) {
        tbe.NumPendingAcks := tbe.NumPendingAcks + 1;
        out_msg.addr := address;
        out_msg.Type := ProbeRequestType:PrbInv;
        out_msg.ReturnData := false;
        out_msg.MessageSize := MessageSizeType:Control;
        out_msg.Destination.add(tcc);
        DPRINTF(RubySlicc, "%s\n", out_msg);

      }
    }
  }
+
  // Ack an invalidating probe to the NB with no data (Ntsl: not-present).
  action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") {
    enqueue(responseToNB_out, ResponseMsg, issue_latency) {
      out_msg.addr := address;
      out_msg.Type := CoherenceResponseType:CPUPrbResp; // TCC, L3 respond in same way to probes
      out_msg.Sender := machineID;
      out_msg.Destination.add(map_Address_to_Directory(address));
      out_msg.Dirty := false;
      out_msg.Hit := false;
      out_msg.Ntsl := true;
      out_msg.State := CoherenceState:NA;
      out_msg.MessageSize := MessageSizeType:Response_Control;
    }
  }

  // Same fields as pi (only field order differs); used from M/S transitions.
  action(pim_sendProbeResponseInvMs, "pim", desc="send probe ack inv, no data") {
    enqueue(responseToNB_out, ResponseMsg, issue_latency) {
      out_msg.addr := address;
      out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and TCC respond in same way to probes
      out_msg.Sender := machineID;
      out_msg.Destination.add(map_Address_to_Directory(address));
      out_msg.Dirty := false;
      out_msg.Ntsl := true;
      out_msg.Hit := false;
      out_msg.State := CoherenceState:NA;
      out_msg.MessageSize := MessageSizeType:Response_Control;
    }
  }

  // Ack a PrbShrData probe as a miss: no data, not hit, not Ntsl.
  action(prm_sendProbeResponseMiss, "prm", desc="send probe ack PrbShrData, no data") {
    enqueue(responseToNB_out, ResponseMsg, issue_latency) {
      out_msg.addr := address;
      out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and TCC respond in same way to probes
      out_msg.Sender := machineID;
      out_msg.Destination.add(map_Address_to_Directory(address));
      out_msg.Dirty := false; // only true if sending back data i think
      out_msg.Hit := false;
      out_msg.Ntsl := false;
      out_msg.State := CoherenceState:NA;
      out_msg.MessageSize := MessageSizeType:Response_Control;
    }
  }
+
+
+
  // Ack a probe to the NB with the block's data; Dirty comes from the TBE
  // when one exists.
  action(pd_sendProbeResponseData, "pd", desc="send probe ack, with data") {
    enqueue(responseToNB_out, ResponseMsg, issue_latency) {
      assert(is_valid(cache_entry) || is_valid(tbe));
      out_msg.addr := address;
      out_msg.Type := CoherenceResponseType:CPUPrbResp;
      out_msg.Sender := machineID;
      out_msg.Destination.add(map_Address_to_Directory(address));
      out_msg.DataBlk := getDataBlock(address);
      if (is_valid(tbe)) {
        out_msg.Dirty := tbe.Dirty;
      }
      out_msg.Hit := true;
      out_msg.State := CoherenceState:NA;
      out_msg.MessageSize := MessageSizeType:Response_Data;
    }
  }

  // Like pd, but additionally requires a valid cache entry.
  // NOTE(review): the first assert is subsumed by the second.
  action(pdm_sendProbeResponseDataMs, "pdm", desc="send probe ack, with data") {
    enqueue(responseToNB_out, ResponseMsg, issue_latency) {
      assert(is_valid(cache_entry) || is_valid(tbe));
      assert(is_valid(cache_entry));
      out_msg.addr := address;
      out_msg.Type := CoherenceResponseType:CPUPrbResp;
      out_msg.Sender := machineID;
      out_msg.Destination.add(map_Address_to_Directory(address));
      out_msg.DataBlk := getDataBlock(address);
      if (is_valid(tbe)) {
        out_msg.Dirty := tbe.Dirty;
      }
      out_msg.Hit := true;
      out_msg.State := CoherenceState:NA;
      out_msg.MessageSize := MessageSizeType:Response_Data;
    }
  }
+
  // Tell the NB directory to cancel an in-flight writeback for this address.
  action(mc_cancelWB, "mc", desc="send writeback cancel to NB directory") {
    enqueue(requestToNB_out, CPURequestMsg, issue_latency) {
      out_msg.addr := address;
      out_msg.Type := CoherenceRequestType:WrCancel;
      out_msg.Destination.add(map_Address_to_Directory(address));
      out_msg.Requestor := machineID;
      out_msg.MessageSize := MessageSizeType:Request_Control;
    }
  }
+
  // Deliver the Shared-state data response to every merged (coalesced)
  // requestor at once, using the data staged in the TBE.
  action(sCS_sendCollectiveResponseS, "sCS", desc="send shared response to all merged TCP/SQC") {
    enqueue(responseToCore_out, ResponseMsg, 1) {
      out_msg.addr := address;
      out_msg.Type := CoherenceResponseType:TDSysResp;
      out_msg.Sender := tbe.Sender;
      out_msg.DataBlk := tbe.DataBlk;
      out_msg.MessageSize := MessageSizeType:Response_Data;
      out_msg.CtoD := false;
      out_msg.State := CoherenceState:Shared;
      out_msg.Destination.addNetDest(cache_entry.MergedSharers);
      out_msg.Shared := tbe.Shared;
      out_msg.Dirty := tbe.Dirty;
      DPRINTF(RubySlicc, "%s\n", out_msg);
    }
  }

  // Deliver the Shared-state data response to the single original requestor.
  action(sS_sendResponseS, "sS", desc="send shared response to TCP/SQC") {
    enqueue(responseToCore_out, ResponseMsg, 1) {
      out_msg.addr := address;
      out_msg.Type := CoherenceResponseType:TDSysResp;
      out_msg.Sender := tbe.Sender;
      out_msg.DataBlk := tbe.DataBlk;
      out_msg.MessageSize := MessageSizeType:Response_Data;
      out_msg.CtoD := false;
      out_msg.State := CoherenceState:Shared;
      out_msg.Destination.add(tbe.OriginalRequestor);
      out_msg.Shared := tbe.Shared;
      out_msg.Dirty := tbe.Dirty;
      DPRINTF(RubySlicc, "%s\n", out_msg);
    }
  }

  // Deliver the Modified-state data response to the original requestor.
  action(sM_sendResponseM, "sM", desc="send response to TCP/SQC") {
    enqueue(responseToCore_out, ResponseMsg, 1) {
      out_msg.addr := address;
      out_msg.Type := CoherenceResponseType:TDSysResp;
      out_msg.Sender := tbe.Sender;
      out_msg.DataBlk := tbe.DataBlk;
      out_msg.MessageSize := MessageSizeType:Response_Data;
      out_msg.CtoD := false;
      out_msg.State := CoherenceState:Modified;
      out_msg.Destination.add(tbe.OriginalRequestor);
      out_msg.Shared := tbe.Shared;
      out_msg.Dirty := tbe.Dirty;
      DPRINTF(RubySlicc, "%s\n", out_msg);
    }
  }
+
+
+
  // Forward a writeback ack from the NB down to the TCC, unless this
  // directory itself originated the writeback (self-initiated recall).
  action(fw2_forwardWBAck, "fw2", desc="forward WBAck to TCC") {
    peek(responseFromNB_in, ResponseMsg) {
      if(tbe.OriginalRequestor != machineID) {
        enqueue(w_respTCC_out, ResponseMsg, 1) {
          out_msg.addr := address;
          out_msg.Type := CoherenceResponseType:TDSysWBAck;
          out_msg.Sender := machineID;
          //out_msg.DataBlk := tbe.DataBlk;
          out_msg.Destination.add(tbe.OriginalRequestor);
          out_msg.MessageSize := in_msg.MessageSize;
        }
      }
    }
  }
+
+ action(sa_saveSysAck, "sa", desc="Save SysAck ") {
+ peek(responseFromNB_in, ResponseMsg) {
+ tbe.Dirty := in_msg.Dirty;
+ if (tbe.Dirty == false) {
+ tbe.DataBlk := in_msg.DataBlk;
+ }
+ else {
+ tbe.DataBlk := tbe.DataBlk;
+ }
+ tbe.CtoD := in_msg.CtoD;
+ tbe.CohState := in_msg.State;
+ tbe.Shared := in_msg.Shared;
+ tbe.MessageSize := in_msg.MessageSize;
+ }
+ }
+
+ action(fsa_forwardSavedAck, "fsa", desc="forward saved SysAck to TCP or SQC") {
+ enqueue(responseToCore_out, ResponseMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:TDSysResp;
+ out_msg.Sender := machineID;
+ if (tbe.Dirty == false) {
+ out_msg.DataBlk := tbe.DataBlk;
+ }
+ else {
+ out_msg.DataBlk := tbe.DataBlk;
+ }
+ out_msg.CtoD := tbe.CtoD;
+ out_msg.State := tbe.CohState;
+ out_msg.Destination.add(tbe.OriginalRequestor);
+ out_msg.Shared := tbe.Shared;
+ out_msg.MessageSize := tbe.MessageSize;
+ out_msg.Dirty := tbe.Dirty;
+ out_msg.Sender := tbe.Sender;
+ }
+ }
+
  // Forward a live NB system ack straight to the original requestor. If the
  // TBE holds no dirty data, the NB's data is used (and this directory is
  // recorded as Sender); otherwise the dirty TBE copy wins.
  action(fa_forwardSysAck, "fa", desc="forward SysAck to TCP or SQC") {
    peek(responseFromNB_in, ResponseMsg) {
      enqueue(responseToCore_out, ResponseMsg, 1) {
        out_msg.addr := address;
        out_msg.Type := CoherenceResponseType:TDSysResp;
        out_msg.Sender := machineID;
        if (tbe.Dirty == false) {
          out_msg.DataBlk := in_msg.DataBlk;
          tbe.Sender := machineID;
        }
        else {
          out_msg.DataBlk := tbe.DataBlk;
        }
        out_msg.CtoD := in_msg.CtoD;
        out_msg.State := in_msg.State;
        out_msg.Destination.add(tbe.OriginalRequestor);
        out_msg.Shared := in_msg.Shared;
        out_msg.MessageSize := in_msg.MessageSize;
        out_msg.Dirty := in_msg.Dirty;
        // Final Sender assignment overrides the machineID default above.
        out_msg.Sender := tbe.Sender;
        DPRINTF(RubySlicc, "%s\n", (out_msg.DataBlk));
      }
    }
  }
+
  // Downgrade-probe the current owner for data: via the TCC wire if the TCC
  // owns the block, otherwise via the core probe network. Exactly one probe
  // is sent, so exactly one ack is pending.
  action(pso_probeSharedDataOwner, "pso", desc="probe shared data at owner") {
    MachineID tcc := mapAddressToRange(address,MachineType:TCC,
                                       TCC_select_low_bit, TCC_select_num_bits);
    if (cache_entry.Owner.isElement(tcc)) {
      enqueue(w_probeTCC_out, TDProbeRequestMsg, 1) {
        out_msg.addr := address;
        out_msg.Type := ProbeRequestType:PrbDowngrade;
        out_msg.ReturnData := true;
        out_msg.MessageSize := MessageSizeType:Control;
        out_msg.Destination.add(tcc);
        DPRINTF(RubySlicc, "%s\n", out_msg);
      }
    }
    else { // i.e., owner is a core
      enqueue(probeToCore_out, TDProbeRequestMsg, response_latency) {
        out_msg.addr := address;
        out_msg.Type := ProbeRequestType:PrbDowngrade;
        out_msg.ReturnData := true;
        out_msg.MessageSize := MessageSizeType:Control;
        out_msg.Destination.addNetDest(cache_entry.Owner);
        DPRINTF(RubySlicc, "%s\n", out_msg);
      }
    }
    tbe.NumPendingAcks := 1;
  }
+
  // Queue-pop actions: each dequeues one message from its input port after
  // the triggering transition has consumed it.

  action(i_popIncomingRequestQueue, "i", desc="Pop incoming request queue") {
    coreRequestNetwork_in.dequeue(clockEdge());
  }

  action(j_popIncomingUnblockQueue, "j", desc="Pop incoming unblock queue") {
    unblockNetwork_in.dequeue(clockEdge());
  }

  action(pk_popResponseQueue, "pk", desc="Pop response queue") {
    responseNetwork_in.dequeue(clockEdge());
  }

  action(pp_popProbeQueue, "pp", desc="Pop incoming probe queue") {
    probeNetwork_in.dequeue(clockEdge());
  }

  action(pR_popResponseFromNBQueue, "pR", desc="Pop incoming Response queue From NB") {
    responseFromNB_in.dequeue(clockEdge());
  }

  action(pt_popTriggerQueue, "pt", desc="pop trigger queue") {
    triggerQueue_in.dequeue(clockEdge());
  }

  action(pl_popTCCRequestQueue, "pl", desc="pop TCC request queue") {
    w_TCCRequest_in.dequeue(clockEdge());
  }

  action(plr_popTCCResponseQueue, "plr", desc="pop TCC response queue") {
    w_TCCResponse_in.dequeue(clockEdge());
  }

  action(plu_popTCCUnblockQueue, "plu", desc="pop TCC unblock queue") {
    w_TCCUnblock_in.dequeue(clockEdge());
  }
+
+
  // Move an unblocking core from the merged-request set into the sharer
  // list and retire one outstanding unblock.
  action(m_addUnlockerToSharers, "m", desc="Add the unlocker to the sharer list") {
    peek(unblockNetwork_in, UnblockMsg) {
      cache_entry.Sharers.add(in_msg.Sender);
      cache_entry.MergedSharers.remove(in_msg.Sender);
      assert(cache_entry.WaitingUnblocks >= 0);
      cache_entry.WaitingUnblocks := cache_entry.WaitingUnblocks - 1;
    }
  }

  // Coalesce a duplicate request: remember the requestor and expect one
  // more unblock.
  action(q_addOutstandingMergedSharer, "q", desc="Increment outstanding requests") {
    peek(coreRequestNetwork_in, CPURequestMsg) {
      cache_entry.MergedSharers.add(in_msg.Requestor);
      cache_entry.WaitingUnblocks := cache_entry.WaitingUnblocks + 1;
    }
  }
+
  // Notify the NB directory that our state transition completed so it can
  // unblock the address.
  action(uu_sendUnblock, "uu", desc="state changed, unblock") {
    enqueue(unblockToNB_out, UnblockMsg, issue_latency) {
      out_msg.addr := address;
      out_msg.Destination.add(map_Address_to_Directory(address));
      out_msg.MessageSize := MessageSizeType:Unblock_Control;
      DPRINTF(RubySlicc, "%s\n", out_msg);
    }
  }
+
  // Recycle actions: re-queue the head message so the port is retried
  // after recycle_latency (used when a transition must stall).

  action(zz_recycleRequest, "\z", desc="Recycle the request queue") {
    coreRequestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
  }

  action(yy_recycleTCCRequestQueue, "yy", desc="recycle yy request queue") {
    w_TCCRequest_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
  }

  action(xz_recycleResponseQueue, "xz", desc="recycle response queue") {
    responseNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
  }

  action(xx_recycleTCCResponseQueue, "xx", desc="recycle TCC response queue") {
    w_TCCResponse_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
  }

  action(vv_recycleTCCUnblockQueue, "vv", desc="Recycle the probe request queue") {
    w_TCCUnblock_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
  }
+
+ action(xy_recycleUnblockQueue, "xy", desc="Recycle the probe request queue") {
+ w_TCCUnblock_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+ }
+
  // Re-queue the head NB probe so it is retried after recycle_latency.
  action(ww_recycleProbeRequest, "ww", desc="Recycle the probe request queue") {
    probeNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
  }
+
  // Retire one outstanding probe ack.
  action(x_decrementAcks, "x", desc="decrement Acks pending") {
    tbe.NumPendingAcks := tbe.NumPendingAcks - 1;
  }

  // When all probe acks have arrived, self-enqueue an AcksComplete trigger
  // to drive the completion transition.
  action(o_checkForAckCompletion, "o", desc="check for ack completion") {
    if (tbe.NumPendingAcks == 0) {
      enqueue(triggerQueue_out, TriggerMsg, 1) {
        out_msg.addr := address;
        out_msg.Type := TriggerType:AcksComplete;
      }
    }
    APPEND_TRANSITION_COMMENT(" tbe acks ");
    APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
  }
+
  // TBE allocation variants. Each checks resource availability, allocates
  // and binds the TBE, then initializes the fields the originating
  // transaction type needs.

  // For NB-probe-initiated (upward) transactions; no data or requestor yet.
  action(tp_allocateTBE, "tp", desc="allocate TBE Entry for upward transactions") {
    check_allocate(TBEs);
    peek(probeNetwork_in, NBProbeRequestMsg) {
      TBEs.allocate(address);
      set_tbe(TBEs.lookup(address));
      tbe.Dirty := false;
      tbe.NumPendingAcks := 0;
      tbe.UntransferredOwnerExists := false;
    }
  }

  // For TCC-initiated transactions; stages the TCC's writeback data.
  action(tv_allocateTBE, "tv", desc="allocate TBE Entry for TCC transactions") {
    check_allocate(TBEs);
    peek(w_TCCRequest_in, CPURequestMsg) {
      TBEs.allocate(address);
      set_tbe(TBEs.lookup(address));
      tbe.DataBlk := in_msg.DataBlk; // Data only for WBs
      tbe.Dirty := false;
      tbe.OriginalRequestor := in_msg.Requestor;
      tbe.NumPendingAcks := 0;
      tbe.UntransferredOwnerExists := false;
    }
  }

  // For core-initiated transactions; Sender defaults to this directory
  // until a probe response supplies a better one (see ccr).
  action(t_allocateTBE, "t", desc="allocate TBE Entry") {
    check_allocate(TBEs);//check whether resources are full
    peek(coreRequestNetwork_in, CPURequestMsg) {
      TBEs.allocate(address);
      set_tbe(TBEs.lookup(address));
      tbe.DataBlk := cache_entry.DataBlk; // Data only for WBs
      tbe.Dirty := false;
      tbe.Upgrade := false;
      tbe.OriginalRequestor := in_msg.Requestor;
      tbe.NumPendingAcks := 0;
      tbe.UntransferredOwnerExists := false;
      tbe.Sender := machineID;
    }
  }

  // For self-initiated recalls (directory-entry eviction).
  action(tr_allocateTBE, "tr", desc="allocate TBE Entry for recall") {
    check_allocate(TBEs);//check whether resources are full
    TBEs.allocate(address);
    set_tbe(TBEs.lookup(address));
    tbe.DataBlk := cache_entry.DataBlk; // Data only for WBs
    tbe.Dirty := false;
    tbe.Upgrade := false;
    tbe.OriginalRequestor := machineID; //Recall request, Self initiated
    tbe.NumPendingAcks := 0;
    tbe.UntransferredOwnerExists := false;
  }

  // Release the TBE and unbind it from the transition.
  action(dt_deallocateTBE, "dt", desc="Deallocate TBE entry") {
    TBEs.deallocate(address);
    unset_tbe();
  }
+
+
  // Allocate a directory-cache entry for this address if none exists yet.
  action(d_allocateDir, "d", desc="allocate Directory Cache") {
    if (is_invalid(cache_entry)) {
      set_cache_entry(directory.allocate(address, new Entry));
    }
  }

  // Free the directory-cache entry (if any) and unbind it.
  action(dd_deallocateDir, "dd", desc="deallocate Directory Cache") {
    if (is_valid(cache_entry)) {
      directory.deallocate(address);
    }
    unset_cache_entry();
  }
+
+ // Tell the NB directory that our pending writeback data was stale
+ // (invalidated by a racing probe) and there is nothing to write back.
+ action(ss_sendStaleNotification, "ss", desc="stale data; nothing to writeback") {
+ enqueue(responseToNB_out, ResponseMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:StaleNotif;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.Sender := machineID;
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ }
+ }
+
+ // Write the TBE's data block back to the NB directory. NbReqShared mirrors
+ // tbe.Shared (set by sf_setSharedFlip when a shared probe hit in flight).
+ action(wb_data, "wb", desc="write back data") {
+ enqueue(responseToNB_out, ResponseMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUData;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.DataBlk := tbe.DataBlk;
+ out_msg.Dirty := tbe.Dirty;
+ if (tbe.Shared) {
+ out_msg.NbReqShared := true;
+ } else {
+ out_msg.NbReqShared := false;
+ }
+ out_msg.State := CoherenceState:Shared; // faux info
+ out_msg.MessageSize := MessageSizeType:Writeback_Data;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+
+ // Record that an in-flight shared probe hit this block; the eventual
+ // writeback (wb_data) will report NbReqShared accordingly.
+ action(sf_setSharedFlip, "sf", desc="hit by shared probe, status may be different") {
+ assert(is_valid(tbe));
+ tbe.Shared := true;
+ }
+
+ // Merge a core probe response into the TBE. Data is only taken if the TBE
+ // does not already hold dirty data or the response itself is dirty, so
+ // clean responses never overwrite dirty collected data.
+ action(y_writeDataToTBE, "y", desc="write Probe Data to TBE") {
+ peek(responseNetwork_in, ResponseMsg) {
+ if (!tbe.Dirty || in_msg.Dirty) {
+ tbe.DataBlk := in_msg.DataBlk;
+ tbe.Dirty := in_msg.Dirty;
+ }
+ if (in_msg.Hit) {
+ tbe.Cached := true;
+ }
+ }
+ }
+
+ // Same merge policy as y_writeDataToTBE, but for probe responses arriving
+ // from the TCC (w_TCCResponse_in) rather than from cores.
+ action(ty_writeTCCDataToTBE, "ty", desc="write TCC Probe Data to TBE") {
+ peek(w_TCCResponse_in, ResponseMsg) {
+ if (!tbe.Dirty || in_msg.Dirty) {
+ tbe.DataBlk := in_msg.DataBlk;
+ tbe.Dirty := in_msg.Dirty;
+ }
+ if (in_msg.Hit) {
+ tbe.Cached := true;
+ }
+ }
+ }
+
+
+ // Touch the directory entry's replacement state (MRU) so a subsequent
+ // recall picks an easier (colder) victim.
+ action(ut_updateTag, "ut", desc="update Tag (i.e. set MRU)") {
+ directory.setMRU(address);
+ }
+
+ // TRANSITIONS
+
+ // Handling TCP/SQC requests (similar to how NB dir handles TCC events with some changes to account for stateful directory).
+
+
+ // transitions from base
+ // Miss in the directory: allocate dir entry + TBE and forward the read to
+ // the NB, then wait (I_ES / I_S / I_M) for the system ack.
+ transition(I, RdBlk, I_ES){TagArrayRead} {
+ d_allocateDir;
+ t_allocateTBE;
+ n_issueRdBlk;
+ i_popIncomingRequestQueue;
+ }
+
+ transition(I, RdBlkS, I_S){TagArrayRead} {
+ d_allocateDir;
+ t_allocateTBE;
+ nS_issueRdBlkS;
+ i_popIncomingRequestQueue;
+ }
+
+
+ // NB acks for outstanding fills: forward the system ack to the requesting
+ // core and wait in a BBB_* state for its CoreUnblock.
+ transition(I_S, NB_AckS, BBB_S) {
+ fa_forwardSysAck;
+ pR_popResponseFromNBQueue;
+ }
+
+ transition(I_ES, NB_AckS, BBB_S) {
+ fa_forwardSysAck;
+ pR_popResponseFromNBQueue;
+ }
+
+ transition(I_ES, NB_AckE, BBB_E) {
+ fa_forwardSysAck;
+ pR_popResponseFromNBQueue;
+ }
+
+ transition({S_M, O_M}, {NB_AckCtoD,NB_AckM}, BBB_M) {
+ fa_forwardSysAck;
+ pR_popResponseFromNBQueue;
+ }
+
+ transition(I_M, NB_AckM, BBB_M) {
+ fa_forwardSysAck;
+ pR_popResponseFromNBQueue;
+ }
+
+ // CoreUnblock completes the fill: record the unblocker as owner/sharer,
+ // unblock the NB, and retire the TBE.
+ transition(BBB_M, CoreUnblock, M){TagArrayWrite} {
+ c_clearOwner;
+ cc_clearSharers;
+ e_ownerIsUnblocker;
+ uu_sendUnblock;
+ dt_deallocateTBE;
+ j_popIncomingUnblockQueue;
+ }
+
+ transition(BBB_S, CoreUnblock, S){TagArrayWrite} {
+ as_addToSharers;
+ uu_sendUnblock;
+ dt_deallocateTBE;
+ j_popIncomingUnblockQueue;
+ }
+
+ transition(BBB_E, CoreUnblock, E){TagArrayWrite} {
+ as_addToSharers;
+ uu_sendUnblock;
+ dt_deallocateTBE;
+ j_popIncomingUnblockQueue;
+ }
+
+
+ transition(I, RdBlkM, I_M){TagArrayRead} {
+ d_allocateDir;
+ t_allocateTBE;
+ nM_issueRdBlkM;
+ i_popIncomingRequestQueue;
+ }
+
+ // Reads of a block held Shared: probe current sharers for data, merge any
+ // further concurrent readers into the same TBE (BBS_S window), then send a
+ // single collective response and collect unblocks in BB_S.
+ transition(S, {RdBlk, RdBlkS}, BBS_S){TagArrayRead} {
+ t_allocateTBE;
+ sc_probeShrCoreData;
+ s2_probeShrL2Data;
+ q_addOutstandingMergedSharer;
+ i_popIncomingRequestQueue;
+ }
+ // Merging of read sharing into a single request
+ transition(BBS_S, {RdBlk, RdBlkS}) {
+ q_addOutstandingMergedSharer;
+ i_popIncomingRequestQueue;
+ }
+ // Wait for probe acks to be complete
+ transition(BBS_S, CPUPrbResp) {
+ ccr_copyCoreResponseToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ pk_popResponseQueue;
+ }
+
+ transition(BBS_S, TCCPrbResp) {
+ ctr_copyTCCResponseToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ plr_popTCCResponseQueue;
+ }
+
+ // Window for merging complete with this transition
+ // Send responses to all outstanding
+ transition(BBS_S, ProbeAcksComplete, BB_S) {
+ sCS_sendCollectiveResponseS;
+ pt_popTriggerQueue;
+ }
+
+ transition(BB_S, CoreUnblock, BB_S) {
+ m_addUnlockerToSharers;
+ j_popIncomingUnblockQueue;
+ }
+
+ transition(BB_S, LastCoreUnblock, S) {
+ m_addUnlockerToSharers;
+ dt_deallocateTBE;
+ j_popIncomingUnblockQueue;
+ }
+
+ // Same merged-read flow for a block held Owned: probe the owner for data.
+ transition(O, {RdBlk, RdBlkS}, BBO_O){TagArrayRead} {
+ t_allocateTBE;
+ pso_probeSharedDataOwner;
+ q_addOutstandingMergedSharer;
+ i_popIncomingRequestQueue;
+ }
+ // Merging of read sharing into a single request
+ transition(BBO_O, {RdBlk, RdBlkS}) {
+ q_addOutstandingMergedSharer;
+ i_popIncomingRequestQueue;
+ }
+
+ // Wait for probe acks to be complete
+ transition(BBO_O, CPUPrbResp) {
+ ccr_copyCoreResponseToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ pk_popResponseQueue;
+ }
+
+ transition(BBO_O, TCCPrbResp) {
+ ctr_copyTCCResponseToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ plr_popTCCResponseQueue;
+ }
+
+ // Window for merging complete with this transition
+ // Send responses to all outstanding
+ transition(BBO_O, ProbeAcksComplete, BB_OO) {
+ sCS_sendCollectiveResponseS;
+ pt_popTriggerQueue;
+ }
+
+ transition(BB_OO, CoreUnblock) {
+ m_addUnlockerToSharers;
+ j_popIncomingUnblockQueue;
+ }
+
+ transition(BB_OO, LastCoreUnblock, O){TagArrayWrite} {
+ m_addUnlockerToSharers;
+ dt_deallocateTBE;
+ j_popIncomingUnblockQueue;
+ }
+
+ // Core writeback (CPUWrite): drop the writer from the sharer/owner lists,
+ // forward the WB to the TCC, and wait in BW_* for the TCC's unblock.
+ transition(S, CPUWrite, BW_S){TagArrayRead} {
+ t_allocateTBE;
+ rC_removeCoreFromSharers;
+ sT_sendRequestToTCC;
+ i_popIncomingRequestQueue;
+ }
+
+ transition(E, CPUWrite, BW_E){TagArrayRead} {
+ t_allocateTBE;
+ rC_removeCoreFromSharers;
+ sT_sendRequestToTCC;
+ i_popIncomingRequestQueue;
+ }
+
+ transition(O, CPUWrite, BW_O){TagArrayRead} {
+ t_allocateTBE;
+ rCo_removeCoreFromOwner;
+ rC_removeCoreFromSharers;
+ sT_sendRequestToTCC;
+ i_popIncomingRequestQueue;
+ }
+
+ transition(M, CPUWrite, BW_M){TagArrayRead} {
+ t_allocateTBE;
+ rCo_removeCoreFromOwner;
+ rC_removeCoreFromSharers;
+ sT_sendRequestToTCC;
+ i_popIncomingRequestQueue;
+ }
+
+ // TCC unblocks after absorbing the WB: TCCUnblock_Sharer / TCCUnblock mean
+ // the TCC kept a copy; TCCUnblock_NotValid means the WB data was stale.
+ transition(BW_S, TCCUnblock_Sharer, S){TagArrayWrite} {
+ aT_addTCCToSharers;
+ dt_deallocateTBE;
+ plu_popTCCUnblockQueue;
+ }
+
+ transition(BW_S, TCCUnblock_NotValid, S){TagArrayWrite} {
+ dt_deallocateTBE;
+ plu_popTCCUnblockQueue;
+ }
+
+ transition(BW_E, TCCUnblock, E){TagArrayWrite} {
+ cc_clearSharers;
+ aT_addTCCToSharers;
+ dt_deallocateTBE;
+ plu_popTCCUnblockQueue;
+ }
+
+ transition(BW_E, TCCUnblock_NotValid, E) {
+ dt_deallocateTBE;
+ plu_popTCCUnblockQueue;
+ }
+
+ transition(BW_M, TCCUnblock, M) {
+ c_clearOwner;
+ cc_clearSharers;
+ eT_ownerIsUnblocker;
+ dt_deallocateTBE;
+ plu_popTCCUnblockQueue;
+ }
+
+ transition(BW_M, TCCUnblock_NotValid, M) {
+ // Note this transition should only be executed if we received a stale wb
+ dt_deallocateTBE;
+ plu_popTCCUnblockQueue;
+ }
+
+ transition(BW_O, TCCUnblock, O) {
+ c_clearOwner;
+ eT_ownerIsUnblocker;
+ dt_deallocateTBE;
+ plu_popTCCUnblockQueue;
+ }
+
+ transition(BW_O, TCCUnblock_NotValid, O) {
+ // Note this transition should only be executed if we received a stale wb
+ dt_deallocateTBE;
+ plu_popTCCUnblockQueue;
+ }
+
+ // We lost the owner likely due to an invalidation racing with a 'O' wb
+ transition(BW_O, TCCUnblock_Sharer, S) {
+ c_clearOwner;
+ aT_addTCCToSharers;
+ dt_deallocateTBE;
+ plu_popTCCUnblockQueue;
+ }
+
+ // Probes cannot be serviced while a WB handoff to the TCC is in flight.
+ transition({BW_M, BW_S, BW_E, BW_O}, {PrbInv,PrbInvData,PrbShrData}) {
+ ww_recycleProbeRequest;
+ }
+
+ transition(BRWD_I, {PrbInvData, PrbInv, PrbShrData}) {
+ ww_recycleProbeRequest;
+ }
+
+ // Three step process: locally invalidate others, issue CtoD, wait for NB_AckCtoD
+ transition(S, CtoD, BBS_UM) {TagArrayRead} {
+ t_allocateTBE;
+ lpc_probeInvCore;
+ i2_probeInvL2;
+ o_checkForAckCompletion;
+ i_popIncomingRequestQueue;
+ }
+
+ transition(BBS_UM, CPUPrbResp, BBS_UM) {
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ pk_popResponseQueue;
+ }
+
+ transition(BBS_UM, TCCPrbResp) {
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ plr_popTCCResponseQueue;
+ }
+
+ // All local invalidations done: remember this was an upgrade and issue the
+ // RdBlkM to the NB (completed later via NB_AckCtoD/NB_AckM in S_M).
+ transition(BBS_UM, ProbeAcksComplete, S_M) {
+ rU_rememberUpgrade;
+ nM_issueRdBlkM;
+ pt_popTriggerQueue;
+ }
+
+ // Three step process: locally invalidate others, issue CtoD, wait for NB_AckCtoD
+ transition(O, CtoD, BBO_UM){TagArrayRead} {
+ t_allocateTBE;
+ lpc_probeInvCore;
+ i2_probeInvL2;
+ o_checkForAckCompletion;
+ i_popIncomingRequestQueue;
+ }
+
+ transition(BBO_UM, CPUPrbResp, BBO_UM) {
+ ruo_rememberUntransferredOwner;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ pk_popResponseQueue;
+ }
+
+ transition(BBO_UM, TCCPrbResp) {
+ ruoT_rememberUntransferredOwnerTCC;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ plr_popTCCResponseQueue;
+ }
+
+ transition(BBO_UM, ProbeAcksComplete, O_M) {
+ rU_rememberUpgrade;
+ nM_issueRdBlkM;
+ pt_popTriggerQueue;
+ }
+
+ // RdBlkM against S/E: invalidate all current holders for data, then issue
+ // the RdBlkM to the NB once probe acks complete.
+ // NOTE(review): this uses {TagArrayWrite} while the analogous O->BBO_M
+ // transition below uses {TagArrayRead}; looks inconsistent — confirm intent.
+ transition({S,E}, RdBlkM, BBS_M){TagArrayWrite} {
+ t_allocateTBE;
+ ldc_probeInvCoreData;
+ ld2_probeInvL2Data;
+ o_checkForAckCompletion;
+ i_popIncomingRequestQueue;
+ }
+
+ transition(BBS_M, CPUPrbResp) {
+ ccr_copyCoreResponseToTBE;
+ rR_removeResponderFromSharers;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ pk_popResponseQueue;
+ }
+
+ transition(BBS_M, TCCPrbResp) {
+ ctr_copyTCCResponseToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ plr_popTCCResponseQueue;
+ }
+
+ transition(BBS_M, ProbeAcksComplete, S_M) {
+ nM_issueRdBlkM;
+ pt_popTriggerQueue;
+ }
+
+ transition(O, RdBlkM, BBO_M){TagArrayRead} {
+ t_allocateTBE;
+ ldc_probeInvCoreData;
+ ld2_probeInvL2Data;
+ o_checkForAckCompletion;
+ i_popIncomingRequestQueue;
+ }
+
+ transition(BBO_M, CPUPrbResp) {
+ ccr_copyCoreResponseToTBE;
+ rR_removeResponderFromSharers;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ pk_popResponseQueue;
+ }
+
+ transition(BBO_M, TCCPrbResp) {
+ ctr_copyTCCResponseToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ plr_popTCCResponseQueue;
+ }
+
+ transition(BBO_M, ProbeAcksComplete, O_M) {
+ nM_issueRdBlkM;
+ pt_popTriggerQueue;
+ }
+
+ // RdBlkM against M: invalidate the current owner for data; the directory
+ // can respond locally (sM_sendResponseM) without going to the NB.
+ transition(M, RdBlkM, BBM_M){TagArrayRead} {
+ t_allocateTBE;
+ ldc_probeInvCoreData;
+ ld2_probeInvL2Data;
+ i_popIncomingRequestQueue;
+ }
+
+ transition(BBM_M, CPUPrbResp) {
+ ccr_copyCoreResponseToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ pk_popResponseQueue;
+ }
+
+ // TCP recalled block before receiving probe
+ transition({BBM_M, BBS_M, BBO_M}, {CPUWrite,NoCPUWrite}) {
+ zz_recycleRequest;
+ }
+
+ transition(BBM_M, TCCPrbResp) {
+ ctr_copyTCCResponseToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ plr_popTCCResponseQueue;
+ }
+
+ transition(BBM_M, ProbeAcksComplete, BB_M) {
+ sM_sendResponseM;
+ pt_popTriggerQueue;
+ }
+
+ transition(BB_M, CoreUnblock, M){TagArrayWrite} {
+ e_ownerIsUnblocker;
+ dt_deallocateTBE;
+ j_popIncomingUnblockQueue;
+ }
+
+ // Shared read against M/E: probe the holder for shared data; from E the
+ // exclusive sharer is first promoted to owner (eto_moveExSharerToOwner).
+ transition(M, {RdBlkS, RdBlk}, BBM_O){TagArrayRead} {
+ t_allocateTBE;
+ sc_probeShrCoreData;
+ s2_probeShrL2Data;
+ i_popIncomingRequestQueue;
+ }
+
+ transition(E, {RdBlkS, RdBlk}, BBM_O){TagArrayRead} {
+ t_allocateTBE;
+ eto_moveExSharerToOwner;
+ sc_probeShrCoreData;
+ s2_probeShrL2Data;
+ i_popIncomingRequestQueue;
+ }
+
+ transition(BBM_O, CPUPrbResp) {
+ ccr_copyCoreResponseToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ pk_popResponseQueue;
+ }
+ transition(BBM_O, TCCPrbResp) {
+ ctr_copyTCCResponseToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ plr_popTCCResponseQueue;
+ }
+ transition(BBM_O, ProbeAcksComplete, BB_O) {
+ sS_sendResponseS;
+ pt_popTriggerQueue;
+ }
+
+ transition(BB_O, CoreUnblock, O){TagArrayWrite} {
+ as_addToSharers;
+ dt_deallocateTBE;
+ j_popIncomingUnblockQueue;
+ }
+
+ // External probes cannot be serviced while local probes/unblocks are in
+ // flight for the same block; recycle them.
+ transition({BBO_O, BBM_M, BBS_S, BBM_O, BB_M, BB_O, BB_S, BBO_UM, BBS_UM, BBS_M, BBO_M, BB_OO}, {PrbInvData, PrbInv,PrbShrData}) {
+ ww_recycleProbeRequest;
+ }
+
+ transition({BBM_O, BBS_S, CP_S, CP_O, CP_SM, CP_OM, BBO_O}, {CPUWrite,NoCPUWrite}) {
+ zz_recycleRequest;
+ }
+
+ // stale CtoD raced with external invalidation
+ transition({I, CP_I, B_I, CP_IOM, CP_ISM, CP_OSIW, BRWD_I, BRW_I, BRD_I}, CtoD) {
+ i_popIncomingRequestQueue;
+ }
+
+ // stale CtoD raced with internal RdBlkM
+ transition({BBM_M, BBS_M, BBO_M, BBB_M, BBS_UM, BBO_UM}, CtoD) {
+ i_popIncomingRequestQueue;
+ }
+
+ transition({E, M}, CtoD) {
+ i_popIncomingRequestQueue;
+ }
+
+
+ // TCC-directory has sent out (And potentially received acks for) probes.
+ // TCP/SQC replacement (known to be stale subsequent) are popped off.
+ transition({BBO_UM, BBS_UM}, {CPUWrite,NoCPUWrite}) {
+ nC_sendNullWBAckToCore;
+ i_popIncomingRequestQueue;
+ }
+
+ transition(S_M, {NoCPUWrite, CPUWrite}) {
+ zz_recycleRequest;
+ }
+
+ transition(O_M, {NoCPUWrite, CPUWrite}) {
+ zz_recycleRequest;
+ }
+
+ // TCC victims arriving while a local invalidation sweep is in flight are
+ // stale: ack with a null WB ack and drop.
+ transition({BBM_M, BBS_M, BBO_M, BBO_UM, BBS_UM}, {VicDirty, VicClean, VicDirtyLast, NoVic}) {
+ nT_sendNullWBAckToTCC;
+ pl_popTCCRequestQueue;
+ }
+
+ transition({CP_S, CP_O, CP_OM, CP_SM}, {VicDirty, VicClean, VicDirtyLast, CancelWB, NoVic}) {
+ yy_recycleTCCRequestQueue;
+ }
+
+ // However, when TCCdir has sent out PrbSharedData, one cannot ignore.
+ transition({BBS_S, BBO_O, BBM_O, S_M, O_M, BBB_M, BBB_S, BBB_E}, {VicDirty, VicClean, VicDirtyLast,CancelWB}) {
+ yy_recycleTCCRequestQueue;
+ }
+
+ transition({BW_S,BW_E,BW_O, BW_M}, {VicDirty, VicClean, VicDirtyLast, NoVic}) {
+ yy_recycleTCCRequestQueue;
+ }
+
+ transition({BW_S,BW_E,BW_O, BW_M}, CancelWB) {
+ nT_sendNullWBAckToTCC;
+ pl_popTCCRequestQueue;
+ }
+
+
+ /// recycle if waiting for unblocks.
+ transition({BB_M,BB_O,BB_S,BB_OO}, {VicDirty, VicClean, VicDirtyLast,NoVic,CancelWB}) {
+ yy_recycleTCCRequestQueue;
+ }
+
+ transition({BBS_S, BBO_O}, NoVic) {
+ rT_removeTCCFromSharers;
+ nT_sendNullWBAckToTCC;
+ pl_popTCCRequestQueue;
+ }
+
+ // stale. Pop message and send dummy ack.
+ transition({I_S, I_ES, I_M}, {VicDirty, VicClean, VicDirtyLast, NoVic}) {
+ nT_sendNullWBAckToTCC;
+ pl_popTCCRequestQueue;
+ }
+
+ // TCC victims from stable states: allocate a WB TBE (tv) and forward the
+ // victim (dirty: vd_victim, clean: vc_victim) to the NB; complete on
+ // NB_AckWB in the VM_I/VO_I/VO_S/VES_I handlers below.
+ transition(M, VicDirtyLast, VM_I){TagArrayRead} {
+ tv_allocateTBE;
+ vd_victim;
+ pl_popTCCRequestQueue;
+ }
+
+ transition(E, VicDirty, VM_I){TagArrayRead} {
+ tv_allocateTBE;
+ vd_victim;
+ pl_popTCCRequestQueue;
+ }
+
+ transition(O, VicDirty, VO_S){TagArrayRead} {
+ tv_allocateTBE;
+ vd_victim;
+ pl_popTCCRequestQueue;
+ }
+
+ transition(O, {VicDirtyLast, VicClean}, VO_I){TagArrayRead} {
+ tv_allocateTBE;
+ vd_victim;
+ pl_popTCCRequestQueue;
+ }
+
+ transition({E, S}, VicClean, VES_I){TagArrayRead} {
+ tv_allocateTBE;
+ vc_victim;
+ pl_popTCCRequestQueue;
+ }
+
+ transition({O, S}, NoVic){TagArrayRead} {
+ rT_removeTCCFromSharers;
+ nT_sendNullWBAckToTCC;
+ pl_popTCCRequestQueue;
+ }
+
+ transition({O,S}, NoCPUWrite){TagArrayRead} {
+ rC_removeCoreFromSharers;
+ nC_sendNullWBAckToCore;
+ i_popIncomingRequestQueue;
+ }
+
+ transition({M,E}, NoCPUWrite){TagArrayRead} {
+ rC_removeCoreFromSharers;
+ nC_sendNullWBAckToCore;
+ i_popIncomingRequestQueue;
+ }
+
+ // This can only happen if it is race. (TCCdir sent out probes which caused this cancel in the first place.)
+ transition({VM_I, VES_I, VO_I}, CancelWB) {
+ pl_popTCCRequestQueue;
+ }
+
+ // NB has acked the victim writeback: push the data down (wb_data), forward
+ // the ack to the TCC, and retire the TBE (and directory entry when the
+ // block leaves the directory entirely).
+ transition({VM_I, VES_I, VO_I}, NB_AckWB, I){TagArrayWrite} {
+ c_clearOwner;
+ cc_clearSharers;
+ wb_data;
+ fw2_forwardWBAck;
+ dt_deallocateTBE;
+ dd_deallocateDir;
+ pR_popResponseFromNBQueue;
+ }
+
+ transition(VO_S, NB_AckWB, S){TagArrayWrite} {
+ c_clearOwner;
+ wb_data;
+ fw2_forwardWBAck;
+ dt_deallocateTBE;
+ pR_popResponseFromNBQueue;
+ }
+
+ // I_C / I_W: the WB data became stale (racing probe); notify instead of
+ // writing back.
+ transition(I_C, NB_AckWB, I){TagArrayWrite} {
+ c_clearOwner;
+ cc_clearSharers;
+ ss_sendStaleNotification;
+ fw2_forwardWBAck;
+ dt_deallocateTBE;
+ dd_deallocateDir;
+ pR_popResponseFromNBQueue;
+ }
+
+ transition(I_W, NB_AckWB, I) {
+ ss_sendStaleNotification;
+ dt_deallocateTBE;
+ dd_deallocateDir;
+ pR_popResponseFromNBQueue;
+ }
+
+
+
+ // Do not handle replacements, reads of any kind or writebacks from transients; recycle
+ transition({I_M, I_ES, I_S, MO_I, ES_I, S_M, O_M, VES_I, VO_I, VO_S, VM_I, I_C, I_W}, {RdBlkS,RdBlkM,RdBlk,CtoD}) {
+ zz_recycleRequest;
+ }
+
+ transition( VO_S, NoCPUWrite) {
+ zz_recycleRequest;
+ }
+
+ transition({BW_M, BW_S, BW_O, BW_E}, {RdBlkS,RdBlkM,RdBlk,CtoD,NoCPUWrite, CPUWrite}) {
+ zz_recycleRequest;
+ }
+
+ transition({BBB_M, BBB_S, BBB_E, BB_O, BB_M, BB_S, BB_OO}, { RdBlk, RdBlkS, RdBlkM, CPUWrite, NoCPUWrite}) {
+ zz_recycleRequest;
+ }
+
+ transition({BBB_S, BBB_E, BB_O, BB_S, BB_OO}, { CtoD}) {
+ zz_recycleRequest;
+ }
+
+ transition({BBS_UM, BBO_UM, BBM_M, BBM_O, BBS_M, BBO_M}, { RdBlk, RdBlkS, RdBlkM}) {
+ zz_recycleRequest;
+ }
+
+ transition(BBM_O, CtoD) {
+ zz_recycleRequest;
+ }
+
+ transition({BBS_S, BBO_O}, {RdBlkM, CtoD}) {
+ zz_recycleRequest;
+ }
+
+ transition({B_I, CP_I, CP_S, CP_O, CP_OM, CP_SM, CP_IOM, CP_ISM, CP_OSIW, BRWD_I, BRW_I, BRD_I}, {RdBlk, RdBlkS, RdBlkM}) {
+ zz_recycleRequest;
+ }
+
+ transition({CP_O, CP_S, CP_OM}, CtoD) {
+ zz_recycleRequest;
+ }
+
+ // Ignore replacement related messages after probe got in.
+ transition({CP_I, B_I, CP_IOM, CP_ISM, CP_OSIW, BRWD_I, BRW_I, BRD_I}, {CPUWrite, NoCPUWrite}) {
+ zz_recycleRequest;
+ }
+
+ // Ignore replacement related messages after probes processed
+ transition({I, I_S, I_ES, I_M, I_C, I_W}, {CPUWrite,NoCPUWrite}) {
+ nC_sendNullWBAckToCore;
+ i_popIncomingRequestQueue;
+ }
+ // cannot ignore cancel... otherwise TCP/SQC will be stuck in I_C
+ transition({I, I_S, I_ES, I_M, I_C, I_W, S_M, M, O, E, S}, CPUWriteCancel){TagArrayRead} {
+ nC_sendNullWBAckToCore;
+ i_popIncomingRequestQueue;
+ }
+
+ transition({CP_I, B_I, CP_IOM, CP_ISM, BRWD_I, BRW_I, BRD_I}, {NoVic, VicClean, VicDirty, VicDirtyLast}){
+ nT_sendNullWBAckToTCC;
+ pl_popTCCRequestQueue;
+ }
+
+ // Handling Probes from NB (General process: (1) propagate up, go to blocking state (2) process acks (3) on last ack downward.)
+
+ // step 1
+ transition({M, O, E, S}, PrbInvData, CP_I){TagArrayRead} {
+ tp_allocateTBE;
+ dc_probeInvCoreData;
+ d2_probeInvL2Data;
+ pp_popProbeQueue;
+ }
+ // step 2a
+ transition(CP_I, CPUPrbResp) {
+ y_writeDataToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ pk_popResponseQueue;
+ }
+ // step 2b
+ transition(CP_I, TCCPrbResp) {
+ ty_writeTCCDataToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ plr_popTCCResponseQueue;
+ }
+ // step 3
+ transition(CP_I, ProbeAcksComplete, I){TagArrayWrite} {
+ pd_sendProbeResponseData;
+ c_clearOwner;
+ cc_clearSharers;
+ dt_deallocateTBE;
+ dd_deallocateDir;
+ pt_popTriggerQueue;
+ }
+
+ // step 1
+ // PrbInv (no data requested): same three-step flow without data collection.
+ transition({M, O, E, S}, PrbInv, B_I){TagArrayWrite} {
+ tp_allocateTBE;
+ ipc_probeInvCore;
+ i2_probeInvL2;
+ pp_popProbeQueue;
+ }
+ // step 2
+ transition(B_I, CPUPrbResp) {
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ pk_popResponseQueue;
+ }
+ // step 2b
+ transition(B_I, TCCPrbResp) {
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ plr_popTCCResponseQueue;
+ }
+ // step 3
+ transition(B_I, ProbeAcksComplete, I){TagArrayWrite} {
+ // send response down to NB
+ pi_sendProbeResponseInv;
+ c_clearOwner;
+ cc_clearSharers;
+ dt_deallocateTBE;
+ dd_deallocateDir;
+ pt_popTriggerQueue;
+ }
+
+
+ // step 1
+ transition({M, O}, PrbShrData, CP_O){TagArrayRead} {
+ tp_allocateTBE;
+ sc_probeShrCoreData;
+ s2_probeShrL2Data;
+ pp_popProbeQueue;
+ }
+
+ transition(E, PrbShrData, CP_O){TagArrayRead} {
+ tp_allocateTBE;
+ eto_moveExSharerToOwner;
+ sc_probeShrCoreData;
+ s2_probeShrL2Data;
+ pp_popProbeQueue;
+ }
+ // step 2
+ transition(CP_O, CPUPrbResp) {
+ y_writeDataToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ pk_popResponseQueue;
+ }
+ // step 2b
+ transition(CP_O, TCCPrbResp) {
+ ty_writeTCCDataToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ plr_popTCCResponseQueue;
+ }
+ // step 3
+ transition(CP_O, ProbeAcksComplete, O){TagArrayWrite} {
+ // send response down to NB
+ pd_sendProbeResponseData;
+ dt_deallocateTBE;
+ pt_popTriggerQueue;
+ }
+
+ //step 1
+ transition(S, PrbShrData, CP_S) {
+ tp_allocateTBE;
+ sc_probeShrCoreData;
+ s2_probeShrL2Data;
+ pp_popProbeQueue;
+ }
+ // step 2
+ transition(CP_S, CPUPrbResp) {
+ y_writeDataToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ pk_popResponseQueue;
+ }
+ // step 2b
+ transition(CP_S, TCCPrbResp) {
+ ty_writeTCCDataToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ plr_popTCCResponseQueue;
+ }
+ // step 3
+ transition(CP_S, ProbeAcksComplete, S) {
+ // send response down to NB
+ pd_sendProbeResponseData;
+ dt_deallocateTBE;
+ pt_popTriggerQueue;
+ }
+
+ // Invalidating probes that race with an in-flight upgrade (O_M / S_M):
+ // the upgrade's TBE already exists, so no tp_allocateTBE here.
+ // step 1
+ transition(O_M, PrbInvData, CP_IOM) {
+ dc_probeInvCoreData;
+ d2_probeInvL2Data;
+ pp_popProbeQueue;
+ }
+ // step 2a
+ transition(CP_IOM, CPUPrbResp) {
+ y_writeDataToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ pk_popResponseQueue;
+ }
+ // step 2b
+ transition(CP_IOM, TCCPrbResp) {
+ ty_writeTCCDataToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ plr_popTCCResponseQueue;
+ }
+ // step 3
+ transition(CP_IOM, ProbeAcksComplete, I_M) {
+ pdm_sendProbeResponseDataMs;
+ c_clearOwner;
+ cc_clearSharers;
+ cd_clearDirtyBitTBE;
+ pt_popTriggerQueue;
+ }
+
+ transition(CP_IOM, ProbeAcksCompleteReissue, I){TagArrayWrite} {
+ pdm_sendProbeResponseDataMs;
+ c_clearOwner;
+ cc_clearSharers;
+ dt_deallocateTBE;
+ dd_deallocateDir;
+ pt_popTriggerQueue;
+ }
+
+ // step 1
+ transition(S_M, PrbInvData, CP_ISM) {
+ dc_probeInvCoreData;
+ d2_probeInvL2Data;
+ o_checkForAckCompletion;
+ pp_popProbeQueue;
+ }
+ // step 2a
+ transition(CP_ISM, CPUPrbResp) {
+ y_writeDataToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ pk_popResponseQueue;
+ }
+ // step 2b
+ transition(CP_ISM, TCCPrbResp) {
+ ty_writeTCCDataToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ plr_popTCCResponseQueue;
+ }
+ // step 3
+ transition(CP_ISM, ProbeAcksComplete, I_M) {
+ pdm_sendProbeResponseDataMs;
+ c_clearOwner;
+ cc_clearSharers;
+ cd_clearDirtyBitTBE;
+
+ //dt_deallocateTBE;
+ pt_popTriggerQueue;
+ }
+ transition(CP_ISM, ProbeAcksCompleteReissue, I){TagArrayWrite} {
+ pim_sendProbeResponseInvMs;
+ c_clearOwner;
+ cc_clearSharers;
+ dt_deallocateTBE;
+ dd_deallocateDir;
+ pt_popTriggerQueue;
+ }
+
+ // step 1
+ transition({S_M, O_M}, {PrbInv}, CP_ISM) {
+ dc_probeInvCoreData;
+ d2_probeInvL2Data;
+ pp_popProbeQueue;
+ }
+ // next steps inherited from BS_ISM
+
+ // Simpler cases
+
+ transition({I_C, I_W}, {PrbInvData, PrbInv, PrbShrData}) {
+ pi_sendProbeResponseInv;
+ pp_popProbeQueue;
+ }
+
+ //If the directory is certain that the block is not present, one can send an acknowledgement right away.
+ // No need for three step process.
+ transition(I, {PrbInv,PrbShrData,PrbInvData}){TagArrayRead} {
+ pi_sendProbeResponseInv;
+ pp_popProbeQueue;
+ }
+
+ transition({I_M, I_ES, I_S}, {PrbInv, PrbInvData}) {
+ pi_sendProbeResponseInv;
+ pp_popProbeQueue;
+ }
+
+ transition({I_M, I_ES, I_S}, PrbShrData) {
+ prm_sendProbeResponseMiss;
+ pp_popProbeQueue;
+ }
+
+ // Shared probes racing with an in-flight upgrade (S_M / O_M): collect and
+ // return shared data, then resume waiting for the NB ack in the same state.
+ //step 1
+ transition(S_M, PrbShrData, CP_SM) {
+ sc_probeShrCoreData;
+ s2_probeShrL2Data;
+ o_checkForAckCompletion;
+ pp_popProbeQueue;
+ }
+ // step 2
+ transition(CP_SM, CPUPrbResp) {
+ y_writeDataToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ pk_popResponseQueue;
+ }
+ // step 2b
+ transition(CP_SM, TCCPrbResp) {
+ ty_writeTCCDataToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ plr_popTCCResponseQueue;
+ }
+ // step 3
+ transition(CP_SM, {ProbeAcksComplete,ProbeAcksCompleteReissue}, S_M){DataArrayRead} {
+ // send response down to NB
+ pd_sendProbeResponseData;
+ pt_popTriggerQueue;
+ }
+
+ //step 1
+ transition(O_M, PrbShrData, CP_OM) {
+ sc_probeShrCoreData;
+ s2_probeShrL2Data;
+ pp_popProbeQueue;
+ }
+ // step 2
+ transition(CP_OM, CPUPrbResp) {
+ y_writeDataToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ pk_popResponseQueue;
+ }
+ // step 2b
+ transition(CP_OM, TCCPrbResp) {
+ ty_writeTCCDataToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ plr_popTCCResponseQueue;
+ }
+ // step 3
+ transition(CP_OM, {ProbeAcksComplete,ProbeAcksCompleteReissue}, O_M) {
+ // send response down to NB
+ pd_sendProbeResponseData;
+ pt_popTriggerQueue;
+ }
+
+ // Probes racing with an in-flight victim/recall writeback: answer from the
+ // TBE's data (or an inv ack) and fall into the stale-WB states I_C / I_W.
+ transition(BRW_I, PrbInvData, I_W) {
+ pd_sendProbeResponseData;
+ pp_popProbeQueue;
+ }
+
+ transition({VM_I,VO_I}, PrbInvData, I_C) {
+ pd_sendProbeResponseData;
+ pp_popProbeQueue;
+ }
+
+ transition(VES_I, {PrbInvData,PrbInv}, I_C) {
+ pi_sendProbeResponseInv;
+ pp_popProbeQueue;
+ }
+
+ transition({VM_I, VO_I, BRW_I}, PrbInv, I_W) {
+ pi_sendProbeResponseInv;
+ pp_popProbeQueue;
+ }
+
+ transition({VM_I, VO_I, VO_S, VES_I, BRW_I}, PrbShrData) {
+ pd_sendProbeResponseData;
+ sf_setSharedFlip;
+ pp_popProbeQueue;
+ }
+
+ transition(VO_S, PrbInvData, CP_OSIW) {
+ dc_probeInvCoreData;
+ d2_probeInvL2Data;
+ pp_popProbeQueue;
+ }
+
+ transition(CP_OSIW, TCCPrbResp) {
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ plr_popTCCResponseQueue;
+ }
+ transition(CP_OSIW, CPUPrbResp) {
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ pk_popResponseQueue;
+ }
+
+ transition(CP_OSIW, ProbeAcksComplete, I_C) {
+ pd_sendProbeResponseData;
+ cd_clearDirtyBitTBE;
+ pt_popTriggerQueue;
+ }
+
+ // Stale victim notifications are acked and dropped in every state.
+ transition({I, S, E, O, M, CP_O, CP_S, CP_OM, CP_SM, CP_OSIW, BW_S, BW_E, BW_O, BW_M, I_M, I_ES, I_S, BBS_S, BBO_O, BBM_M, BBM_O, BB_M, BB_O, BB_OO, BB_S, BBS_M, BBO_M, BBO_UM, BBS_UM, S_M, O_M, BBB_S, BBB_M, BBB_E, VES_I, VM_I, VO_I, VO_S, ES_I, MO_I, I_C, I_W}, StaleVic) {
+ nT_sendNullWBAckToTCC;
+ pl_popTCCRequestQueue;
+ }
+
+ transition({CP_I, B_I, CP_IOM, CP_ISM, BRWD_I, BRW_I, BRD_I}, StaleVic) {
+ nT_sendNullWBAckToTCC;
+ pl_popTCCRequestQueue;
+ }
+
+ // Recall Transitions
+ // transient states still require the directory state
+ // Recall: victimize the block to the NB and invalidate all holders; the
+ // writeback (BRW_I/BRD_I) and the probe sweep complete independently.
+ transition({M, O}, Recall, BRWD_I) {
+ tr_allocateTBE;
+ vd_victim;
+ dc_probeInvCoreData;
+ d2_probeInvL2Data;
+ }
+
+ transition({E, S}, Recall, BRWD_I) {
+ tr_allocateTBE;
+ vc_victim;
+ dc_probeInvCoreData;
+ d2_probeInvL2Data;
+ }
+
+ transition(I, Recall) {
+ dd_deallocateDir;
+ }
+
+ transition({BRWD_I, BRD_I}, CPUPrbResp) {
+ y_writeDataToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ pk_popResponseQueue;
+ }
+
+ transition({BRWD_I, BRD_I}, TCCPrbResp) {
+ ty_writeTCCDataToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ plr_popTCCResponseQueue;
+ }
+
+ transition(BRWD_I, NB_AckWB, BRD_I) {
+ pR_popResponseFromNBQueue;
+ }
+
+ transition(BRWD_I, ProbeAcksComplete, BRW_I) {
+ pt_popTriggerQueue;
+ }
+
+ transition(BRW_I, NB_AckWB, I) {
+ wb_data;
+ dt_deallocateTBE;
+ dd_deallocateDir;
+ pR_popResponseFromNBQueue;
+ }
+
+ transition(BRD_I, ProbeAcksComplete, I) {
+ wb_data;
+ dt_deallocateTBE;
+ dd_deallocateDir;
+ pt_popTriggerQueue;
+ }
+
+ // wait for stable state for Recall
+ transition({BRWD_I,BRD_I,BRW_I,CP_O, CP_S, CP_OM, CP_SM, CP_OSIW, BW_S, BW_E, BW_O, BW_M, I_M, I_ES, I_S, BBS_S, BBO_O, BBM_M, BBM_O, BB_M, BB_O, BB_OO, BB_S, BBS_M, BBO_M, BBO_UM, BBS_UM, S_M, O_M, BBB_S, BBB_M, BBB_E, VES_I, VM_I, VO_I, VO_S, ES_I, MO_I, I_C, I_W, CP_I}, Recall) {
+ zz_recycleRequest; // stall and wait would be for the wrong address
+ ut_updateTag; // try to find an easier recall
+ }
+
+}
diff --git a/src/mem/protocol/GPU_RfO-TCP.sm b/src/mem/protocol/GPU_RfO-TCP.sm
new file mode 100644
index 000000000..6cf9224a6
--- /dev/null
+++ b/src/mem/protocol/GPU_RfO-TCP.sm
@@ -0,0 +1,1009 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Lisa Hsu
+ */
+
+// GPU L1 data cache (TCP) controller for the read-for-ownership protocol.
+// Requests go down to the TCC directory; probes come back up from it.
+machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
+ : GPUCoalescer* coalescer;
+   Sequencer* sequencer;
+   bool use_seq_not_coal;
+   CacheMemory * L1cache;
+   int TCC_select_num_bits;
+   Cycles issue_latency := 40;  // time to send data down to TCC
+   Cycles l2_hit_latency := 18;
+
+   // To-network buffers (requests, responses, unblocks toward TCC/TCCdir).
+   MessageBuffer * requestFromTCP, network="To", virtual_network="1", vnet_type="request";
+   MessageBuffer * responseFromTCP, network="To", virtual_network="3", vnet_type="response";
+   MessageBuffer * unblockFromCore, network="To", virtual_network="5", vnet_type="unblock";
+
+   // From-network buffers (incoming probes and responses).
+   MessageBuffer * probeToTCP, network="From", virtual_network="1", vnet_type="request";
+   MessageBuffer * responseToTCP, network="From", virtual_network="3", vnet_type="response";
+
+   // Core-side request queue fed by the sequencer/coalescer.
+   MessageBuffer * mandatoryQueue;
+{
+  // MOESI-style stable states plus the transient states used while waiting
+  // on TCC responses, writeback acks, or ownership transfer.
+  state_declaration(State, desc="TCP Cache States", default="TCP_State_I") {
+    I, AccessPermission:Invalid, desc="Invalid";
+    S, AccessPermission:Read_Only, desc="Shared";
+    E, AccessPermission:Read_Write, desc="Exclusive";
+    O, AccessPermission:Read_Only, desc="Owner state in core, both clusters and other cores may be sharing line";
+    M, AccessPermission:Read_Write, desc="Modified";
+
+    I_M, AccessPermission:Busy, desc="Invalid, issued RdBlkM, have not seen response yet";
+    I_ES, AccessPermission:Busy, desc="Invalid, issued RdBlk, have not seen response yet";
+    S_M, AccessPermission:Read_Only, desc="Shared, issued CtoD, have not seen response yet";
+    O_M, AccessPermission:Read_Only, desc="Owned, issued CtoD, have not seen response yet";
+
+    ES_I, AccessPermission:Read_Only, desc="L1 replacement, waiting for clean WB ack";
+    MO_I, AccessPermission:Read_Only, desc="L1 replacement, waiting for dirty WB ack";
+
+    MO_PI, AccessPermission:Read_Only, desc="L1 downgrade, waiting for CtoD ack (or ProbeInvalidateData)";
+
+    I_C, AccessPermission:Invalid, desc="Invalid, waiting for WBAck from TCC for canceled WB";
+  }
+
+  // Events are grouped by which agent produced them: the core (via the
+  // mandatory queue), the TCC (responses), the cache itself (replacement),
+  // or the directory (probes).
+  enumeration(Event, desc="TCP Events") {
+    // Core initiated
+    Load, desc="Load";
+    Store, desc="Store";
+
+    // TCC initiated
+    TCC_AckS, desc="TCC Ack to Core Request";
+    TCC_AckE, desc="TCC Ack to Core Request";
+    TCC_AckM, desc="TCC Ack to Core Request";
+    TCC_AckCtoD, desc="TCC Ack to Core Request";
+    TCC_AckWB, desc="TCC Ack for clean WB";
+    TCC_NackWB, desc="TCC Nack for clean WB";
+
+    // Mem sys initiated
+    Repl, desc="Replacing block from cache";
+
+    // Probe Events
+    PrbInvData, desc="probe, return O or M data";
+    PrbInv, desc="probe, no need for data";
+    LocalPrbInv, desc="local probe, no need for data";
+    PrbShrData, desc="probe downgrade, return O or M data";
+  }
+
+  // Resource/stat categories reported from transitions to recordStats.
+  enumeration(RequestType, desc="To communicate stats from transitions to recordStats") {
+    DataArrayRead, desc="Read the data array";
+    DataArrayWrite, desc="Write the data array";
+    TagArrayRead, desc="Read the tag array";
+    TagArrayWrite, desc="Write the tag array";
+  }
+
+
+  // Per-line cache entry: coherence state, dirty bit, and the data block.
+  structure(Entry, desc="...", interface="AbstractCacheEntry") {
+    State CacheState, desc="cache state";
+    bool Dirty, desc="Is the data dirty (diff than memory)?";
+    DataBlock DataBlk, desc="data for the block";
+    bool FromL2, default="false", desc="block just moved from L2";
+  }
+
+  // Transaction buffer entry: tracks in-flight misses and writebacks so the
+  // line's data survives after the cache entry is deallocated.
+  structure(TBE, desc="...") {
+    State TBEState, desc="Transient state";
+    DataBlock DataBlk, desc="data for the block, required for concurrent writebacks";
+    bool Dirty, desc="Is the data dirty (different than memory)?";
+    int NumPendingMsgs, desc="Number of acks/data messages that this processor is waiting for";
+    bool Shared, desc="Victim hit by shared probe";
+  }
+
+  structure(TBETable, external="yes") {
+    TBE lookup(Addr);
+    void allocate(Addr);
+    void deallocate(Addr);
+    bool isPresent(Addr);
+  }
+
+  TBETable TBEs, template="<TCP_TBE>", constructor="m_number_of_TBEs";
+  // Low bit of the address slice used to pick the target TCC bank.
+  int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()";
+
+  // Declarations of externally provided helpers (implemented by Ruby).
+  Tick clockEdge();
+  Tick cyclesToTicks(Cycles c);
+
+  void set_cache_entry(AbstractCacheEntry b);
+  void unset_cache_entry();
+  void set_tbe(TBE b);
+  void unset_tbe();
+  void wakeUpAllBuffers();
+  void wakeUpBuffers(Addr a);
+  Cycles curCycle();
+
+ // Internal functions
+  // Look up the cache entry for an address (may be invalid).
+  Entry getCacheEntry(Addr address), return_by_pointer="yes" {
+    Entry cache_entry := static_cast(Entry, "pointer", L1cache.lookup(address));
+    return cache_entry;
+  }
+
+  // Prefer the TBE's copy of the data (in-flight transaction) over the
+  // cache entry's copy.
+  DataBlock getDataBlock(Addr addr), return_by_ref="yes" {
+    TBE tbe := TBEs.lookup(addr);
+    if(is_valid(tbe)) {
+      return tbe.DataBlk;
+    } else {
+      return getCacheEntry(addr).DataBlk;
+    }
+  }
+
+  // TBE state takes precedence over the cache entry's state; absent both,
+  // the line is Invalid.
+  State getState(TBE tbe, Entry cache_entry, Addr addr) {
+    if(is_valid(tbe)) {
+      return tbe.TBEState;
+    } else if (is_valid(cache_entry)) {
+      return cache_entry.CacheState;
+    }
+    return State:I;
+  }
+
+  // Keep TBE and cache-entry state in sync when both exist.
+  void setState(TBE tbe, Entry cache_entry, Addr addr, State state) {
+    if (is_valid(tbe)) {
+      tbe.TBEState := state;
+    }
+
+    if (is_valid(cache_entry)) {
+      cache_entry.CacheState := state;
+    }
+  }
+
+  // Map the current (TBE-first) state to a Ruby access permission.
+  AccessPermission getAccessPermission(Addr addr) {
+    TBE tbe := TBEs.lookup(addr);
+    if(is_valid(tbe)) {
+      return TCP_State_to_permission(tbe.TBEState);
+    }
+
+    Entry cache_entry := getCacheEntry(addr);
+    if(is_valid(cache_entry)) {
+      return TCP_State_to_permission(cache_entry.CacheState);
+    }
+
+    return AccessPermission:NotPresent;
+  }
+
+  // A line counts as valid only when readable or writable (not absent,
+  // invalid, or in a busy transient).
+  bool isValid(Addr addr) {
+      AccessPermission perm := getAccessPermission(addr);
+      if (perm == AccessPermission:NotPresent ||
+          perm == AccessPermission:Invalid ||
+          perm == AccessPermission:Busy) {
+          return false;
+      } else {
+          return true;
+      }
+  }
+
+  void setAccessPermission(Entry cache_entry, Addr addr, State state) {
+    if (is_valid(cache_entry)) {
+      cache_entry.changePermission(TCP_State_to_permission(state));
+    }
+  }
+
+  // Functional read: serve from the TBE if a transaction holds the data,
+  // otherwise fall through to functional memory.
+  void functionalRead(Addr addr, Packet *pkt) {
+    TBE tbe := TBEs.lookup(addr);
+    if(is_valid(tbe)) {
+      testAndRead(addr, tbe.DataBlk, pkt);
+    } else {
+      functionalMemoryRead(pkt);
+    }
+  }
+
+  // Functional write: update the TBE copy if present, then always update
+  // memory; returns the number of locations written.
+  int functionalWrite(Addr addr, Packet *pkt) {
+    int num_functional_writes := 0;
+
+    TBE tbe := TBEs.lookup(addr);
+    if(is_valid(tbe)) {
+      num_functional_writes := num_functional_writes +
+            testAndWrite(addr, tbe.DataBlk, pkt);
+    }
+
+    num_functional_writes := num_functional_writes + functionalMemoryWrite(pkt);
+    return num_functional_writes;
+  }
+
+  // Forward per-transition resource usage into the cache's statistics.
+  void recordRequestType(RequestType request_type, Addr addr) {
+    if (request_type == RequestType:DataArrayRead) {
+        L1cache.recordRequestType(CacheRequestType:DataArrayRead, addr);
+    } else if (request_type == RequestType:DataArrayWrite) {
+        L1cache.recordRequestType(CacheRequestType:DataArrayWrite, addr);
+    } else if (request_type == RequestType:TagArrayRead) {
+        L1cache.recordRequestType(CacheRequestType:TagArrayRead, addr);
+    } else if (request_type == RequestType:TagArrayWrite) {
+        L1cache.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+    }
+  }
+
+  // Check tag/data array port availability before a transition may fire.
+  bool checkResourceAvailable(RequestType request_type, Addr addr) {
+    if (request_type == RequestType:DataArrayRead) {
+      return L1cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+    } else if (request_type == RequestType:DataArrayWrite) {
+      return L1cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+    } else if (request_type == RequestType:TagArrayRead) {
+      return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+    } else if (request_type == RequestType:TagArrayWrite) {
+      return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+    } else {
+      error("Invalid RequestType type in checkResourceAvailable");
+      return true;
+    }
+  }
+
+  // Classify who satisfied a request, for latency/stat attribution:
+  // self-hit (TCP), a peer TCP, the TCC, or the TCC directory.
+  MachineType getCoherenceType(MachineID myMachID,
+                               MachineID senderMachID) {
+    if(myMachID == senderMachID) {
+        return MachineType:TCP;
+    } else if(machineIDToMachineType(senderMachID) == MachineType:TCP) {
+        return MachineType:L1Cache_wCC;
+    } else if(machineIDToMachineType(senderMachID) == MachineType:TCC) {
+        return MachineType:TCC;
+    } else {
+        return MachineType:TCCdir;
+    }
+  }
+
+  // Out Ports
+
+  out_port(requestNetwork_out, CPURequestMsg, requestFromTCP);
+  out_port(responseNetwork_out, ResponseMsg, responseFromTCP);
+  out_port(unblockNetwork_out, UnblockMsg, unblockFromCore);
+
+  // In Ports
+
+  // Probes from the TCC directory: invalidations (with or without data
+  // return, local or remote) and downgrades.
+  in_port(probeNetwork_in, TDProbeRequestMsg, probeToTCP) {
+    if (probeNetwork_in.isReady(clockEdge())) {
+      peek(probeNetwork_in, TDProbeRequestMsg, block_on="addr") {
+        DPRINTF(RubySlicc, "%s\n", in_msg);
+        DPRINTF(RubySlicc, "machineID: %s\n", machineID);
+        Entry cache_entry := getCacheEntry(in_msg.addr);
+        TBE tbe := TBEs.lookup(in_msg.addr);
+
+        if (in_msg.Type == ProbeRequestType:PrbInv) {
+          if (in_msg.ReturnData) {
+            trigger(Event:PrbInvData, in_msg.addr, cache_entry, tbe);
+          } else {
+            if(in_msg.localCtoD) {
+              // Invalidation caused by a CtoD from a core in this cluster.
+              trigger(Event:LocalPrbInv, in_msg.addr, cache_entry, tbe);
+            } else {
+              trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe);
+            }
+          }
+        } else if (in_msg.Type == ProbeRequestType:PrbDowngrade) {
+          assert(in_msg.ReturnData);
+          trigger(Event:PrbShrData, in_msg.addr, cache_entry, tbe);
+        }
+      }
+    }
+  }
+
+  // Responses from the TCC: data grants (S/E/M/CtoD, mapped by the returned
+  // coherence state) and writeback acks/nacks.
+  in_port(responseToTCP_in, ResponseMsg, responseToTCP) {
+    if (responseToTCP_in.isReady(clockEdge())) {
+      peek(responseToTCP_in, ResponseMsg, block_on="addr") {
+
+        Entry cache_entry := getCacheEntry(in_msg.addr);
+        TBE tbe := TBEs.lookup(in_msg.addr);
+
+        if (in_msg.Type == CoherenceResponseType:TDSysResp) {
+          if (in_msg.State == CoherenceState:Modified) {
+            if (in_msg.CtoD) {
+              trigger(Event:TCC_AckCtoD, in_msg.addr, cache_entry, tbe);
+            } else {
+              trigger(Event:TCC_AckM, in_msg.addr, cache_entry, tbe);
+            }
+          } else if (in_msg.State == CoherenceState:Shared) {
+            trigger(Event:TCC_AckS, in_msg.addr, cache_entry, tbe);
+          } else if (in_msg.State == CoherenceState:Exclusive) {
+            trigger(Event:TCC_AckE, in_msg.addr, cache_entry, tbe);
+          }
+        } else if (in_msg.Type == CoherenceResponseType:TDSysWBAck) {
+          trigger(Event:TCC_AckWB, in_msg.addr, cache_entry, tbe);
+        } else if (in_msg.Type == CoherenceResponseType:TDSysWBNack) {
+          trigger(Event:TCC_NackWB, in_msg.addr, cache_entry, tbe);
+        } else {
+          error("Unexpected Response Message to Core");
+        }
+      }
+    }
+  }
+
+  // Core requests. If the target line is present or a way is free, trigger
+  // Load/Store; otherwise trigger a replacement on the chosen victim line.
+  in_port(mandatoryQueue_in, RubyRequest, mandatoryQueue, desc="...") {
+    if (mandatoryQueue_in.isReady(clockEdge())) {
+      peek(mandatoryQueue_in, RubyRequest, block_on="LineAddress") {
+        Entry cache_entry := getCacheEntry(in_msg.LineAddress);
+        TBE tbe := TBEs.lookup(in_msg.LineAddress);
+        DPRINTF(RubySlicc, "%s\n", in_msg);
+        if (in_msg.Type == RubyRequestType:LD) {
+          if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) {
+            trigger(Event:Load, in_msg.LineAddress, cache_entry, tbe);
+          } else {
+            Addr victim := L1cache.cacheProbe(in_msg.LineAddress);
+            trigger(Event:Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
+          }
+        } else {
+          // Everything that is not a load is treated as a store here.
+          if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) {
+            trigger(Event:Store, in_msg.LineAddress, cache_entry, tbe);
+          } else {
+            Addr victim := L1cache.cacheProbe(in_msg.LineAddress);
+            trigger(Event:Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
+          }
+        }
+      }
+    }
+  }
+
+ // Actions
+
+  // Drop the line from the cache and clear the implicit cache_entry binding.
+  action(ic_invCache, "ic", desc="invalidate cache") {
+    if(is_valid(cache_entry)) {
+      L1cache.deallocate(address);
+    }
+    unset_cache_entry();
+  }
+
+  // Read miss: request the block (S/E grant) from the owning TCCdir bank.
+  action(n_issueRdBlk, "n", desc="Issue RdBlk") {
+    enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceRequestType:RdBlk;
+      out_msg.Requestor := machineID;
+      out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+                              TCC_select_low_bit, TCC_select_num_bits));
+      out_msg.MessageSize := MessageSizeType:Request_Control;
+      out_msg.InitialRequestTime := curCycle();
+    }
+  }
+
+  // Write miss/upgrade: request the block for modification.
+  action(nM_issueRdBlkM, "nM", desc="Issue RdBlkM") {
+    enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceRequestType:RdBlkM;
+      out_msg.Requestor := machineID;
+      out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+                              TCC_select_low_bit, TCC_select_num_bits));
+      out_msg.MessageSize := MessageSizeType:Request_Control;
+      out_msg.InitialRequestTime := curCycle();
+    }
+  }
+
+  // Evict an M/O line: VicDirty carries the data; Shared marks O lines
+  // (other sharers may exist).
+  action(vd_victim, "vd", desc="Victimize M/O Data") {
+    enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Requestor := machineID;
+      assert(is_valid(cache_entry));
+      out_msg.DataBlk := cache_entry.DataBlk;
+      out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+                              TCC_select_low_bit, TCC_select_num_bits));
+      out_msg.MessageSize := MessageSizeType:Request_Control;
+      out_msg.Type := CoherenceRequestType:VicDirty;
+      out_msg.InitialRequestTime := curCycle();
+      if (cache_entry.CacheState == State:O) {
+        out_msg.Shared := true;
+      } else {
+        out_msg.Shared := false;
+      }
+      out_msg.Dirty := cache_entry.Dirty;
+    }
+  }
+
+  // Evict a clean E/S line: VicClean needs no data.
+  action(vc_victim, "vc", desc="Victimize E/S Data") {
+    enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Requestor := machineID;
+      out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+                              TCC_select_low_bit, TCC_select_num_bits));
+      out_msg.MessageSize := MessageSizeType:Request_Control;
+      out_msg.Type := CoherenceRequestType:VicClean;
+      out_msg.InitialRequestTime := curCycle();
+      if (cache_entry.CacheState == State:S) {
+        out_msg.Shared := true;
+      } else {
+        out_msg.Shared := false;
+      }
+    }
+  }
+
+  action(a_allocate, "a", desc="allocate block") {
+    if (is_invalid(cache_entry)) {
+      set_cache_entry(L1cache.allocate(address, new Entry));
+    }
+  }
+
+  // Snapshot the line into a TBE so a concurrent writeback keeps its data
+  // even after the cache entry is gone.
+  action(t_allocateTBE, "t", desc="allocate TBE Entry") {
+    check_allocate(TBEs);
+    assert(is_valid(cache_entry));
+    TBEs.allocate(address);
+    set_tbe(TBEs.lookup(address));
+    tbe.DataBlk := cache_entry.DataBlk;  // Data only used for WBs
+    tbe.Dirty := cache_entry.Dirty;
+    tbe.Shared := false;
+  }
+
+  action(d_deallocateTBE, "d", desc="Deallocate TBE") {
+    TBEs.deallocate(address);
+    unset_tbe();
+  }
+
+  action(p_popMandatoryQueue, "pm", desc="Pop Mandatory Queue") {
+    mandatoryQueue_in.dequeue(clockEdge());
+  }
+
+  action(pr_popResponseQueue, "pr", desc="Pop Response Queue") {
+    responseToTCP_in.dequeue(clockEdge());
+  }
+
+  action(pp_popProbeQueue, "pp", desc="pop probe queue") {
+    probeNetwork_in.dequeue(clockEdge());
+  }
+
+  // Complete a load that hit locally; route to sequencer or coalescer per
+  // use_seq_not_coal.
+  action(l_loadDone, "l", desc="local load done") {
+    assert(is_valid(cache_entry));
+    if (use_seq_not_coal) {
+      sequencer.readCallback(address, cache_entry.DataBlk,
+                             false, MachineType:TCP);
+    } else {
+      coalescer.readCallback(address, MachineType:TCP, cache_entry.DataBlk);
+    }
+  }
+
+  // Complete a load satisfied remotely; attribute latency to the actual
+  // responder using the timestamps carried in the response message.
+  action(xl_loadDone, "xl", desc="remote load done") {
+    peek(responseToTCP_in, ResponseMsg) {
+      assert(is_valid(cache_entry));
+      if (use_seq_not_coal) {
+        coalescer.recordCPReadCallBack(machineID, in_msg.Sender);
+        sequencer.readCallback(address,
+                               cache_entry.DataBlk,
+                               false,
+                               machineIDToMachineType(in_msg.Sender),
+                               in_msg.InitialRequestTime,
+                               in_msg.ForwardRequestTime,
+                               in_msg.ProbeRequestStartTime);
+      } else {
+        MachineType cc_mach_type := getCoherenceType(machineID,
+                                                     in_msg.Sender);
+        coalescer.readCallback(address,
+                               cc_mach_type,
+                               cache_entry.DataBlk,
+                               in_msg.InitialRequestTime,
+                               in_msg.ForwardRequestTime,
+                               in_msg.ProbeRequestStartTime);
+      }
+    }
+  }
+
+  // Complete a store that hit locally and mark the line dirty.
+  action(s_storeDone, "s", desc="local store done") {
+    assert(is_valid(cache_entry));
+    if (use_seq_not_coal) {
+      coalescer.recordCPWriteCallBack(machineID, machineID);
+      sequencer.writeCallback(address, cache_entry.DataBlk,
+                              false, MachineType:TCP);
+    } else {
+      coalescer.writeCallback(address, MachineType:TCP, cache_entry.DataBlk);
+    }
+    cache_entry.Dirty := true;
+  }
+
+  // Complete a store satisfied remotely; as with xl_loadDone, latency is
+  // attributed to the responder. The line becomes dirty.
+  action(xs_storeDone, "xs", desc="remote store done") {
+    peek(responseToTCP_in, ResponseMsg) {
+      assert(is_valid(cache_entry));
+      if (use_seq_not_coal) {
+        coalescer.recordCPWriteCallBack(machineID, in_msg.Sender);
+        sequencer.writeCallback(address,
+                                cache_entry.DataBlk,
+                                false,
+                                machineIDToMachineType(in_msg.Sender),
+                                in_msg.InitialRequestTime,
+                                in_msg.ForwardRequestTime,
+                                in_msg.ProbeRequestStartTime);
+      } else {
+        MachineType cc_mach_type := getCoherenceType(machineID,
+                                                     in_msg.Sender);
+        coalescer.writeCallback(address,
+                                cc_mach_type,
+                                cache_entry.DataBlk,
+                                in_msg.InitialRequestTime,
+                                in_msg.ForwardRequestTime,
+                                in_msg.ProbeRequestStartTime);
+      }
+      cache_entry.Dirty := true;
+    }
+  }
+
+  // Install response data (and its dirty bit) into the cache entry.
+  action(w_writeCache, "w", desc="write data to cache") {
+    peek(responseToTCP_in, ResponseMsg) {
+      assert(is_valid(cache_entry));
+      cache_entry.DataBlk := in_msg.DataBlk;
+      cache_entry.Dirty := in_msg.Dirty;
+    }
+  }
+
+  // Tell the TCC a pending writeback was made stale by a probe; no data
+  // will follow.
+  action(ss_sendStaleNotification, "ss", desc="stale data; nothing to writeback") {
+    peek(responseToTCP_in, ResponseMsg) {
+      enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+        out_msg.addr := address;
+        out_msg.Type := CoherenceResponseType:StaleNotif;
+        out_msg.Sender := machineID;
+        out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
+                                TCC_select_low_bit, TCC_select_num_bits));
+        out_msg.MessageSize := MessageSizeType:Response_Control;
+        DPRINTF(RubySlicc, "%s\n", out_msg);
+      }
+    }
+  }
+
+  // Send writeback data from the TBE to the TCC, flagging whether a shared
+  // probe hit the victim in flight.
+  action(wb_data, "wb", desc="write back data") {
+    peek(responseToTCP_in, ResponseMsg) {
+      enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+        out_msg.addr := address;
+        out_msg.Type := CoherenceResponseType:CPUData;
+        out_msg.Sender := machineID;
+        out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
+                                TCC_select_low_bit, TCC_select_num_bits));
+        out_msg.DataBlk := tbe.DataBlk;
+        out_msg.Dirty := tbe.Dirty;
+        if (tbe.Shared) {
+          out_msg.NbReqShared := true;
+        } else {
+          out_msg.NbReqShared := false;
+        }
+        out_msg.State := CoherenceState:Shared; // faux info
+        out_msg.MessageSize := MessageSizeType:Writeback_Data;
+        DPRINTF(RubySlicc, "%s\n", out_msg);
+      }
+    }
+  }
+
+  // Probe ack without data, but keep ownership (Ntsl + UntransferredOwner):
+  // used on local invalidation probes from M/O.
+  action(piu_sendProbeResponseInvUntransferredOwnership, "piu", desc="send probe ack inv, no data, retain ownership") {
+    enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceResponseType:CPUPrbResp;  // TCC, L3  respond in same way to probes
+      out_msg.Sender := machineID;
+      // will this always be ok? probably not for multisocket
+      out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+                              TCC_select_low_bit, TCC_select_num_bits));
+      out_msg.Dirty := false;
+      out_msg.Hit := false;
+      out_msg.Ntsl := true;
+      out_msg.State := CoherenceState:NA;
+      out_msg.UntransferredOwner :=true;
+      out_msg.MessageSize := MessageSizeType:Response_Control;
+    }
+  }
+
+  // Plain invalidation ack, no data transferred.
+  action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") {
+    enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceResponseType:CPUPrbResp;  // TCC, L3  respond in same way to probes
+      out_msg.Sender := machineID;
+      out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+                              TCC_select_low_bit, TCC_select_num_bits));
+      out_msg.Dirty := false;
+      out_msg.Hit := false;
+      out_msg.Ntsl := true;
+      out_msg.State := CoherenceState:NA;
+      out_msg.MessageSize := MessageSizeType:Response_Control;
+      out_msg.isValid := isValid(address);
+    }
+  }
+
+  // Invalidation ack from an upgrade-in-progress (S_M/O_M) state.
+  action(pim_sendProbeResponseInvMs, "pim", desc="send probe ack inv, no data") {
+    enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceResponseType:CPUPrbResp;  // L3 and TCC respond in same way to probes
+      out_msg.Sender := machineID;
+      out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+                              TCC_select_low_bit, TCC_select_num_bits));
+      out_msg.Dirty := false;
+      out_msg.Ntsl := true;
+      out_msg.Hit := false;
+      out_msg.State := CoherenceState:NA;
+      out_msg.MessageSize := MessageSizeType:Response_Control;
+      out_msg.isValid := isValid(address);
+    }
+  }
+
+  // Miss response to a probe: neither hit nor not-there-since-last (Ntsl).
+  action(prm_sendProbeResponseMiss, "prm", desc="send probe ack PrbShrData, no data") {
+    enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceResponseType:CPUPrbResp;  // L3 and TCC respond in same way to probes
+      out_msg.Sender := machineID;
+      out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+                              TCC_select_low_bit, TCC_select_num_bits));
+      out_msg.Dirty := false;  // only true if sending back data i think
+      out_msg.Hit := false;
+      out_msg.Ntsl := false;
+      out_msg.State := CoherenceState:NA;
+      out_msg.MessageSize := MessageSizeType:Response_Control;
+      out_msg.isValid := isValid(address);
+    }
+  }
+
+  // Probe ack carrying the block; dirty bit comes from the TBE when a
+  // transaction is in flight, otherwise from the cache entry.
+  action(pd_sendProbeResponseData, "pd", desc="send probe ack, with data") {
+    enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+      assert(is_valid(cache_entry) || is_valid(tbe));
+      out_msg.addr := address;
+      out_msg.Type := CoherenceResponseType:CPUPrbResp;
+      out_msg.Sender := machineID;
+      out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+                              TCC_select_low_bit, TCC_select_num_bits));
+      out_msg.DataBlk := getDataBlock(address);
+      if (is_valid(tbe)) {
+        out_msg.Dirty := tbe.Dirty;
+      } else {
+        out_msg.Dirty := cache_entry.Dirty;
+      }
+      out_msg.Hit := true;
+      out_msg.State := CoherenceState:NA;
+      out_msg.MessageSize := MessageSizeType:Response_Data;
+      out_msg.isValid := isValid(address);
+      APPEND_TRANSITION_COMMENT("Sending ack with dirty ");
+      APPEND_TRANSITION_COMMENT(out_msg.Dirty);
+    }
+  }
+
+  // Probe ack with data from an upgrade-in-progress (O_M) state. Unlike
+  // pd_sendProbeResponseData this path always requires a valid cache entry,
+  // so the single assert below subsumes the weaker (entry || tbe) check.
+  action(pdm_sendProbeResponseDataMs, "pdm", desc="send probe ack, with data") {
+    enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+      assert(is_valid(cache_entry));
+      out_msg.addr := address;
+      out_msg.Type := CoherenceResponseType:CPUPrbResp;
+      out_msg.Sender := machineID;
+      out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+                              TCC_select_low_bit, TCC_select_num_bits));
+      out_msg.DataBlk := getDataBlock(address);
+      // Prefer the TBE's dirty bit when a transaction holds the line.
+      if (is_valid(tbe)) {
+        out_msg.Dirty := tbe.Dirty;
+      } else {
+        out_msg.Dirty := cache_entry.Dirty;
+      }
+      out_msg.Hit := true;
+      out_msg.State := CoherenceState:NA;
+      out_msg.MessageSize := MessageSizeType:Response_Data;
+      out_msg.isValid := isValid(address);
+      APPEND_TRANSITION_COMMENT("Sending ack with dirty ");
+      APPEND_TRANSITION_COMMENT(out_msg.Dirty);
+      DPRINTF(RubySlicc, "Data is %s\n", out_msg.DataBlk);
+    }
+  }
+
+  // Record that a shared probe hit this victim while its WB was in flight;
+  // wb_data reports this to the TCC via NbReqShared.
+  action(sf_setSharedFlip, "sf", desc="hit by shared probe, status may be different") {
+    assert(is_valid(tbe));
+    tbe.Shared := true;
+  }
+
+  action(mru_updateMRU, "mru", desc="Touch block for replacement policy") {
+    L1cache.setMRU(address);
+  }
+
+  // Unblock the TCCdir after a state change completes.
+  action(uu_sendUnblock, "uu", desc="state changed, unblock") {
+    enqueue(unblockNetwork_out, UnblockMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Sender := machineID;
+      out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+                              TCC_select_low_bit, TCC_select_num_bits));
+      out_msg.MessageSize := MessageSizeType:Unblock_Control;
+      out_msg.wasValid := isValid(address);
+      DPRINTF(RubySlicc, "%s\n", out_msg);
+    }
+  }
+
+  action(yy_recycleProbeQueue, "yy", desc="recycle probe queue") {
+    probeNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+  }
+
+  action(zz_recycleMandatoryQueue, "\z", desc="recycle mandatory queue") {
+    mandatoryQueue_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+  }
+
+  // Transitions
+
+  // transitions from base
+  transition(I, Load, I_ES) {TagArrayRead} {
+    a_allocate;
+    n_issueRdBlk;
+    p_popMandatoryQueue;
+  }
+
+  transition(I, Store, I_M) {TagArrayRead, TagArrayWrite} {
+    a_allocate;
+    nM_issueRdBlkM;
+    p_popMandatoryQueue;
+  }
+
+  // S/O stores must upgrade through the directory (CtoD via RdBlkM).
+  transition(S, Store, S_M) {TagArrayRead} {
+    mru_updateMRU;
+    nM_issueRdBlkM;
+    p_popMandatoryQueue;
+  }
+
+  // E stores complete locally, silently upgrading to M.
+  transition(E, Store, M) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+    mru_updateMRU;
+    s_storeDone;
+    p_popMandatoryQueue;
+  }
+
+  transition(O, Store, O_M) {TagArrayRead, DataArrayWrite} {
+    mru_updateMRU;
+    nM_issueRdBlkM;
+    p_popMandatoryQueue;
+  }
+
+  transition(M, Store) {TagArrayRead, DataArrayWrite} {
+    mru_updateMRU;
+    s_storeDone;
+    p_popMandatoryQueue;
+  }
+
+  // simple hit transitions
+  transition({S, E, O, M}, Load) {TagArrayRead, DataArrayRead} {
+    l_loadDone;
+    mru_updateMRU;
+    p_popMandatoryQueue;
+  }
+
+  // recycles from transients
+  transition({I_M, I_ES, ES_I, MO_I, S_M, O_M, MO_PI, I_C}, {Load, Store, Repl}) {} {
+    zz_recycleMandatoryQueue;
+  }
+
+  // Replacement: stash data in a TBE, victimize, and invalidate the entry.
+  transition({S, E}, Repl, ES_I) {TagArrayRead} {
+    t_allocateTBE;
+    vc_victim;
+    ic_invCache;
+  }
+
+  transition({O, M}, Repl, MO_I) {TagArrayRead, DataArrayRead} {
+    t_allocateTBE;
+    vd_victim;
+    ic_invCache;
+  }
+
+  // TD event transitions
+  transition(I_M, {TCC_AckM, TCC_AckCtoD}, M) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+    w_writeCache;
+    xs_storeDone;
+    uu_sendUnblock;
+    pr_popResponseQueue;
+  }
+
+  transition(I_ES, TCC_AckS, S) {TagArrayWrite,  DataArrayWrite} {
+    w_writeCache;
+    xl_loadDone;
+    uu_sendUnblock;
+    pr_popResponseQueue;
+  }
+
+  transition(I_ES, TCC_AckE, E) {TagArrayWrite, DataArrayWrite} {
+    w_writeCache;
+    xl_loadDone;
+    uu_sendUnblock;
+    pr_popResponseQueue;
+  }
+
+  transition({S_M, O_M}, TCC_AckM, M) {TagArrayWrite, DataArrayWrite} {
+    xs_storeDone;
+    uu_sendUnblock;
+    pr_popResponseQueue;
+  }
+
+  // Writeback acks/nacks finish the replacement (data only sent on ack).
+  transition({MO_I, ES_I}, TCC_NackWB, I){TagArrayWrite} {
+    d_deallocateTBE;
+    pr_popResponseQueue;
+  }
+
+  transition({MO_I, ES_I}, TCC_AckWB, I) {TagArrayWrite, DataArrayRead} {
+    wb_data;
+    d_deallocateTBE;
+    pr_popResponseQueue;
+  }
+
+  // Canceled writeback: notify TCC the data went stale in flight.
+  transition(I_C, TCC_AckWB, I) {TagArrayWrite} {
+    ss_sendStaleNotification;
+    d_deallocateTBE;
+    pr_popResponseQueue;
+  }
+
+  transition(I_C, TCC_NackWB, I) {TagArrayWrite} {
+    d_deallocateTBE;
+    pr_popResponseQueue;
+  }
+
+  // Probe transitions
+  transition({M, O}, PrbInvData, I) {TagArrayRead, TagArrayWrite} {
+    pd_sendProbeResponseData;
+    ic_invCache;
+    pp_popProbeQueue;
+  }
+
+  transition(I, PrbInvData) {TagArrayRead, TagArrayWrite} {
+    prm_sendProbeResponseMiss;
+    pp_popProbeQueue;
+  }
+
+  transition({E, S}, PrbInvData, I) {TagArrayRead, TagArrayWrite} {
+    pd_sendProbeResponseData;
+    ic_invCache;
+    pp_popProbeQueue;
+  }
+
+  transition(I_C, PrbInvData, I_C) {} {
+    pi_sendProbeResponseInv;
+    ic_invCache;
+    pp_popProbeQueue;
+  }
+
+  // Needed for TCC-based protocols. Must hold on to ownership till transfer complete
+  transition({M, O}, LocalPrbInv, MO_PI){TagArrayRead, TagArrayWrite} {
+    piu_sendProbeResponseInvUntransferredOwnership;
+    pp_popProbeQueue;
+  }
+
+  // If there is a race and we see a probe invalidate, handle normally.
+  transition(MO_PI, PrbInvData, I){TagArrayWrite} {
+    pd_sendProbeResponseData;
+    ic_invCache;
+    pp_popProbeQueue;
+  }
+
+  transition(MO_PI, PrbInv, I){TagArrayWrite} {
+    pi_sendProbeResponseInv;
+    ic_invCache;
+    pp_popProbeQueue;
+  }
+
+  // normal exit when ownership is successfully transferred
+  transition(MO_PI, TCC_AckCtoD, I) {TagArrayWrite} {
+    ic_invCache;
+    pr_popResponseQueue;
+  }
+
+  transition({M, O, E, S, I}, PrbInv, I) {TagArrayRead, TagArrayWrite} {
+    pi_sendProbeResponseInv;
+    ic_invCache;
+    pp_popProbeQueue;
+  }
+
+  transition({E, S, I}, LocalPrbInv, I){TagArrayRead, TagArrayWrite} {
+    pi_sendProbeResponseInv;
+    ic_invCache;
+    pp_popProbeQueue;
+  }
+
+
+  // Downgrade probes: owners/holders supply data and fall to O (or stay S).
+  transition({M, E, O}, PrbShrData, O) {TagArrayRead, TagArrayWrite, DataArrayRead} {
+    pd_sendProbeResponseData;
+    pp_popProbeQueue;
+  }
+
+  transition(MO_PI, PrbShrData) {DataArrayRead} {
+    pd_sendProbeResponseData;
+    pp_popProbeQueue;
+  }
+
+
+  transition(S, PrbShrData, S) {TagArrayRead, DataArrayRead} {
+    pd_sendProbeResponseData;
+    pp_popProbeQueue;
+  }
+
+  transition({I, I_C}, PrbShrData) {TagArrayRead} {
+    prm_sendProbeResponseMiss;
+    pp_popProbeQueue;
+  }
+
+  transition(I_C, PrbInv, I_C) {} {
+    pi_sendProbeResponseInv;
+    ic_invCache;
+    pp_popProbeQueue;
+  }
+
+  // Probes racing an outstanding miss: ack the inv but re-allocate the
+  // entry so the pending fill still has a place to land.
+  transition({I_M, I_ES}, {PrbInv, PrbInvData}){TagArrayRead} {
+    pi_sendProbeResponseInv;
+    ic_invCache;
+    a_allocate;  // but make sure there is room for incoming data when it arrives
+    pp_popProbeQueue;
+  }
+
+  transition({I_M, I_ES}, PrbShrData) {} {
+    prm_sendProbeResponseMiss;
+    pp_popProbeQueue;
+  }
+
+  // Upgrades hit by invalidations fall back to a full RdBlkM miss (I_M).
+  transition(S_M, PrbInvData, I_M) {TagArrayRead} {
+    pim_sendProbeResponseInvMs;
+    ic_invCache;
+    a_allocate;
+    pp_popProbeQueue;
+  }
+
+  transition(O_M, PrbInvData, I_M) {TagArrayRead,DataArrayRead} {
+    pdm_sendProbeResponseDataMs;
+    ic_invCache;
+    a_allocate;
+    pp_popProbeQueue;
+  }
+
+  transition({S_M, O_M}, {PrbInv}, I_M) {TagArrayRead} {
+    pim_sendProbeResponseInvMs;
+    ic_invCache;
+    a_allocate;
+    pp_popProbeQueue;
+  }
+
+  transition(S_M, {LocalPrbInv}, I_M) {TagArrayRead} {
+    pim_sendProbeResponseInvMs;
+    ic_invCache;
+    a_allocate;
+    pp_popProbeQueue;
+  }
+
+  transition(O_M, LocalPrbInv, I_M) {TagArrayRead} {
+    piu_sendProbeResponseInvUntransferredOwnership;
+    ic_invCache;
+    a_allocate;
+    pp_popProbeQueue;
+  }
+
+  transition({S_M, O_M}, PrbShrData) {DataArrayRead} {
+    pd_sendProbeResponseData;
+    pp_popProbeQueue;
+  }
+
+  // Probes that catch a replacement in flight: respond from the TBE and
+  // move to I_C to await (and cancel) the writeback ack.
+  transition(ES_I, PrbInvData, I_C){
+    pd_sendProbeResponseData;
+    ic_invCache;
+    pp_popProbeQueue;
+  }
+
+  transition(MO_I, PrbInvData, I_C) {DataArrayRead} {
+    pd_sendProbeResponseData;
+    ic_invCache;
+    pp_popProbeQueue;
+  }
+
+  transition(MO_I, PrbInv, I_C) {
+    pi_sendProbeResponseInv;
+    ic_invCache;
+    pp_popProbeQueue;
+  }
+
+  transition(ES_I, PrbInv, I_C) {
+    pi_sendProbeResponseInv;
+    ic_invCache;
+    pp_popProbeQueue;
+  }
+
+  transition(ES_I, PrbShrData, ES_I) {DataArrayRead} {
+    pd_sendProbeResponseData;
+    sf_setSharedFlip;
+    pp_popProbeQueue;
+  }
+
+  transition(MO_I, PrbShrData, MO_I) {DataArrayRead} {
+    pd_sendProbeResponseData;
+    sf_setSharedFlip;
+    pp_popProbeQueue;
+  }
+
+}
diff --git a/src/mem/protocol/GPU_RfO.slicc b/src/mem/protocol/GPU_RfO.slicc
new file mode 100644
index 000000000..7773ce6e0
--- /dev/null
+++ b/src/mem/protocol/GPU_RfO.slicc
@@ -0,0 +1,11 @@
+protocol "GPU_AMD_Base";
+include "RubySlicc_interfaces.slicc";
+include "MOESI_AMD_Base-msg.sm";
+include "MOESI_AMD_Base-dir.sm";
+include "MOESI_AMD_Base-CorePair.sm";
+include "GPU_RfO-TCP.sm";
+include "GPU_RfO-SQC.sm";
+include "GPU_RfO-TCC.sm";
+include "GPU_RfO-TCCdir.sm";
+include "MOESI_AMD_Base-L3cache.sm";
+include "MOESI_AMD_Base-RegionBuffer.sm";
diff --git a/src/mem/protocol/GPU_VIPER-SQC.sm b/src/mem/protocol/GPU_VIPER-SQC.sm
new file mode 100644
index 000000000..8d5b5699a
--- /dev/null
+++ b/src/mem/protocol/GPU_VIPER-SQC.sm
@@ -0,0 +1,322 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Blake Hechtman
+ */
+
+machine(MachineType:SQC, "GPU SQC (L1 I Cache)")
+ : Sequencer* sequencer;
+ CacheMemory * L1cache;
+ int TCC_select_num_bits;
+ Cycles issue_latency := 80; // time to send data down to TCC
+ Cycles l2_hit_latency := 18; // for 1MB L2, 20 for 2MB
+
+ MessageBuffer * requestFromSQC, network="To", virtual_network="1", vnet_type="request";
+
+ MessageBuffer * probeToSQC, network="From", virtual_network="1", vnet_type="request";
+ MessageBuffer * responseToSQC, network="From", virtual_network="3", vnet_type="response";
+
+ MessageBuffer * mandatoryQueue;
+{
+  // Cache states: the SQC is a read-only instruction cache, so a block
+  // is either absent (I) or readable (V).
+  state_declaration(State, desc="SQC Cache States", default="SQC_State_I") {
+    I, AccessPermission:Invalid, desc="Invalid";
+    V, AccessPermission:Read_Only, desc="Valid";
+  }
+
+  enumeration(Event, desc="SQC Events") {
+    // Core initiated
+    Fetch, desc="Fetch";
+    // Mem sys initiated
+    Repl, desc="Replacing block from cache";
+    Data, desc="Received Data";
+  }
+
+  // Resource classes named by transitions so recordRequestType() and
+  // checkResourceAvailable() can account tag- and data-array accesses.
+  // Fix: the TagArray* descs previously said "data array" (copy-paste).
+  enumeration(RequestType, desc="To communicate stats from transitions to recordStats") {
+    DataArrayRead, desc="Read the data array";
+    DataArrayWrite, desc="Write the data array";
+    TagArrayRead, desc="Read the tag array";
+    TagArrayWrite, desc="Write the tag array";
+  }
+
+
+  // Per-block stable cache entry.
+  structure(Entry, desc="...", interface="AbstractCacheEntry") {
+    State CacheState, desc="cache state";
+    bool Dirty, desc="Is the data dirty (diff than memory)?";
+    DataBlock DataBlk, desc="data for the block";
+    bool FromL2, default="false", desc="block just moved from L2";
+  }
+
+  // Transient request state, one per outstanding miss.
+  structure(TBE, desc="...") {
+    State TBEState, desc="Transient state";
+    DataBlock DataBlk, desc="data for the block, required for concurrent writebacks";
+    bool Dirty, desc="Is the data dirty (different than memory)?";
+    int NumPendingMsgs, desc="Number of acks/data messages that this processor is waiting for";
+    bool Shared, desc="Victim hit by shared probe";
+  }
+
+  // Implemented by the generated C++ TBETable.
+  structure(TBETable, external="yes") {
+    TBE lookup(Addr);
+    void allocate(Addr);
+    void deallocate(Addr);
+    bool isPresent(Addr);
+  }
+
+  TBETable TBEs, template="<SQC_TBE>", constructor="m_number_of_TBEs";
+  // Low bit of the address slice used to interleave blocks across TCCs.
+  int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()";
+
+  // Helpers provided by the SLICC runtime.
+  void set_cache_entry(AbstractCacheEntry b);
+  void unset_cache_entry();
+  void set_tbe(TBE b);
+  void unset_tbe();
+  void wakeUpAllBuffers();
+  void wakeUpBuffers(Addr a);
+  Cycles curCycle();
+
+  // Internal functions
+  Tick clockEdge();
+
+  // Look up the cache entry for an address; may be invalid (null).
+  Entry getCacheEntry(Addr address), return_by_pointer="yes" {
+    Entry cache_entry := static_cast(Entry, "pointer", L1cache.lookup(address));
+    return cache_entry;
+  }
+
+  // Prefer the in-flight TBE copy of the data over the cache array.
+  DataBlock getDataBlock(Addr addr), return_by_ref="yes" {
+    TBE tbe := TBEs.lookup(addr);
+    if(is_valid(tbe)) {
+      return tbe.DataBlk;
+    } else {
+      return getCacheEntry(addr).DataBlk;
+    }
+  }
+
+  // Transient (TBE) state takes precedence over the stable cache state;
+  // a block with neither is Invalid.
+  State getState(TBE tbe, Entry cache_entry, Addr addr) {
+    if(is_valid(tbe)) {
+      return tbe.TBEState;
+    } else if (is_valid(cache_entry)) {
+      return cache_entry.CacheState;
+    }
+    return State:I;
+  }
+
+  // Keep TBE and cache-entry state in sync on every transition.
+  void setState(TBE tbe, Entry cache_entry, Addr addr, State state) {
+    if (is_valid(tbe)) {
+      tbe.TBEState := state;
+    }
+
+    if (is_valid(cache_entry)) {
+      cache_entry.CacheState := state;
+    }
+  }
+
+  // Functional (debug) read: serve from an in-flight TBE if present,
+  // otherwise fall through to memory.
+  void functionalRead(Addr addr, Packet *pkt) {
+    TBE tbe := TBEs.lookup(addr);
+    if(is_valid(tbe)) {
+      testAndRead(addr, tbe.DataBlk, pkt);
+    } else {
+      functionalMemoryRead(pkt);
+    }
+  }
+
+  // Functional write: update the TBE copy (if any) and memory; returns
+  // the number of locations written.
+  int functionalWrite(Addr addr, Packet *pkt) {
+    int num_functional_writes := 0;
+
+    TBE tbe := TBEs.lookup(addr);
+    if(is_valid(tbe)) {
+      num_functional_writes := num_functional_writes +
+        testAndWrite(addr, tbe.DataBlk, pkt);
+    }
+
+    num_functional_writes := num_functional_writes +
+      functionalMemoryWrite(pkt);
+    return num_functional_writes;
+  }
+
+  // Permission follows the same TBE-before-cache precedence as getState.
+  AccessPermission getAccessPermission(Addr addr) {
+    TBE tbe := TBEs.lookup(addr);
+    if(is_valid(tbe)) {
+      return SQC_State_to_permission(tbe.TBEState);
+    }
+
+    Entry cache_entry := getCacheEntry(addr);
+    if(is_valid(cache_entry)) {
+      return SQC_State_to_permission(cache_entry.CacheState);
+    }
+
+    return AccessPermission:NotPresent;
+  }
+
+  void setAccessPermission(Entry cache_entry, Addr addr, State state) {
+    if (is_valid(cache_entry)) {
+      cache_entry.changePermission(SQC_State_to_permission(state));
+    }
+  }
+
+  // Map the transition resource annotations onto cache stat counters.
+  void recordRequestType(RequestType request_type, Addr addr) {
+    if (request_type == RequestType:DataArrayRead) {
+      L1cache.recordRequestType(CacheRequestType:DataArrayRead, addr);
+    } else if (request_type == RequestType:DataArrayWrite) {
+      L1cache.recordRequestType(CacheRequestType:DataArrayWrite, addr);
+    } else if (request_type == RequestType:TagArrayRead) {
+      L1cache.recordRequestType(CacheRequestType:TagArrayRead, addr);
+    } else if (request_type == RequestType:TagArrayWrite) {
+      L1cache.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+    }
+  }
+
+  // Gate transitions on tag/data-array banking availability.
+  bool checkResourceAvailable(RequestType request_type, Addr addr) {
+    if (request_type == RequestType:DataArrayRead) {
+      return L1cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+    } else if (request_type == RequestType:DataArrayWrite) {
+      return L1cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+    } else if (request_type == RequestType:TagArrayRead) {
+      return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+    } else if (request_type == RequestType:TagArrayWrite) {
+      return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+    } else {
+      error("Invalid RequestType type in checkResourceAvailable");
+      return true;
+    }
+  }
+
+  // Out Ports
+
+  out_port(requestNetwork_out, CPURequestMsg, requestFromSQC);
+
+  // In Ports
+
+  // Fill responses from the TCC. A fill that cannot be placed (set full,
+  // no valid entry) first triggers a replacement of the chosen victim.
+  in_port(responseToSQC_in, ResponseMsg, responseToSQC) {
+    if (responseToSQC_in.isReady(clockEdge())) {
+      peek(responseToSQC_in, ResponseMsg, block_on="addr") {
+
+        Entry cache_entry := getCacheEntry(in_msg.addr);
+        TBE tbe := TBEs.lookup(in_msg.addr);
+
+        if (in_msg.Type == CoherenceResponseType:TDSysResp) {
+          if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.addr)) {
+            trigger(Event:Data, in_msg.addr, cache_entry, tbe);
+          } else {
+            Addr victim := L1cache.cacheProbe(in_msg.addr);
+            trigger(Event:Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
+          }
+        } else {
+          error("Unexpected Response Message to Core");
+        }
+      }
+    }
+  }
+
+  // Core-side requests; the SQC only ever sees instruction fetches.
+  in_port(mandatoryQueue_in, RubyRequest, mandatoryQueue, desc="...") {
+    if (mandatoryQueue_in.isReady(clockEdge())) {
+      peek(mandatoryQueue_in, RubyRequest, block_on="LineAddress") {
+        Entry cache_entry := getCacheEntry(in_msg.LineAddress);
+        TBE tbe := TBEs.lookup(in_msg.LineAddress);
+
+        assert(in_msg.Type == RubyRequestType:IFETCH);
+        trigger(Event:Fetch, in_msg.LineAddress, cache_entry, tbe);
+      }
+    }
+  }
+
+  // Actions
+
+  // Deallocate the block (if present) and clear the entry pointer.
+  action(ic_invCache, "ic", desc="invalidate cache") {
+    if(is_valid(cache_entry)) {
+      L1cache.deallocate(address);
+    }
+    unset_cache_entry();
+  }
+
+  // Issue a read to the TCC bank that owns this address.
+  // NOTE(review): despite the name/desc "RdBlkS", this sends
+  // CoherenceRequestType:RdBlk - confirm that is intended.
+  action(nS_issueRdBlkS, "nS", desc="Issue RdBlkS") {
+    enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceRequestType:RdBlk;
+      out_msg.Requestor := machineID;
+      out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
+                              TCC_select_low_bit, TCC_select_num_bits));
+      out_msg.MessageSize := MessageSizeType:Request_Control;
+      out_msg.InitialRequestTime := curCycle();
+    }
+  }
+
+  action(a_allocate, "a", desc="allocate block") {
+    if (is_invalid(cache_entry)) {
+      set_cache_entry(L1cache.allocate(address, new Entry));
+    }
+  }
+
+  action(p_popMandatoryQueue, "pm", desc="Pop Mandatory Queue") {
+    mandatoryQueue_in.dequeue(clockEdge());
+  }
+
+  action(pr_popResponseQueue, "pr", desc="Pop Response Queue") {
+    responseToSQC_in.dequeue(clockEdge());
+  }
+
+  // Complete the fetch back to the sequencer (reported as L1Cache).
+  action(l_loadDone, "l", desc="local load done") {
+    assert(is_valid(cache_entry));
+    sequencer.readCallback(address, cache_entry.DataBlk, false, MachineType:L1Cache);
+    APPEND_TRANSITION_COMMENT(cache_entry.DataBlk);
+  }
+
+  // Install fill data; instruction lines are never dirty.
+  action(w_writeCache, "w", desc="write data to cache") {
+    peek(responseToSQC_in, ResponseMsg) {
+      assert(is_valid(cache_entry));
+      cache_entry.DataBlk := in_msg.DataBlk;
+      cache_entry.Dirty := false;
+    }
+  }
+
+  // Transitions
+
+  // transitions from base
+  // Fix: added the missing ';' terminators after ic_invCache and
+  // w_writeCache for consistency with every other action invocation.
+  transition({I, V}, Repl, I) {TagArrayRead, TagArrayWrite} {
+    ic_invCache;
+  }
+
+  // Fill: allocate, install the data, complete the fetch, pop response.
+  transition(I, Data, V) {TagArrayRead, TagArrayWrite, DataArrayRead} {
+    a_allocate;
+    w_writeCache;
+    l_loadDone;
+    pr_popResponseQueue;
+  }
+
+  // Miss: forward the fetch to the owning TCC.
+  transition(I, Fetch) {TagArrayRead, TagArrayWrite} {
+    nS_issueRdBlkS;
+    p_popMandatoryQueue;
+  }
+
+  // simple hit transitions
+  transition(V, Fetch) {TagArrayRead, DataArrayRead} {
+    l_loadDone;
+    p_popMandatoryQueue;
+  }
+}
diff --git a/src/mem/protocol/GPU_VIPER-TCC.sm b/src/mem/protocol/GPU_VIPER-TCC.sm
new file mode 100644
index 000000000..f62df9f4f
--- /dev/null
+++ b/src/mem/protocol/GPU_VIPER-TCC.sm
@@ -0,0 +1,739 @@
+/*
+ * Copyright (c) 2010-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Blake Hechtman
+ */
+
+machine(MachineType:TCC, "TCC Cache")
+ : CacheMemory * L2cache;
+ bool WB; /*is this cache Writeback?*/
+ Cycles l2_request_latency := 50;
+ Cycles l2_response_latency := 20;
+
+ // From the TCPs or SQCs
+ MessageBuffer * requestFromTCP, network="From", virtual_network="1", vnet_type="request";
+ // To the Cores. TCC deals only with TCPs/SQCs.
+ MessageBuffer * responseToCore, network="To", virtual_network="3", vnet_type="response";
+ // From the NB
+ MessageBuffer * probeFromNB, network="From", virtual_network="0", vnet_type="request";
+ MessageBuffer * responseFromNB, network="From", virtual_network="2", vnet_type="response";
+ // To the NB
+ MessageBuffer * requestToNB, network="To", virtual_network="0", vnet_type="request";
+ MessageBuffer * responseToNB, network="To", virtual_network="2", vnet_type="response";
+ MessageBuffer * unblockToNB, network="To", virtual_network="4", vnet_type="unblock";
+
+ MessageBuffer * triggerQueue;
+
+{
+ // EVENTS
+  // Fix: corrected desc typos - "data messgae", "atomici", and the
+  // TagArray* entries that said "data array" (copy-paste).
+  enumeration(Event, desc="TCC Events") {
+    // Requests coming from the Cores
+    RdBlk, desc="RdBlk event";
+    WrVicBlk, desc="L1 Write Through";
+    WrVicBlkBack, desc="L1 Write Through(dirty cache)";
+    Atomic, desc="Atomic Op";
+    AtomicDone, desc="AtomicOps Complete";
+    AtomicNotDone, desc="AtomicOps not Complete";
+    Data, desc="data message";
+    // Coming from this TCC
+    L2_Repl, desc="L2 Replacement";
+    // Probes
+    PrbInv, desc="Invalidating probe";
+    // Coming from Memory Controller
+    WBAck, desc="writethrough ack from memory";
+  }
+
+  // STATES
+  state_declaration(State, desc="TCC State", default="TCC_State_I") {
+    M, AccessPermission:Read_Write, desc="Modified(dirty cache only)";
+    W, AccessPermission:Read_Write, desc="Written(dirty cache only)";
+    V, AccessPermission:Read_Only, desc="Valid";
+    I, AccessPermission:Invalid, desc="Invalid";
+    IV, AccessPermission:Busy, desc="Waiting for Data";
+    WI, AccessPermission:Busy, desc="Waiting on Writethrough Ack";
+    A, AccessPermission:Busy, desc="Invalid waiting on atomic Data";
+  }
+
+  enumeration(RequestType, desc="To communicate stats from transitions to recordStats") {
+    DataArrayRead, desc="Read the data array";
+    DataArrayWrite, desc="Write the data array";
+    TagArrayRead, desc="Read the tag array";
+    TagArrayWrite, desc="Write the tag array";
+  }
+
+
+  // STRUCTURES
+
+  // Per-block stable cache entry; writeMask tracks which bytes were
+  // written through from the L1s.
+  structure(Entry, desc="...", interface="AbstractCacheEntry") {
+    State CacheState, desc="cache state";
+    bool Dirty, desc="Is the data dirty (diff from memory?)";
+    DataBlock DataBlk, desc="Data for the block";
+    WriteMask writeMask, desc="Dirty byte mask";
+  }
+
+  // Transient request state; Destination collects fill requestors and
+  // numAtomics counts outstanding atomics for the line.
+  structure(TBE, desc="...") {
+    State TBEState, desc="Transient state";
+    DataBlock DataBlk, desc="data for the block";
+    bool Dirty, desc="Is the data dirty?";
+    bool Shared, desc="Victim hit by shared probe";
+    MachineID From, desc="Waiting for writeback from...";
+    NetDest Destination, desc="Data destination";
+    int numAtomics, desc="number remaining atomics";
+  }
+
+  // Implemented by the generated C++ TBETable.
+  structure(TBETable, external="yes") {
+    TBE lookup(Addr);
+    void allocate(Addr);
+    void deallocate(Addr);
+    bool isPresent(Addr);
+  }
+
+  TBETable TBEs, template="<TCC_TBE>", constructor="m_number_of_TBEs";
+
+  // Helpers provided by the SLICC runtime.
+  void set_cache_entry(AbstractCacheEntry b);
+  void unset_cache_entry();
+  void set_tbe(TBE b);
+  void unset_tbe();
+  void wakeUpAllBuffers();
+  void wakeUpBuffers(Addr a);
+
+
+  // FUNCTION DEFINITIONS
+  Tick clockEdge();
+
+  // Look up the cache entry for an address; may be invalid (null).
+  Entry getCacheEntry(Addr addr), return_by_pointer="yes" {
+    return static_cast(Entry, "pointer", L2cache.lookup(addr));
+  }
+
+  DataBlock getDataBlock(Addr addr), return_by_ref="yes" {
+    return getCacheEntry(addr).DataBlk;
+  }
+
+  // True if the block is cached or a way is free in its set.
+  bool presentOrAvail(Addr addr) {
+    return L2cache.isTagPresent(addr) || L2cache.cacheAvail(addr);
+  }
+
+  // Transient (TBE) state takes precedence over the stable cache state.
+  State getState(TBE tbe, Entry cache_entry, Addr addr) {
+    if (is_valid(tbe)) {
+      return tbe.TBEState;
+    } else if (is_valid(cache_entry)) {
+      return cache_entry.CacheState;
+    }
+    return State:I;
+  }
+
+  // Keep TBE and cache-entry state in sync on every transition.
+  void setState(TBE tbe, Entry cache_entry, Addr addr, State state) {
+    if (is_valid(tbe)) {
+      tbe.TBEState := state;
+    }
+
+    if (is_valid(cache_entry)) {
+      cache_entry.CacheState := state;
+    }
+  }
+
+  // Functional (debug) read: serve from an in-flight TBE if present,
+  // otherwise fall through to memory.
+  void functionalRead(Addr addr, Packet *pkt) {
+    TBE tbe := TBEs.lookup(addr);
+    if(is_valid(tbe)) {
+      testAndRead(addr, tbe.DataBlk, pkt);
+    } else {
+      functionalMemoryRead(pkt);
+    }
+  }
+
+  // Functional write: update the TBE copy (if any) and memory; returns
+  // the number of locations written.
+  int functionalWrite(Addr addr, Packet *pkt) {
+    int num_functional_writes := 0;
+
+    TBE tbe := TBEs.lookup(addr);
+    if(is_valid(tbe)) {
+      num_functional_writes := num_functional_writes +
+        testAndWrite(addr, tbe.DataBlk, pkt);
+    }
+
+    num_functional_writes := num_functional_writes +
+      functionalMemoryWrite(pkt);
+    return num_functional_writes;
+  }
+
+  // Permission follows the same TBE-before-cache precedence as getState.
+  AccessPermission getAccessPermission(Addr addr) {
+    TBE tbe := TBEs.lookup(addr);
+    if(is_valid(tbe)) {
+      return TCC_State_to_permission(tbe.TBEState);
+    }
+
+    Entry cache_entry := getCacheEntry(addr);
+    if(is_valid(cache_entry)) {
+      return TCC_State_to_permission(cache_entry.CacheState);
+    }
+
+    return AccessPermission:NotPresent;
+  }
+
+  void setAccessPermission(Entry cache_entry, Addr addr, State state) {
+    if (is_valid(cache_entry)) {
+      cache_entry.changePermission(TCC_State_to_permission(state));
+    }
+  }
+
+  // Map the transition resource annotations onto cache stat counters.
+  void recordRequestType(RequestType request_type, Addr addr) {
+    if (request_type == RequestType:DataArrayRead) {
+      L2cache.recordRequestType(CacheRequestType:DataArrayRead, addr);
+    } else if (request_type == RequestType:DataArrayWrite) {
+      L2cache.recordRequestType(CacheRequestType:DataArrayWrite, addr);
+    } else if (request_type == RequestType:TagArrayRead) {
+      L2cache.recordRequestType(CacheRequestType:TagArrayRead, addr);
+    } else if (request_type == RequestType:TagArrayWrite) {
+      L2cache.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+    }
+  }
+
+  // Gate transitions on tag/data-array banking availability.
+  bool checkResourceAvailable(RequestType request_type, Addr addr) {
+    if (request_type == RequestType:DataArrayRead) {
+      return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+    } else if (request_type == RequestType:DataArrayWrite) {
+      return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+    } else if (request_type == RequestType:TagArrayRead) {
+      return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+    } else if (request_type == RequestType:TagArrayWrite) {
+      return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+    } else {
+      error("Invalid RequestType type in checkResourceAvailable");
+      return true;
+    }
+  }
+
+
+  // ** OUT_PORTS **
+
+  // Three classes of ports
+  // Class 1: downward facing network links to NB
+  out_port(requestToNB_out, CPURequestMsg, requestToNB);
+  out_port(responseToNB_out, ResponseMsg, responseToNB);
+  out_port(unblockToNB_out, UnblockMsg, unblockToNB);
+
+  // Class 2: upward facing ports to GPU cores
+  out_port(responseToCore_out, ResponseMsg, responseToCore);
+
+  // Class 3: local trigger queue (atomic-completion bookkeeping)
+  out_port(triggerQueue_out, TriggerMsg, triggerQueue);
+ //
+ // request queue going to NB
+ //
+
+
+// ** IN_PORTS **
+  // Trigger queue: dna_decrementNumAtomics enqueues an AtomicDone
+  // trigger when the outstanding-atomics count for a line hits zero;
+  // this port converts it into AtomicDone/AtomicNotDone events.
+  // Fix: the message type was misspelled "TiggerMsg" - the queue is fed
+  // by triggerQueue_out, which carries TriggerMsg.
+  in_port(triggerQueue_in, TriggerMsg, triggerQueue) {
+    if (triggerQueue_in.isReady(clockEdge())) {
+      peek(triggerQueue_in, TriggerMsg) {
+        TBE tbe := TBEs.lookup(in_msg.addr);
+        Entry cache_entry := getCacheEntry(in_msg.addr);
+        if (tbe.numAtomics == 0) {
+          trigger(Event:AtomicDone, in_msg.addr, cache_entry, tbe);
+        } else {
+          trigger(Event:AtomicNotDone, in_msg.addr, cache_entry, tbe);
+        }
+      }
+    }
+  }
+
+
+
+  // Responses from the directory/NB: fills (NBSysResp) may first force
+  // a replacement if the set is full; write-through acks are WBAck.
+  in_port(responseFromNB_in, ResponseMsg, responseFromNB) {
+    if (responseFromNB_in.isReady(clockEdge())) {
+      peek(responseFromNB_in, ResponseMsg, block_on="addr") {
+        TBE tbe := TBEs.lookup(in_msg.addr);
+        Entry cache_entry := getCacheEntry(in_msg.addr);
+        if (in_msg.Type == CoherenceResponseType:NBSysResp) {
+          if(presentOrAvail(in_msg.addr)) {
+            trigger(Event:Data, in_msg.addr, cache_entry, tbe);
+          } else {
+            Addr victim := L2cache.cacheProbe(in_msg.addr);
+            trigger(Event:L2_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
+          }
+        } else if (in_msg.Type == CoherenceResponseType:NBSysWBAck) {
+          trigger(Event:WBAck, in_msg.addr, cache_entry, tbe);
+        } else {
+          error("Unexpected Response Message to Core");
+        }
+      }
+    }
+  }
+
+  // Finally handling incoming requests (from TCP) and probes (from NB).
+  // Probes from the NB: only invalidating probes reach the TCC.
+  in_port(probeNetwork_in, NBProbeRequestMsg, probeFromNB) {
+    if (probeNetwork_in.isReady(clockEdge())) {
+      peek(probeNetwork_in, NBProbeRequestMsg) {
+        DPRINTF(RubySlicc, "%s\n", in_msg);
+        Entry cache_entry := getCacheEntry(in_msg.addr);
+        TBE tbe := TBEs.lookup(in_msg.addr);
+        trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe);
+      }
+    }
+  }
+
+  // Core requests. A write-through becomes WrVicBlkBack (kept locally)
+  // only when this TCC is configured write-back AND the block fits;
+  // otherwise it is passed through (WrVicBlk) or forces a replacement.
+  in_port(coreRequestNetwork_in, CPURequestMsg, requestFromTCP, rank=0) {
+    if (coreRequestNetwork_in.isReady(clockEdge())) {
+      peek(coreRequestNetwork_in, CPURequestMsg) {
+        TBE tbe := TBEs.lookup(in_msg.addr);
+        Entry cache_entry := getCacheEntry(in_msg.addr);
+        if (in_msg.Type == CoherenceRequestType:WriteThrough) {
+          if(WB) {
+            if(presentOrAvail(in_msg.addr)) {
+              trigger(Event:WrVicBlkBack, in_msg.addr, cache_entry, tbe);
+            } else {
+              Addr victim := L2cache.cacheProbe(in_msg.addr);
+              trigger(Event:L2_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
+            }
+          } else {
+            trigger(Event:WrVicBlk, in_msg.addr, cache_entry, tbe);
+          }
+        } else if (in_msg.Type == CoherenceRequestType:Atomic) {
+          trigger(Event:Atomic, in_msg.addr, cache_entry, tbe);
+        } else if (in_msg.Type == CoherenceRequestType:RdBlk) {
+          trigger(Event:RdBlk, in_msg.addr, cache_entry, tbe);
+        } else {
+          DPRINTF(RubySlicc, "%s\n", in_msg);
+          error("Unexpected Response Message to Core");
+        }
+      }
+    }
+  }
+  // BEGIN ACTIONS
+
+  // Deallocate the block (if present) and clear the entry pointer.
+  action(i_invL2, "i", desc="invalidate TCC cache block") {
+    if (is_valid(cache_entry)) {
+      L2cache.deallocate(address);
+    }
+    unset_cache_entry();
+  }
+
+  // Hit path: return the cached block to the requesting TCP/SQC.
+  // NOTE(review): Dirty is reported false even from state M - confirm
+  // that upstream consumers never rely on this bit.
+  action(sd_sendData, "sd", desc="send Shared response") {
+    peek(coreRequestNetwork_in, CPURequestMsg) {
+      enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
+        out_msg.addr := address;
+        out_msg.Type := CoherenceResponseType:TDSysResp;
+        out_msg.Sender := machineID;
+        out_msg.Destination.add(in_msg.Requestor);
+        out_msg.DataBlk := cache_entry.DataBlk;
+        out_msg.MessageSize := MessageSizeType:Response_Data;
+        out_msg.Dirty := false;
+        out_msg.State := CoherenceState:Shared;
+        DPRINTF(RubySlicc, "%s\n", out_msg);
+      }
+    }
+  }
+
+
+  // Fill path: fan the new data out to every requestor recorded in the
+  // TBE, then unblock the directory.
+  action(sdr_sendDataResponse, "sdr", desc="send Shared response") {
+    enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceResponseType:TDSysResp;
+      out_msg.Sender := machineID;
+      out_msg.Destination := tbe.Destination;
+      out_msg.DataBlk := cache_entry.DataBlk;
+      out_msg.MessageSize := MessageSizeType:Response_Data;
+      out_msg.Dirty := false;
+      out_msg.State := CoherenceState:Shared;
+      DPRINTF(RubySlicc, "%s\n", out_msg);
+    }
+    enqueue(unblockToNB_out, UnblockMsg, 1) {
+      out_msg.addr := address;
+      out_msg.Destination.add(map_Address_to_Directory(address));
+      out_msg.MessageSize := MessageSizeType:Unblock_Control;
+      DPRINTF(RubySlicc, "%s\n", out_msg);
+    }
+  }
+
+
+  // Forward the miss to the directory - but only for the FIRST requestor
+  // (Destination.count()==1); later requestors just piggyback on the TBE.
+  action(rd_requestData, "r", desc="Miss in L2, pass on") {
+    if(tbe.Destination.count()==1){
+      peek(coreRequestNetwork_in, CPURequestMsg) {
+        enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) {
+          out_msg.addr := address;
+          out_msg.Type := in_msg.Type;
+          out_msg.Requestor := machineID;
+          out_msg.Destination.add(map_Address_to_Directory(address));
+          out_msg.Shared := false; // unneeded for this request
+          out_msg.MessageSize := in_msg.MessageSize;
+          DPRINTF(RubySlicc, "%s\n", out_msg);
+        }
+      }
+    }
+  }
+
+  // Relay the memory-side write-through ack up to the original L1.
+  action(w_sendResponseWBAck, "w", desc="send WB Ack") {
+    peek(responseFromNB_in, ResponseMsg) {
+      enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
+        out_msg.addr := address;
+        out_msg.Type := CoherenceResponseType:TDSysWBAck;
+        out_msg.Destination.clear();
+        out_msg.Destination.add(in_msg.WTRequestor);
+        out_msg.Sender := machineID;
+        out_msg.MessageSize := MessageSizeType:Writeback_Control;
+      }
+    }
+  }
+
+  // Ack a write-through that this TCC absorbed locally (WB mode).
+  action(swb_sendWBAck, "swb", desc="send WB Ack") {
+    peek(coreRequestNetwork_in, CPURequestMsg) {
+      enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
+        out_msg.addr := address;
+        out_msg.Type := CoherenceResponseType:TDSysWBAck;
+        out_msg.Destination.clear();
+        out_msg.Destination.add(in_msg.Requestor);
+        out_msg.Sender := machineID;
+        out_msg.MessageSize := MessageSizeType:Writeback_Control;
+      }
+    }
+  }
+
+  // Return atomic result data to the core that issued the atomic.
+  action(ar_sendAtomicResponse, "ar", desc="send Atomic Ack") {
+    peek(responseFromNB_in, ResponseMsg) {
+      enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
+        out_msg.addr := address;
+        out_msg.Type := CoherenceResponseType:TDSysResp;
+        out_msg.Destination.add(in_msg.WTRequestor);
+        out_msg.Sender := machineID;
+        out_msg.MessageSize := in_msg.MessageSize;
+        out_msg.DataBlk := in_msg.DataBlk;
+      }
+    }
+  }
+
+  action(a_allocateBlock, "a", desc="allocate TCC block") {
+    if (is_invalid(cache_entry)) {
+      set_cache_entry(L2cache.allocate(address, new Entry));
+      cache_entry.writeMask.clear();
+    }
+  }
+
+  // Allocate a TBE if needed; additionally record the requestor for
+  // RdBlk/Atomic so the eventual fill can be fanned out.
+  action(t_allocateTBE, "t", desc="allocate TBE Entry") {
+    if (is_invalid(tbe)) {
+      check_allocate(TBEs);
+      TBEs.allocate(address);
+      set_tbe(TBEs.lookup(address));
+      tbe.Destination.clear();
+      tbe.numAtomics := 0;
+    }
+    if (coreRequestNetwork_in.isReady(clockEdge())) {
+      peek(coreRequestNetwork_in, CPURequestMsg) {
+        if(in_msg.Type == CoherenceRequestType:RdBlk || in_msg.Type == CoherenceRequestType:Atomic){
+          tbe.Destination.add(in_msg.Requestor);
+        }
+      }
+    }
+  }
+
+  action(dt_deallocateTBE, "dt", desc="Deallocate TBE entry") {
+    tbe.Destination.clear();
+    TBEs.deallocate(address);
+    unset_tbe();
+  }
+
+  // Install fill data from the NB into the cache array.
+  action(wcb_writeCacheBlock, "wcb", desc="write data to TCC") {
+    peek(responseFromNB_in, ResponseMsg) {
+      cache_entry.DataBlk := in_msg.DataBlk;
+      DPRINTF(RubySlicc, "Writing to TCC: %s\n", in_msg);
+    }
+  }
+
+  // Merge only the bytes the L1 actually wrote, tracked via writeMask.
+  action(wdb_writeDirtyBytes, "wdb", desc="write data to TCC") {
+    peek(coreRequestNetwork_in, CPURequestMsg) {
+      cache_entry.DataBlk.copyPartial(in_msg.DataBlk,in_msg.writeMask);
+      cache_entry.writeMask.orMask(in_msg.writeMask);
+      DPRINTF(RubySlicc, "Writing to TCC: %s\n", in_msg);
+    }
+  }
+
+  // Pass an L1 write-through straight down to memory (WT mode).
+  action(wt_writeThrough, "wt", desc="write back data") {
+    peek(coreRequestNetwork_in, CPURequestMsg) {
+      enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) {
+        out_msg.addr := address;
+        out_msg.Requestor := machineID;
+        out_msg.WTRequestor := in_msg.Requestor;
+        out_msg.Destination.add(map_Address_to_Directory(address));
+        out_msg.MessageSize := MessageSizeType:Data;
+        out_msg.Type := CoherenceRequestType:WriteThrough;
+        out_msg.Dirty := true;
+        out_msg.DataBlk := in_msg.DataBlk;
+        out_msg.writeMask.orMask(in_msg.writeMask);
+      }
+    }
+  }
+
+  // Write the cache's own dirty bytes back to memory (replacement/probe).
+  action(wb_writeBack, "wb", desc="write back data") {
+    enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) {
+      out_msg.addr := address;
+      out_msg.Requestor := machineID;
+      out_msg.WTRequestor := machineID;
+      out_msg.Destination.add(map_Address_to_Directory(address));
+      out_msg.MessageSize := MessageSizeType:Data;
+      out_msg.Type := CoherenceRequestType:WriteThrough;
+      out_msg.Dirty := true;
+      out_msg.DataBlk := cache_entry.DataBlk;
+      out_msg.writeMask.orMask(cache_entry.writeMask);
+    }
+  }
+
+  // NOTE(review): desc says "write back data" but this issues a
+  // CoherenceRequestType:Atomic to the directory (atomics bypass the TCC).
+  action(at_atomicThrough, "at", desc="write back data") {
+    peek(coreRequestNetwork_in, CPURequestMsg) {
+      enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) {
+        out_msg.addr := address;
+        out_msg.Requestor := machineID;
+        out_msg.WTRequestor := in_msg.Requestor;
+        out_msg.Destination.add(map_Address_to_Directory(address));
+        out_msg.MessageSize := MessageSizeType:Data;
+        out_msg.Type := CoherenceRequestType:Atomic;
+        out_msg.Dirty := true;
+        out_msg.writeMask.orMask(in_msg.writeMask);
+      }
+    }
+  }
+
+  action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") {
+    enqueue(responseToNB_out, ResponseMsg, 1) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceResponseType:CPUPrbResp; // TCC, L3 respond in same way to probes
+      out_msg.Sender := machineID;
+      out_msg.Destination.add(map_Address_to_Directory(address));
+      out_msg.Dirty := false;
+      out_msg.Hit := false;
+      out_msg.Ntsl := true;
+      out_msg.State := CoherenceState:NA;
+      out_msg.MessageSize := MessageSizeType:Response_Control;
+    }
+  }
+  action(ut_updateTag, "ut", desc="update Tag (i.e. set MRU)") {
+    L2cache.setMRU(address);
+  }
+
+  action(p_popRequestQueue, "p", desc="pop request queue") {
+    coreRequestNetwork_in.dequeue(clockEdge());
+  }
+
+  action(pr_popResponseQueue, "pr", desc="pop response queue") {
+    responseFromNB_in.dequeue(clockEdge());
+  }
+
+  action(pp_popProbeQueue, "pp", desc="pop probe queue") {
+    probeNetwork_in.dequeue(clockEdge());
+  }
+
+  action(z_stall, "z", desc="stall") {
+    // built-in
+  }
+
+
+  action(ina_incrementNumAtomics, "ina", desc="inc num atomics") {
+    tbe.numAtomics := tbe.numAtomics + 1;
+  }
+
+
+  // Decrement the per-line outstanding-atomics count; when it reaches
+  // zero, enqueue an AtomicDone trigger so the A-state cleanup can run.
+  // Fix: desc said "inc num atomics" (copy-paste from ina_...).
+  action(dna_decrementNumAtomics, "dna", desc="dec num atomics") {
+    tbe.numAtomics := tbe.numAtomics - 1;
+    if (tbe.numAtomics==0) {
+      enqueue(triggerQueue_out, TriggerMsg, 1) {
+        out_msg.addr := address;
+        out_msg.Type := TriggerType:AtomicDone;
+      }
+    }
+  }
+
+  // Pop the local atomic-completion trigger queue.
+  action(ptr_popTriggerQueue, "ptr", desc="pop Trigger") {
+    triggerQueue_in.dequeue(clockEdge());
+  }
+
+  // END ACTIONS
+
+  // BEGIN TRANSITIONS
+  // transitions from base
+  // Assumptions for ArrayRead/Write
+  // TBE checked before tags
+  // Data Read/Write requires Tag Read
+
+  // Stalling transitions do NOT check the tag array...and if they do,
+  // they can cause a resource stall deadlock!
+
+  transition(WI, {RdBlk, WrVicBlk, Atomic, WrVicBlkBack}) { //TagArrayRead} {
+    z_stall;
+  }
+  transition(A, {RdBlk, WrVicBlk, WrVicBlkBack}) { //TagArrayRead} {
+    z_stall;
+  }
+  transition(IV, {WrVicBlk, Atomic, WrVicBlkBack}) { //TagArrayRead} {
+    z_stall;
+  }
+  transition({M, V}, RdBlk) {TagArrayRead, DataArrayRead} {
+    sd_sendData;
+    ut_updateTag;
+    p_popRequestQueue;
+  }
+  // Written block must go to memory before it can be re-read: start the
+  // writeback and deliberately do NOT pop - the RdBlk stays queued,
+  // stalls in WI, and is replayed after WBAck returns the line to I.
+  transition(W, RdBlk, WI) {TagArrayRead, DataArrayRead} {
+    t_allocateTBE;
+    wb_writeBack;
+  }
+
+  transition(I, RdBlk, IV) {TagArrayRead} {
+    t_allocateTBE;
+    rd_requestData;
+    p_popRequestQueue;
+  }
+
+  // Additional readers while the fill is pending just join the TBE's
+  // destination set (rd_requestData only issues for the first).
+  transition(IV, RdBlk) {
+    t_allocateTBE;
+    rd_requestData;
+    p_popRequestQueue;
+  }
+
+  // Atomics bypass the TCC: invalidate any local copy and forward.
+  transition({V, I},Atomic, A) {TagArrayRead} {
+    i_invL2;
+    t_allocateTBE;
+    at_atomicThrough;
+    ina_incrementNumAtomics;
+    p_popRequestQueue;
+  }
+
+  transition(A, Atomic) {
+    at_atomicThrough;
+    ina_incrementNumAtomics;
+    p_popRequestQueue;
+  }
+
+  // Dirty data must be written back first; the Atomic is not popped and
+  // is replayed once WI resolves (same pattern as W/RdBlk above).
+  transition({M, W}, Atomic, WI) {TagArrayRead} {
+    t_allocateTBE;
+    wb_writeBack;
+  }
+
+  // Write-through mode: pass the L1 write straight down to memory.
+  transition(I, WrVicBlk) {TagArrayRead} {
+    wt_writeThrough;
+    p_popRequestQueue;
+  }
+
+  transition(V, WrVicBlk) {TagArrayRead, DataArrayWrite} {
+    ut_updateTag;
+    wdb_writeDirtyBytes;
+    wt_writeThrough;
+    p_popRequestQueue;
+  }
+
+  // Write-back mode: absorb the L1 write locally and ack immediately.
+  transition({V, M}, WrVicBlkBack, M) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+    ut_updateTag;
+    swb_sendWBAck;
+    wdb_writeDirtyBytes;
+    p_popRequestQueue;
+  }
+
+  transition(W, WrVicBlkBack) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+    ut_updateTag;
+    swb_sendWBAck;
+    wdb_writeDirtyBytes;
+    p_popRequestQueue;
+  }
+
+  transition(I, WrVicBlkBack, W) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+    a_allocateBlock;
+    ut_updateTag;
+    swb_sendWBAck;
+    wdb_writeDirtyBytes;
+    p_popRequestQueue;
+  }
+
+  // Replacement of a dirty line: write back, then wait for the ack.
+  transition({W, M}, L2_Repl, WI) {TagArrayRead, DataArrayRead} {
+    t_allocateTBE;
+    wb_writeBack;
+    i_invL2;
+  }
+
+  transition({I, V}, L2_Repl, I) {TagArrayRead, TagArrayWrite} {
+    i_invL2;
+  }
+
+  transition({A, IV, WI}, L2_Repl) {
+    i_invL2;
+  }
+
+  // Probes: clean copies just ack and invalidate...
+  transition({I, V}, PrbInv, I) {TagArrayRead, TagArrayWrite} {
+    pi_sendProbeResponseInv;
+    pp_popProbeQueue;
+  }
+
+  // ...while M demotes to W so the dirty bytes are still written back.
+  transition(M, PrbInv, W) {TagArrayRead, TagArrayWrite} {
+    pi_sendProbeResponseInv;
+    pp_popProbeQueue;
+  }
+
+  transition(W, PrbInv) {TagArrayRead} {
+    pi_sendProbeResponseInv;
+    pp_popProbeQueue;
+  }
+
+  transition({A, IV, WI}, PrbInv) {
+    pi_sendProbeResponseInv;
+    pp_popProbeQueue;
+  }
+
+  // Fill arrives: install, respond to all waiters, unblock directory.
+  transition(IV, Data, V) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+    a_allocateBlock;
+    ut_updateTag;
+    wcb_writeCacheBlock;
+    sdr_sendDataResponse;
+    pr_popResponseQueue;
+    dt_deallocateTBE;
+  }
+
+  // Atomic result: forward to the requestor; A->I happens only via the
+  // AtomicDone trigger once numAtomics reaches zero.
+  transition(A, Data) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+    a_allocateBlock;
+    ar_sendAtomicResponse;
+    dna_decrementNumAtomics;
+    pr_popResponseQueue;
+  }
+
+  transition(A, AtomicDone, I) {TagArrayRead, TagArrayWrite} {
+    dt_deallocateTBE;
+    ptr_popTriggerQueue;
+  }
+
+  transition(A, AtomicNotDone) {TagArrayRead} {
+    ptr_popTriggerQueue;
+  }
+
+  //M,W should not see WBAck as the cache is in WB mode
+  //WBAcks do not need to check tags
+  transition({I, V, IV, A}, WBAck) {
+    w_sendResponseWBAck;
+    pr_popResponseQueue;
+  }
+
+  transition(WI, WBAck,I) {
+    dt_deallocateTBE;
+    pr_popResponseQueue;
+  }
+}
diff --git a/src/mem/protocol/GPU_VIPER-TCP.sm b/src/mem/protocol/GPU_VIPER-TCP.sm
new file mode 100644
index 000000000..d81196b17
--- /dev/null
+++ b/src/mem/protocol/GPU_VIPER-TCP.sm
@@ -0,0 +1,747 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Blake Hechtman
+ */
+
+// TCP: the GPU L1 data cache controller of the VIPER protocol.
+// It can run as a write-back cache (WB=true), a write-through cache
+// (WB=false), or be bypassed entirely (disableL1).  Atomics are never
+// performed locally; they are always forwarded to the TCC (L2).
+machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
+ : VIPERCoalescer* coalescer;
+ Sequencer* sequencer;
+ bool use_seq_not_coal;
+ CacheMemory * L1cache;
+ bool WB; /*is this cache Writeback?*/
+ bool disableL1; /* bypass L1 cache? */
+ int TCC_select_num_bits;
+ Cycles issue_latency := 40; // time to send data down to TCC
+ Cycles l2_hit_latency := 18;
+
+ MessageBuffer * requestFromTCP, network="To", virtual_network="1", vnet_type="request";
+ MessageBuffer * responseFromTCP, network="To", virtual_network="3", vnet_type="response";
+ MessageBuffer * unblockFromCore, network="To", virtual_network="5", vnet_type="unblock";
+
+ MessageBuffer * probeToTCP, network="From", virtual_network="1", vnet_type="request";
+ MessageBuffer * responseToTCP, network="From", virtual_network="3", vnet_type="response";
+ MessageBuffer * mandatoryQueue;
+
+{
+ // Cache states.  Per the descs: W holds only locally written bytes
+ // (writeMask tracks them), M is written AND fully valid, L is for
+ // local/spill data that is never made visible through the TCC.
+ state_declaration(State, desc="TCP Cache States", default="TCP_State_I") {
+ I, AccessPermission:Invalid, desc="Invalid";
+ V, AccessPermission:Read_Only, desc="Valid";
+ W, AccessPermission:Read_Write, desc="Written";
+ M, AccessPermission:Read_Write, desc="Written and Valid";
+ L, AccessPermission:Read_Write, desc="Local access is modifable";
+ A, AccessPermission:Invalid, desc="Waiting on Atomic";
+ }
+
+ enumeration(Event, desc="TCP Events") {
+ // Core initiated
+ Load, desc="Load";
+ Store, desc="Store to L1 (L1 is dirty)";
+ StoreThrough, desc="Store directly to L2(L1 is clean)";
+ StoreLocal, desc="Store to L1 but L1 is clean";
+ Atomic, desc="Atomic";
+ Flush, desc="Flush if dirty(wbL1 for Store Release)";
+ Evict, desc="Evict if clean(invL1 for Load Acquire)";
+ // Mem sys initiated
+ Repl, desc="Replacing block from cache";
+
+ // TCC initiated
+ TCC_Ack, desc="TCC Ack to Core Request";
+ TCC_AckWB, desc="TCC Ack for WB";
+ // Disable L1 cache
+ Bypass, desc="Bypass the entire L1 cache";
+ }
+
+ enumeration(RequestType,
+ desc="To communicate stats from transitions to recordStats") {
+ DataArrayRead, desc="Read the data array";
+ DataArrayWrite, desc="Write the data array";
+ TagArrayRead, desc="Read the data array";
+ TagArrayWrite, desc="Write the data array";
+ TagArrayFlash, desc="Flash clear the data array";
+ }
+
+
+ structure(Entry, desc="...", interface="AbstractCacheEntry") {
+ State CacheState, desc="cache state";
+ bool Dirty, desc="Is the data dirty (diff than memory)?";
+ DataBlock DataBlk, desc="data for the block";
+ bool FromL2, default="false", desc="block just moved from L2";
+ WriteMask writeMask, desc="written bytes masks";
+ }
+
+ structure(TBE, desc="...") {
+ State TBEState, desc="Transient state";
+ DataBlock DataBlk, desc="data for the block, required for concurrent writebacks";
+ bool Dirty, desc="Is the data dirty (different than memory)?";
+ int NumPendingMsgs,desc="Number of acks/data messages that this processor is waiting for";
+ bool Shared, desc="Victim hit by shared probe";
+ }
+
+ structure(TBETable, external="yes") {
+ TBE lookup(Addr);
+ void allocate(Addr);
+ void deallocate(Addr);
+ bool isPresent(Addr);
+ }
+
+ TBETable TBEs, template="<TCP_TBE>", constructor="m_number_of_TBEs";
+ int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()";
+ // Flush bookkeeping: WTcnt counts outstanding write-throughs, Fcnt counts
+ // acks observed while a flush is in progress (see wb_wbDone / wd_wtDone).
+ int WTcnt, default="0";
+ int Fcnt, default="0";
+ bool inFlush, default="false";
+
+ // Helpers provided by the SLICC runtime / generated controller.
+ void set_cache_entry(AbstractCacheEntry b);
+ void unset_cache_entry();
+ void set_tbe(TBE b);
+ void unset_tbe();
+ void wakeUpAllBuffers();
+ void wakeUpBuffers(Addr a);
+ Cycles curCycle();
+
+ // Internal functions
+ Tick clockEdge();
+ Tick cyclesToTicks(Cycles c);
+ // Returns the L1 entry for address, or an invalid pointer on a miss.
+ Entry getCacheEntry(Addr address), return_by_pointer="yes" {
+ Entry cache_entry := static_cast(Entry, "pointer", L1cache.lookup(address));
+ return cache_entry;
+ }
+
+ // TBE data takes precedence over the cache copy (in-flight transaction).
+ DataBlock getDataBlock(Addr addr), return_by_ref="yes" {
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ return tbe.DataBlk;
+ } else {
+ return getCacheEntry(addr).DataBlk;
+ }
+ }
+
+ State getState(TBE tbe, Entry cache_entry, Addr addr) {
+ if (is_valid(tbe)) {
+ return tbe.TBEState;
+ } else if (is_valid(cache_entry)) {
+ return cache_entry.CacheState;
+ }
+ return State:I;
+ }
+
+ void setState(TBE tbe, Entry cache_entry, Addr addr, State state) {
+ if (is_valid(tbe)) {
+ tbe.TBEState := state;
+ }
+
+ if (is_valid(cache_entry)) {
+ cache_entry.CacheState := state;
+ }
+ }
+
+ // Functional reads that miss the TBE fall through to memory (not the
+ // cache entry) — the authoritative copy is at the memory side.
+ void functionalRead(Addr addr, Packet *pkt) {
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ testAndRead(addr, tbe.DataBlk, pkt);
+ } else {
+ functionalMemoryRead(pkt);
+ }
+ }
+
+ // Functional writes update both the in-flight TBE copy (if any) and memory.
+ int functionalWrite(Addr addr, Packet *pkt) {
+ int num_functional_writes := 0;
+
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ num_functional_writes := num_functional_writes +
+ testAndWrite(addr, tbe.DataBlk, pkt);
+ }
+
+ num_functional_writes := num_functional_writes +
+ functionalMemoryWrite(pkt);
+ return num_functional_writes;
+ }
+
+ AccessPermission getAccessPermission(Addr addr) {
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ return TCP_State_to_permission(tbe.TBEState);
+ }
+
+ Entry cache_entry := getCacheEntry(addr);
+ if(is_valid(cache_entry)) {
+ return TCP_State_to_permission(cache_entry.CacheState);
+ }
+
+ return AccessPermission:NotPresent;
+ }
+
+ bool isValid(Addr addr) {
+ AccessPermission perm := getAccessPermission(addr);
+ if (perm == AccessPermission:NotPresent ||
+ perm == AccessPermission:Invalid ||
+ perm == AccessPermission:Busy) {
+ return false;
+ } else {
+ return true;
+ }
+ }
+
+ void setAccessPermission(Entry cache_entry, Addr addr, State state) {
+ if (is_valid(cache_entry)) {
+ cache_entry.changePermission(TCP_State_to_permission(state));
+ }
+ }
+
+ void recordRequestType(RequestType request_type, Addr addr) {
+ if (request_type == RequestType:DataArrayRead) {
+ L1cache.recordRequestType(CacheRequestType:DataArrayRead, addr);
+ } else if (request_type == RequestType:DataArrayWrite) {
+ L1cache.recordRequestType(CacheRequestType:DataArrayWrite, addr);
+ } else if (request_type == RequestType:TagArrayRead) {
+ L1cache.recordRequestType(CacheRequestType:TagArrayRead, addr);
+ } else if (request_type == RequestType:TagArrayFlash) {
+ // TagArrayFlash is recorded as a TagArrayRead for stats purposes.
+ L1cache.recordRequestType(CacheRequestType:TagArrayRead, addr);
+ } else if (request_type == RequestType:TagArrayWrite) {
+ L1cache.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+ }
+ }
+
+ bool checkResourceAvailable(RequestType request_type, Addr addr) {
+ if (request_type == RequestType:DataArrayRead) {
+ return L1cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:DataArrayWrite) {
+ return L1cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:TagArrayRead) {
+ return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else if (request_type == RequestType:TagArrayWrite) {
+ return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else if (request_type == RequestType:TagArrayFlash) {
+ // FIXME should check once per cache, rather than once per cacheline
+ return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else {
+ error("Invalid RequestType type in checkResourceAvailable");
+ return true;
+ }
+ }
+
+ // Out Ports
+
+ out_port(requestNetwork_out, CPURequestMsg, requestFromTCP);
+
+ // In Ports
+
+ // Responses from the TCC: data/acks for core requests and write-back acks.
+ in_port(responseToTCP_in, ResponseMsg, responseToTCP) {
+ if (responseToTCP_in.isReady(clockEdge())) {
+ peek(responseToTCP_in, ResponseMsg, block_on="addr") {
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ if (in_msg.Type == CoherenceResponseType:TDSysResp) {
+ // disable L1 cache
+ if (disableL1) {
+ trigger(Event:Bypass, in_msg.addr, cache_entry, tbe);
+ } else {
+ if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.addr)) {
+ trigger(Event:TCC_Ack, in_msg.addr, cache_entry, tbe);
+ } else {
+ // No room: evict a victim before the fill can be installed.
+ Addr victim := L1cache.cacheProbe(in_msg.addr);
+ trigger(Event:Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
+ }
+ }
+ } else if (in_msg.Type == CoherenceResponseType:TDSysWBAck ||
+ in_msg.Type == CoherenceResponseType:NBSysWBAck) {
+ trigger(Event:TCC_AckWB, in_msg.addr, cache_entry, tbe);
+ } else {
+ error("Unexpected Response Message to Core");
+ }
+ }
+ }
+ }
+
+ // Requests from the core side (via sequencer/coalescer).
+ in_port(mandatoryQueue_in, RubyRequest, mandatoryQueue, desc="...") {
+ if (mandatoryQueue_in.isReady(clockEdge())) {
+ peek(mandatoryQueue_in, RubyRequest, block_on="LineAddress") {
+ Entry cache_entry := getCacheEntry(in_msg.LineAddress);
+ TBE tbe := TBEs.lookup(in_msg.LineAddress);
+ DPRINTF(RubySlicc, "%s\n", in_msg);
+ if (in_msg.Type == RubyRequestType:LD) {
+ trigger(Event:Load, in_msg.LineAddress, cache_entry, tbe);
+ } else if (in_msg.Type == RubyRequestType:ATOMIC) {
+ trigger(Event:Atomic, in_msg.LineAddress, cache_entry, tbe);
+ } else if (in_msg.Type == RubyRequestType:ST) {
+ if(disableL1) {
+ trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe);
+ } else {
+ if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) {
+ // Spill-segment stores stay local (L state); otherwise the
+ // WB flag selects write-back vs write-through behavior.
+ if (in_msg.segment == HSASegment:SPILL) {
+ trigger(Event:StoreLocal, in_msg.LineAddress, cache_entry, tbe);
+ } else if (WB) {
+ trigger(Event:Store, in_msg.LineAddress, cache_entry, tbe);
+ } else {
+ trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe);
+ }
+ } else {
+ Addr victim := L1cache.cacheProbe(in_msg.LineAddress);
+ trigger(Event:Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
+ }
+ } // end if (disableL1)
+ } else if (in_msg.Type == RubyRequestType:FLUSH) {
+ trigger(Event:Flush, in_msg.LineAddress, cache_entry, tbe);
+ } else if (in_msg.Type == RubyRequestType:REPLACEMENT){
+ trigger(Event:Evict, in_msg.LineAddress, cache_entry, tbe);
+ } else {
+ error("Unexpected Request Message from VIC");
+ // NOTE(review): everything below this error() is unreachable —
+ // error() aborts the simulation. Looks like a leftover copy of
+ // the ST branch above; confirm and consider removing.
+ if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) {
+ if (WB) {
+ trigger(Event:Store, in_msg.LineAddress, cache_entry, tbe);
+ } else {
+ trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe);
+ }
+ } else {
+ Addr victim := L1cache.cacheProbe(in_msg.LineAddress);
+ trigger(Event:Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
+ }
+ }
+ }
+ }
+ }
+
+ // Actions
+
+ action(ic_invCache, "ic", desc="invalidate cache") {
+ if(is_valid(cache_entry)) {
+ cache_entry.writeMask.clear();
+ L1cache.deallocate(address);
+ }
+ unset_cache_entry();
+ }
+
+ action(n_issueRdBlk, "n", desc="Issue RdBlk") {
+ enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceRequestType:RdBlk;
+ out_msg.Requestor := machineID;
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
+ TCC_select_low_bit, TCC_select_num_bits));
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ out_msg.InitialRequestTime := curCycle();
+ }
+ }
+
+ // Complete a read that bypassed the L1: deliver TCC data straight to the
+ // sequencer/coalescer without installing it in the cache.
+ action(rb_bypassDone, "rb", desc="bypass L1 of read access") {
+ peek(responseToTCP_in, ResponseMsg) {
+ DataBlock tmp:= in_msg.DataBlk;
+ if (use_seq_not_coal) {
+ sequencer.readCallback(address, tmp, false, MachineType:L1Cache);
+ } else {
+ coalescer.readCallback(address, MachineType:L1Cache, tmp);
+ }
+ if(is_valid(cache_entry)) {
+ unset_cache_entry();
+ }
+ }
+ }
+
+ action(wab_bypassDone, "wab", desc="bypass L1 of write access") {
+ peek(responseToTCP_in, ResponseMsg) {
+ DataBlock tmp := in_msg.DataBlk;
+ if (use_seq_not_coal) {
+ sequencer.writeCallback(address, tmp, false, MachineType:L1Cache);
+ } else {
+ coalescer.writeCallback(address, MachineType:L1Cache, tmp);
+ }
+ }
+ }
+
+ // Used from state W: if all requested bytes were written locally (the
+ // write mask covers the request) serve the load from the cache, otherwise
+ // fetch the full line from the TCC.
+ action(norl_issueRdBlkOrloadDone, "norl", desc="local load done") {
+ peek(mandatoryQueue_in, RubyRequest){
+ if (cache_entry.writeMask.cmpMask(in_msg.writeMask)) {
+ if (use_seq_not_coal) {
+ sequencer.readCallback(address, cache_entry.DataBlk, false, MachineType:L1Cache);
+ } else {
+ coalescer.readCallback(address, MachineType:L1Cache, cache_entry.DataBlk);
+ }
+ } else {
+ enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceRequestType:RdBlk;
+ out_msg.Requestor := machineID;
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
+ TCC_select_low_bit, TCC_select_num_bits));
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ out_msg.InitialRequestTime := curCycle();
+ }
+ }
+ }
+ }
+
+ // Send the locally written bytes to the TCC. Increments WTcnt so the
+ // flush machinery can wait for the matching TCC_AckWB (see wd_wtDone).
+ action(wt_writeThrough, "wt", desc="Flush dirty data") {
+ WTcnt := WTcnt + 1;
+ APPEND_TRANSITION_COMMENT("write++ = ");
+ APPEND_TRANSITION_COMMENT(WTcnt);
+ enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Requestor := machineID;
+ assert(is_valid(cache_entry));
+ out_msg.DataBlk := cache_entry.DataBlk;
+ out_msg.writeMask.clear();
+ out_msg.writeMask.orMask(cache_entry.writeMask);
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
+ TCC_select_low_bit, TCC_select_num_bits));
+ out_msg.MessageSize := MessageSizeType:Data;
+ out_msg.Type := CoherenceRequestType:WriteThrough;
+ out_msg.InitialRequestTime := curCycle();
+ out_msg.Shared := false;
+ }
+ }
+
+ // Atomics are always forwarded to the TCC; the L1 never performs them.
+ action(at_atomicThrough, "at", desc="send Atomic") {
+ peek(mandatoryQueue_in, RubyRequest) {
+ enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Requestor := machineID;
+ out_msg.writeMask.clear();
+ out_msg.writeMask.orMask(in_msg.writeMask);
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
+ TCC_select_low_bit, TCC_select_num_bits));
+ out_msg.MessageSize := MessageSizeType:Data;
+ out_msg.Type := CoherenceRequestType:Atomic;
+ out_msg.InitialRequestTime := curCycle();
+ out_msg.Shared := false;
+ }
+ }
+ }
+
+ action(a_allocate, "a", desc="allocate block") {
+ if (is_invalid(cache_entry)) {
+ set_cache_entry(L1cache.allocate(address, new Entry));
+ }
+ cache_entry.writeMask.clear();
+ }
+
+ action(t_allocateTBE, "t", desc="allocate TBE Entry") {
+ check_allocate(TBEs);
+ TBEs.allocate(address);
+ set_tbe(TBEs.lookup(address));
+ }
+
+ action(d_deallocateTBE, "d", desc="Deallocate TBE") {
+ TBEs.deallocate(address);
+ unset_tbe();
+ }
+
+ action(sf_setFlush, "sf", desc="set flush") {
+ inFlush := true;
+ APPEND_TRANSITION_COMMENT(" inFlush is true");
+ }
+
+ action(p_popMandatoryQueue, "pm", desc="Pop Mandatory Queue") {
+ mandatoryQueue_in.dequeue(clockEdge());
+ }
+
+ action(pr_popResponseQueue, "pr", desc="Pop Response Queue") {
+ responseToTCP_in.dequeue(clockEdge());
+ }
+
+ action(l_loadDone, "l", desc="local load done") {
+ assert(is_valid(cache_entry));
+ if (use_seq_not_coal) {
+ sequencer.readCallback(address, cache_entry.DataBlk, false, MachineType:L1Cache);
+ } else {
+ coalescer.readCallback(address, MachineType:L1Cache, cache_entry.DataBlk);
+ }
+ }
+
+ action(s_storeDone, "s", desc="local store done") {
+ assert(is_valid(cache_entry));
+
+ if (use_seq_not_coal) {
+ sequencer.writeCallback(address, cache_entry.DataBlk, false, MachineType:L1Cache);
+ } else {
+ coalescer.writeCallback(address, MachineType:L1Cache, cache_entry.DataBlk);
+ }
+ cache_entry.Dirty := true;
+ }
+
+ // Invalidate callback is only defined on the VIPER coalescer; a TCP driven
+ // by a plain Sequencer cannot handle Evict and asserts out.
+ action(inv_invDone, "inv", desc="local inv done") {
+ if (use_seq_not_coal) {
+ DPRINTF(RubySlicc, "Sequencer does not define invCallback!\n");
+ assert(false);
+ } else {
+ coalescer.invCallback(address);
+ }
+ }
+
+ // Flush completion check: Fcnt counts acks seen during the flush; once it
+ // exceeds the outstanding write-throughs, notify the coalescer and clear
+ // the flush when both counters have drained.
+ // NOTE(review): the Fcnt/WTcnt handshake is subtle — confirm against the
+ // VIPERCoalescer wbCallback contract before modifying.
+ action(wb_wbDone, "wb", desc="local wb done") {
+ if (inFlush == true) {
+ Fcnt := Fcnt + 1;
+ if (Fcnt > WTcnt) {
+ if (use_seq_not_coal) {
+ DPRINTF(RubySlicc, "Sequencer does not define wbCallback!\n");
+ assert(false);
+ } else {
+ coalescer.wbCallback(address);
+ }
+ Fcnt := Fcnt - 1;
+ }
+ if (WTcnt == 0 && Fcnt == 0) {
+ inFlush := false;
+ APPEND_TRANSITION_COMMENT(" inFlush is false");
+ }
+ }
+ }
+
+ // A write-through was acknowledged by the TCC.
+ action(wd_wtDone, "wd", desc="writethrough done") {
+ WTcnt := WTcnt - 1;
+ if (inFlush == true) {
+ Fcnt := Fcnt -1;
+ }
+ assert(WTcnt >= 0);
+ APPEND_TRANSITION_COMMENT("write-- = ");
+ APPEND_TRANSITION_COMMENT(WTcnt);
+ }
+
+ // Merge the store's bytes into the line and record them in the write mask.
+ action(dw_dirtyWrite, "dw", desc="update write mask"){
+ peek(mandatoryQueue_in, RubyRequest) {
+ cache_entry.DataBlk.copyPartial(in_msg.WTData,in_msg.writeMask);
+ cache_entry.writeMask.orMask(in_msg.writeMask);
+ }
+ }
+ // Install response data but keep locally written bytes: the incoming block
+ // is patched with the entry's dirty bytes before being stored.
+ action(w_writeCache, "w", desc="write data to cache") {
+ peek(responseToTCP_in, ResponseMsg) {
+ assert(is_valid(cache_entry));
+ DataBlock tmp := in_msg.DataBlk;
+ tmp.copyPartial(cache_entry.DataBlk,cache_entry.writeMask);
+ cache_entry.DataBlk := tmp;
+ }
+ }
+
+ action(mru_updateMRU, "mru", desc="Touch block for replacement policy") {
+ L1cache.setMRU(address);
+ }
+
+// action(zz_recycleMandatoryQueue, "\z", desc="recycle mandatory queue") {
+// mandatoryQueue_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+// }
+
+ action(z_stall, "z", desc="stall; built-in") {
+ // built-int action
+ }
+
+ // Transitions
+ // ArrayRead/Write assumptions:
+ // All requests read Tag Array
+ // TBE allocation write the TagArray to I
+ // TBE only checked on misses
+ // Stores will also write dirty bits in the tag
+ // WriteThroughs still need to use cache entry as staging buffer for wavefront
+
+ // Stalling transitions do NOT check the tag array...and if they do,
+ // they can cause a resource stall deadlock!
+
+ transition({A}, {Load, Store, Atomic, StoreThrough}) { //TagArrayRead} {
+ z_stall;
+ }
+
+ transition({M, V, L}, Load) {TagArrayRead, DataArrayRead} {
+ l_loadDone;
+ mru_updateMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(I, Load) {TagArrayRead} {
+ n_issueRdBlk;
+ p_popMandatoryQueue;
+ }
+
+ transition({V, I}, Atomic, A) {TagArrayRead, TagArrayWrite} {
+ t_allocateTBE;
+ mru_updateMRU;
+ at_atomicThrough;
+ p_popMandatoryQueue;
+ }
+
+ // NOTE(review): unlike {V,I}/Atomic above, this transition does not pop
+ // the mandatory queue — the Atomic request is re-examined after the line
+ // is invalidated and state A resolves. Confirm this re-processing (and
+ // the repeated at_atomicThrough it implies) is intended.
+ transition({M, W}, Atomic, A) {TagArrayRead, TagArrayWrite} {
+ wt_writeThrough;
+ t_allocateTBE;
+ at_atomicThrough;
+ ic_invCache;
+ }
+
+ // A load on a partially written line: write the dirty bytes back, then
+ // either complete locally (mask covers request) or fetch from the TCC.
+ transition(W, Load, I) {TagArrayRead, DataArrayRead} {
+ wt_writeThrough;
+ norl_issueRdBlkOrloadDone;
+ p_popMandatoryQueue;
+ }
+
+ transition({I}, StoreLocal, L) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+ a_allocate;
+ dw_dirtyWrite;
+ s_storeDone;
+ p_popMandatoryQueue;
+ }
+
+ transition({L, V}, StoreLocal, L) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+ dw_dirtyWrite;
+ mru_updateMRU;
+ s_storeDone;
+ p_popMandatoryQueue;
+ }
+
+ transition(I, Store, W) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+ a_allocate;
+ dw_dirtyWrite;
+ s_storeDone;
+ p_popMandatoryQueue;
+ }
+
+ transition(V, Store, M) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+ dw_dirtyWrite;
+ mru_updateMRU;
+ s_storeDone;
+ p_popMandatoryQueue;
+ }
+
+ transition({M, W}, Store) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+ dw_dirtyWrite;
+ mru_updateMRU;
+ s_storeDone;
+ p_popMandatoryQueue;
+ }
+
+ //M,W should not see storeThrough
+ // Write-through stores stage the data in a cache entry, push it to the
+ // TCC, then drop the entry — the L1 stays clean.
+ transition(I, StoreThrough) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+ a_allocate;
+ dw_dirtyWrite;
+ s_storeDone;
+ wt_writeThrough;
+ ic_invCache;
+ p_popMandatoryQueue;
+ }
+
+ transition({V,L}, StoreThrough, I) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+ dw_dirtyWrite;
+ s_storeDone;
+ wt_writeThrough;
+ ic_invCache;
+ p_popMandatoryQueue;
+ }
+
+ transition(I, TCC_Ack, V) {TagArrayRead, TagArrayWrite, DataArrayRead, DataArrayWrite} {
+ a_allocate;
+ w_writeCache;
+ l_loadDone;
+ pr_popResponseQueue;
+ }
+
+ transition(I, Bypass, I) {
+ rb_bypassDone;
+ pr_popResponseQueue;
+ }
+
+ transition(A, Bypass, I){
+ d_deallocateTBE;
+ wab_bypassDone;
+ pr_popResponseQueue;
+ }
+
+ // Atomic response: stage it in a temporary entry for the callback, then
+ // invalidate so the L1 holds no copy of an atomically updated line.
+ transition(A, TCC_Ack, I) {TagArrayRead, DataArrayRead, DataArrayWrite} {
+ d_deallocateTBE;
+ a_allocate;
+ w_writeCache;
+ s_storeDone;
+ pr_popResponseQueue;
+ ic_invCache;
+ }
+
+ transition(V, TCC_Ack, V) {TagArrayRead, DataArrayRead, DataArrayWrite} {
+ w_writeCache;
+ l_loadDone;
+ pr_popResponseQueue;
+ }
+
+ transition({W, M}, TCC_Ack, M) {TagArrayRead, TagArrayWrite, DataArrayRead, DataArrayWrite} {
+ w_writeCache;
+ l_loadDone;
+ pr_popResponseQueue;
+ }
+
+ transition({I, V}, Repl, I) {TagArrayRead, TagArrayWrite} {
+ ic_invCache;
+ }
+
+ transition({A}, Repl) {TagArrayRead, TagArrayWrite} {
+ ic_invCache;
+ }
+
+ // Dirty victims are written through before the entry is dropped.
+ transition({W, M}, Repl, I) {TagArrayRead, TagArrayWrite, DataArrayRead} {
+ wt_writeThrough;
+ ic_invCache;
+ }
+
+ transition(L, Repl, I) {TagArrayRead, TagArrayWrite, DataArrayRead} {
+ wt_writeThrough;
+ ic_invCache;
+ }
+
+ // Flush (store-release): dirty lines are written through and invalidated.
+ transition({W, M}, Flush, I) {TagArrayRead, TagArrayWrite, DataArrayRead} {
+ sf_setFlush;
+ wt_writeThrough;
+ ic_invCache;
+ p_popMandatoryQueue;
+ }
+
+ // Clean/transient lines have nothing to write back; just run the flush
+ // completion bookkeeping.
+ transition({V, I, A, L},Flush) {TagArrayFlash} {
+ sf_setFlush;
+ wb_wbDone;
+ p_popMandatoryQueue;
+ }
+
+ // Evict (load-acquire): invalidate clean lines so later loads re-fetch.
+ transition({I, V}, Evict, I) {TagArrayFlash} {
+ inv_invDone;
+ p_popMandatoryQueue;
+ ic_invCache;
+ }
+
+ // Dirty lines survive an Evict (downgraded to W, dirty bytes retained).
+ transition({W, M}, Evict, W) {TagArrayFlash} {
+ inv_invDone;
+ p_popMandatoryQueue;
+ }
+
+ transition({A, L}, Evict) {TagArrayFlash} {
+ inv_invDone;
+ p_popMandatoryQueue;
+ }
+
+ // TCC_AckWB only snoops TBE
+ transition({V, I, A, M, W, L}, TCC_AckWB) {
+ wd_wtDone;
+ wb_wbDone;
+ pr_popResponseQueue;
+ }
+}
diff --git a/src/mem/protocol/GPU_VIPER.slicc b/src/mem/protocol/GPU_VIPER.slicc
new file mode 100644
index 000000000..45f7f3477
--- /dev/null
+++ b/src/mem/protocol/GPU_VIPER.slicc
@@ -0,0 +1,9 @@
+protocol "GPU_VIPER";
+include "RubySlicc_interfaces.slicc";
+include "MOESI_AMD_Base-msg.sm";
+include "MOESI_AMD_Base-dir.sm";
+include "MOESI_AMD_Base-CorePair.sm";
+include "GPU_VIPER-TCP.sm";
+include "GPU_VIPER-SQC.sm";
+include "GPU_VIPER-TCC.sm";
+include "MOESI_AMD_Base-L3cache.sm";
diff --git a/src/mem/protocol/GPU_VIPER_Baseline.slicc b/src/mem/protocol/GPU_VIPER_Baseline.slicc
new file mode 100644
index 000000000..49bdce38c
--- /dev/null
+++ b/src/mem/protocol/GPU_VIPER_Baseline.slicc
@@ -0,0 +1,9 @@
+protocol "GPU_VIPER";
+include "RubySlicc_interfaces.slicc";
+include "MOESI_AMD_Base-msg.sm";
+include "MOESI_AMD_Base-probeFilter.sm";
+include "MOESI_AMD_Base-CorePair.sm";
+include "GPU_VIPER-TCP.sm";
+include "GPU_VIPER-SQC.sm";
+include "GPU_VIPER-TCC.sm";
+include "MOESI_AMD_Base-L3cache.sm";
diff --git a/src/mem/protocol/GPU_VIPER_Region-TCC.sm b/src/mem/protocol/GPU_VIPER_Region-TCC.sm
new file mode 100644
index 000000000..c3aef15a3
--- /dev/null
+++ b/src/mem/protocol/GPU_VIPER_Region-TCC.sm
@@ -0,0 +1,773 @@
+/*
+ * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor, Blake Hechtman
+ */
+
+/*
+ * This file is inherited from GPU_VIPER-TCC.sm and retains its structure.
+ * There are very few modifications in this file from the original VIPER TCC
+ */
+
+machine(MachineType:TCC, "TCC Cache")
+ : CacheMemory * L2cache;
+ bool WB; /*is this cache Writeback?*/
+ int regionBufferNum;
+ Cycles l2_request_latency := 50;
+ Cycles l2_response_latency := 20;
+
+ // From the TCPs or SQCs
+ MessageBuffer * requestFromTCP, network="From", virtual_network="1", ordered="true", vnet_type="request";
+ // To the Cores. TCC deals only with TCPs/SQCs. CP cores do not communicate directly with TCC.
+ MessageBuffer * responseToCore, network="To", virtual_network="3", ordered="true", vnet_type="response";
+ // From the NB
+ MessageBuffer * probeFromNB, network="From", virtual_network="0", ordered="false", vnet_type="request";
+ MessageBuffer * responseFromNB, network="From", virtual_network="2", ordered="false", vnet_type="response";
+ // To the NB
+ MessageBuffer * requestToNB, network="To", virtual_network="0", ordered="false", vnet_type="request";
+ MessageBuffer * responseToNB, network="To", virtual_network="2", ordered="false", vnet_type="response";
+ MessageBuffer * unblockToNB, network="To", virtual_network="4", ordered="false", vnet_type="unblock";
+
+ MessageBuffer * triggerQueue, ordered="true", random="false";
+{
+ // EVENTS
+ enumeration(Event, desc="TCC Events") {
+ // Requests coming from the Cores
+ RdBlk, desc="RdBlk event";
+ WrVicBlk, desc="L1 Write Through";
+ WrVicBlkBack, desc="L1 Write Back(dirty cache)";
+ Atomic, desc="Atomic Op";
+ AtomicDone, desc="AtomicOps Complete";
+ AtomicNotDone, desc="AtomicOps not Complete";
+ Data, desc="data messgae";
+ // Coming from this TCC
+ L2_Repl, desc="L2 Replacement";
+ // Probes
+ PrbInv, desc="Invalidating probe";
+ // Coming from Memory Controller
+ WBAck, desc="writethrough ack from memory";
+ }
+
+ // STATES
+ state_declaration(State, desc="TCC State", default="TCC_State_I") {
+ M, AccessPermission:Read_Write, desc="Modified(dirty cache only)";
+ W, AccessPermission:Read_Write, desc="Written(dirty cache only)";
+ V, AccessPermission:Read_Only, desc="Valid";
+ I, AccessPermission:Invalid, desc="Invalid";
+ IV, AccessPermission:Busy, desc="Waiting for Data";
+ WI, AccessPermission:Busy, desc="Waiting on Writethrough Ack";
+ A, AccessPermission:Busy, desc="Invalid waiting on atomic Data";
+ }
+
+ // Stats categories reported back through recordRequestType.
+ // NOTE(review): the desc strings on TagArrayRead/TagArrayWrite say
+ // "data array" — looks like a copy-paste slip; they describe the TAG array.
+ enumeration(RequestType, desc="To communicate stats from transitions to recordStats") {
+ DataArrayRead, desc="Read the data array";
+ DataArrayWrite, desc="Write the data array";
+ TagArrayRead, desc="Read the data array";
+ TagArrayWrite, desc="Write the data array";
+ }
+
+
+ // STRUCTURES
+
+ structure(Entry, desc="...", interface="AbstractCacheEntry") {
+ State CacheState, desc="cache state";
+ bool Dirty, desc="Is the data dirty (diff from memory?)";
+ DataBlock DataBlk, desc="Data for the block";
+ WriteMask writeMask, desc="Dirty byte mask";
+ }
+
+ // Transaction Buffer Entry: tracks an in-flight miss, write-back, or
+ // atomic sequence for one line.
+ structure(TBE, desc="...") {
+ State TBEState, desc="Transient state";
+ DataBlock DataBlk, desc="data for the block";
+ bool Dirty, desc="Is the data dirty?";
+ bool Shared, desc="Victim hit by shared probe";
+ MachineID From, desc="Waiting for writeback from...";
+ NetDest Destination, desc="Data destination";
+ int numAtomics, desc="number remaining atomics";
+ }
+
+ structure(TBETable, external="yes") {
+ TBE lookup(Addr);
+ void allocate(Addr);
+ void deallocate(Addr);
+ bool isPresent(Addr);
+ }
+
+ TBETable TBEs, template="<TCC_TBE>", constructor="m_number_of_TBEs";
+
+ // Helpers provided by the SLICC runtime / generated controller.
+ void set_cache_entry(AbstractCacheEntry b);
+ void unset_cache_entry();
+ void set_tbe(TBE b);
+ void unset_tbe();
+ void wakeUpAllBuffers();
+ void wakeUpBuffers(Addr a);
+
+
+ // FUNCTION DEFINITIONS
+
+ Tick clockEdge();
+ Tick cyclesToTicks(Cycles c);
+
+ // Region-coherence peer: always the region buffer configured via
+ // regionBufferNum. NOTE(review): the 'mach' argument is ignored here —
+ // presumably kept for interface parity with other protocols; confirm.
+ MachineID getPeer(MachineID mach) {
+ return createMachineID(MachineType:RegionBuffer, intToID(regionBufferNum));
+ }
+
+ Entry getCacheEntry(Addr addr), return_by_pointer="yes" {
+ return static_cast(Entry, "pointer", L2cache.lookup(addr));
+ }
+
+ DataBlock getDataBlock(Addr addr), return_by_ref="yes" {
+ return getCacheEntry(addr).DataBlk;
+ }
+
+ // True if the line is resident or a way is free for it.
+ bool presentOrAvail(Addr addr) {
+ return L2cache.isTagPresent(addr) || L2cache.cacheAvail(addr);
+ }
+
+ State getState(TBE tbe, Entry cache_entry, Addr addr) {
+ if (is_valid(tbe)) {
+ return tbe.TBEState;
+ } else if (is_valid(cache_entry)) {
+ return cache_entry.CacheState;
+ }
+ return State:I;
+ }
+
+ void setState(TBE tbe, Entry cache_entry, Addr addr, State state) {
+ if (is_valid(tbe)) {
+ tbe.TBEState := state;
+ }
+
+ if (is_valid(cache_entry)) {
+ cache_entry.CacheState := state;
+ }
+ }
+
+ // Functional reads that miss the TBE fall through to memory.
+ void functionalRead(Addr addr, Packet *pkt) {
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ testAndRead(addr, tbe.DataBlk, pkt);
+ } else {
+ functionalMemoryRead(pkt);
+ }
+ }
+
+ // Functional writes update the in-flight TBE copy (if any) and memory.
+ int functionalWrite(Addr addr, Packet *pkt) {
+ int num_functional_writes := 0;
+
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ num_functional_writes := num_functional_writes +
+ testAndWrite(addr, tbe.DataBlk, pkt);
+ }
+
+ num_functional_writes := num_functional_writes +
+ functionalMemoryWrite(pkt);
+ return num_functional_writes;
+ }
+
+ AccessPermission getAccessPermission(Addr addr) {
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ return TCC_State_to_permission(tbe.TBEState);
+ }
+
+ Entry cache_entry := getCacheEntry(addr);
+ if(is_valid(cache_entry)) {
+ return TCC_State_to_permission(cache_entry.CacheState);
+ }
+
+ return AccessPermission:NotPresent;
+ }
+
+ void setAccessPermission(Entry cache_entry, Addr addr, State state) {
+ if (is_valid(cache_entry)) {
+ cache_entry.changePermission(TCC_State_to_permission(state));
+ }
+ }
+
+ void recordRequestType(RequestType request_type, Addr addr) {
+ if (request_type == RequestType:DataArrayRead) {
+ L2cache.recordRequestType(CacheRequestType:DataArrayRead,addr);
+ } else if (request_type == RequestType:DataArrayWrite) {
+ L2cache.recordRequestType(CacheRequestType:DataArrayWrite,addr);
+ } else if (request_type == RequestType:TagArrayRead) {
+ L2cache.recordRequestType(CacheRequestType:TagArrayRead,addr);
+ } else if (request_type == RequestType:TagArrayWrite) {
+ L2cache.recordRequestType(CacheRequestType:TagArrayWrite,addr);
+ }
+ }
+
+ bool checkResourceAvailable(RequestType request_type, Addr addr) {
+ if (request_type == RequestType:DataArrayRead) {
+ return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:DataArrayWrite) {
+ return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:TagArrayRead) {
+ return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else if (request_type == RequestType:TagArrayWrite) {
+ return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else {
+ error("Invalid RequestType type in checkResourceAvailable");
+ return true;
+ }
+ }
+
+
+ // ** OUT_PORTS **
+
+ // Three classes of ports
+ // Class 1: downward facing network links to NB
+ out_port(requestToNB_out, CPURequestMsg, requestToNB);
+ out_port(responseToNB_out, ResponseMsg, responseToNB);
+ out_port(unblockToNB_out, UnblockMsg, unblockToNB);
+
+ // Class 2: upward facing ports to GPU cores
+ out_port(responseToCore_out, ResponseMsg, responseToCore);
+
+ // Class 3: internal trigger queue, used to signal atomic completion
+ out_port(triggerQueue_out, TriggerMsg, triggerQueue);
+ //
+ // request queue going to NB
+ //
+
+
+// ** IN_PORTS **
+ // Internal trigger port: fires when dna_decrementNumAtomics enqueues an
+ // AtomicDone trigger; decides whether all outstanding atomics drained.
+ // Fix: message type was misspelled "TiggerMsg"; the peek below and the
+ // matching out_port both use TriggerMsg.
+ in_port(triggerQueue_in, TriggerMsg, triggerQueue) {
+ if (triggerQueue_in.isReady(clockEdge())) {
+ peek(triggerQueue_in, TriggerMsg) {
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ // Triggers are only generated while a TBE tracks the atomic;
+ // guard the numAtomics dereference below.
+ assert(is_valid(tbe));
+ if (tbe.numAtomics == 0) {
+ trigger(Event:AtomicDone, in_msg.addr, cache_entry, tbe);
+ } else {
+ trigger(Event:AtomicNotDone, in_msg.addr, cache_entry, tbe);
+ }
+ }
+ }
+ }
+
+
+
+ // Responses arriving from the NB/directory: data fills, and writeback
+ // acks. If a fill cannot be placed (set full), first evict a victim.
+ in_port(responseFromNB_in, ResponseMsg, responseFromNB) {
+ if (responseFromNB_in.isReady(clockEdge())) {
+ peek(responseFromNB_in, ResponseMsg, block_on="addr") {
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ if (in_msg.Type == CoherenceResponseType:NBSysResp) {
+ if(presentOrAvail(in_msg.addr)) {
+ trigger(Event:Data, in_msg.addr, cache_entry, tbe);
+ } else {
+ // No room for the fill: replace the victim chosen by the L2
+ // before this response can be consumed.
+ Addr victim := L2cache.cacheProbe(in_msg.addr);
+ trigger(Event:L2_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
+ }
+ } else if (in_msg.Type == CoherenceResponseType:NBSysWBAck) {
+ trigger(Event:WBAck, in_msg.addr, cache_entry, tbe);
+ } else {
+ error("Unexpected Response Message to Core");
+ }
+ }
+ }
+ }
+
+ // Finally handling incoming requests (from TCP) and probes (from NB).
+
+ // Probes from the NB: this TCC only ever sees invalidating probes, so
+ // every probe message maps to a single PrbInv event.
+ in_port(probeNetwork_in, NBProbeRequestMsg, probeFromNB) {
+ if (probeNetwork_in.isReady(clockEdge())) {
+ peek(probeNetwork_in, NBProbeRequestMsg) {
+ DPRINTF(RubySlicc, "%s\n", in_msg);
+ DPRINTF(RubySlicc, "machineID: %s\n", machineID);
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe);
+ }
+ }
+ }
+
+
+ // Requests from the TCPs (GPU L1s). WriteThrough handling depends on the
+ // WB flag: write-back mode allocates in the TCC (WrVicBlkBack), possibly
+ // evicting first; write-through mode passes the data on (WrVicBlk).
+ // NOTE(review): WB appears to be a machine config parameter declared
+ // above this chunk — confirm against the machine header.
+ in_port(coreRequestNetwork_in, CPURequestMsg, requestFromTCP, rank=0) {
+ if (coreRequestNetwork_in.isReady(clockEdge())) {
+ peek(coreRequestNetwork_in, CPURequestMsg) {
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ if (in_msg.Type == CoherenceRequestType:WriteThrough) {
+ if(WB) {
+ if(presentOrAvail(in_msg.addr)) {
+ trigger(Event:WrVicBlkBack, in_msg.addr, cache_entry, tbe);
+ } else {
+ Addr victim := L2cache.cacheProbe(in_msg.addr);
+ trigger(Event:L2_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
+ }
+ } else {
+ trigger(Event:WrVicBlk, in_msg.addr, cache_entry, tbe);
+ }
+ } else if (in_msg.Type == CoherenceRequestType:Atomic) {
+ trigger(Event:Atomic, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:RdBlk) {
+ trigger(Event:RdBlk, in_msg.addr, cache_entry, tbe);
+ } else {
+ DPRINTF(RubySlicc, "%s\n", in_msg);
+ error("Unexpected Response Message to Core");
+ }
+ }
+ }
+ }
+ // BEGIN ACTIONS
+
+ action(i_invL2, "i", desc="invalidate TCC cache block") {
+ if (is_valid(cache_entry)) {
+ L2cache.deallocate(address);
+ }
+ unset_cache_entry();
+ }
+
+ // Data available at TCC. Send the DATA to TCP
+ action(sd_sendData, "sd", desc="send Shared response") {
+ peek(coreRequestNetwork_in, CPURequestMsg) {
+ enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:TDSysResp;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(in_msg.Requestor);
+ out_msg.DataBlk := cache_entry.DataBlk;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ out_msg.Dirty := false;
+ out_msg.State := CoherenceState:Shared;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+ }
+
+
+ // Data was not available at TCC. So, TCC forwarded the request to
+ // directory and directory responded back with data. Now, forward the
+ // DATA to TCP and send the unblock ack back to directory.
+ action(sdr_sendDataResponse, "sdr", desc="send Shared response") {
+ enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:TDSysResp;
+ out_msg.Sender := machineID;
+ out_msg.Destination := tbe.Destination;
+ out_msg.DataBlk := cache_entry.DataBlk;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ out_msg.Dirty := false;
+ out_msg.State := CoherenceState:Shared;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ enqueue(unblockToNB_out, UnblockMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.MessageSize := MessageSizeType:Unblock_Control;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+
+
+ action(rd_requestData, "r", desc="Miss in L2, pass on") {
+ if(tbe.Destination.count()==1){
+ peek(coreRequestNetwork_in, CPURequestMsg) {
+ enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) {
+ out_msg.addr := address;
+ out_msg.Type := in_msg.Type;
+ out_msg.Requestor := machineID;
+ out_msg.Destination.add(getPeer(machineID));
+ out_msg.Shared := false; // unneeded for this request
+ out_msg.MessageSize := in_msg.MessageSize;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+ }
+ }
+
+ // Relay a writeback ack from the NB up to the original write-through
+ // requestor (carried in WTRequestor).
+ action(w_sendResponseWBAck, "w", desc="send WB Ack") {
+ peek(responseFromNB_in, ResponseMsg) {
+ enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:TDSysWBAck;
+ out_msg.Destination.clear();
+ out_msg.Destination.add(in_msg.WTRequestor);
+ out_msg.Sender := machineID;
+ out_msg.MessageSize := MessageSizeType:Writeback_Control;
+ }
+ }
+ }
+
+ // Ack a writeback that was absorbed locally by the TCC (WB mode); the
+ // requestor comes straight off the core request queue.
+ action(swb_sendWBAck, "swb", desc="send WB Ack") {
+ peek(coreRequestNetwork_in, CPURequestMsg) {
+ enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:TDSysWBAck;
+ out_msg.Destination.clear();
+ out_msg.Destination.add(in_msg.Requestor);
+ out_msg.Sender := machineID;
+ out_msg.MessageSize := MessageSizeType:Writeback_Control;
+ }
+ }
+ }
+
+ // Forward the atomic's result data from the NB to the issuing TCP.
+ action(ar_sendAtomicResponse, "ar", desc="send Atomic Ack") {
+ peek(responseFromNB_in, ResponseMsg) {
+ enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:TDSysResp;
+ out_msg.Destination.add(in_msg.WTRequestor);
+ out_msg.Sender := machineID;
+ out_msg.MessageSize := in_msg.MessageSize;
+ out_msg.DataBlk := in_msg.DataBlk;
+ }
+ }
+ }
+ // Tell the region buffer the transaction finished (DoneAck), including
+ // whether the block is dirty if a TBE still tracks it.
+ action(sd2rb_sendDone2RegionBuffer, "sd2rb", desc="Request finished, send done ack") {
+ enqueue(unblockToNB_out, UnblockMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Destination.add(getPeer(machineID));
+ out_msg.DoneAck := true;
+ out_msg.MessageSize := MessageSizeType:Unblock_Control;
+ if (is_valid(tbe)) {
+ out_msg.Dirty := tbe.Dirty;
+ } else {
+ out_msg.Dirty := false;
+ }
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+
+ // Allocate a TCC entry for this address (idempotent) with a clean
+ // write mask.
+ action(a_allocateBlock, "a", desc="allocate TCC block") {
+ if (is_invalid(cache_entry)) {
+ set_cache_entry(L2cache.allocate(address, new Entry));
+ cache_entry.writeMask.clear();
+ }
+ }
+
+ // Allocate a TBE (idempotent) and, if a core request is at the head of
+ // the queue, record its requestor so the eventual fill/atomic response
+ // can be routed back.
+ action(t_allocateTBE, "t", desc="allocate TBE Entry") {
+ if (is_invalid(tbe)) {
+ check_allocate(TBEs);
+ TBEs.allocate(address);
+ set_tbe(TBEs.lookup(address));
+ tbe.Destination.clear();
+ tbe.numAtomics := 0;
+ }
+ if (coreRequestNetwork_in.isReady(clockEdge())) {
+ peek(coreRequestNetwork_in, CPURequestMsg) {
+ if(in_msg.Type == CoherenceRequestType:RdBlk || in_msg.Type == CoherenceRequestType:Atomic){
+ tbe.Destination.add(in_msg.Requestor);
+ }
+ }
+ }
+ }
+
+ // Release the TBE once the transaction completes.
+ action(dt_deallocateTBE, "dt", desc="Deallocate TBE entry") {
+ tbe.Destination.clear();
+ TBEs.deallocate(address);
+ unset_tbe();
+ }
+
+ // Install a full data block received from the NB into the TCC entry.
+ action(wcb_writeCacheBlock, "wcb", desc="write data to TCC") {
+ peek(responseFromNB_in, ResponseMsg) {
+ cache_entry.DataBlk := in_msg.DataBlk;
+ DPRINTF(RubySlicc, "Writing to TCC: %s\n", in_msg);
+ }
+ }
+
+ // Merge only the bytes the TCP actually wrote (per its write mask) into
+ // the TCC entry, accumulating the mask for a later writeback.
+ action(wdb_writeDirtyBytes, "wdb", desc="write data to TCC") {
+ peek(coreRequestNetwork_in, CPURequestMsg) {
+ cache_entry.DataBlk.copyPartial(in_msg.DataBlk,in_msg.writeMask);
+ cache_entry.writeMask.orMask(in_msg.writeMask);
+ DPRINTF(RubySlicc, "Writing to TCC: %s\n", in_msg);
+ }
+ }
+
+ // Pass a TCP write-through straight down to the NB, preserving the
+ // original requestor in WTRequestor so the WBAck can be routed back.
+ action(wt_writeThrough, "wt", desc="write through data") {
+ peek(coreRequestNetwork_in, CPURequestMsg) {
+ enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) {
+ out_msg.addr := address;
+ out_msg.Requestor := machineID;
+ out_msg.WTRequestor := in_msg.Requestor;
+ out_msg.Destination.add(getPeer(machineID));
+ out_msg.MessageSize := MessageSizeType:Data;
+ out_msg.Type := CoherenceRequestType:WriteThrough;
+ out_msg.Dirty := true;
+ out_msg.DataBlk := in_msg.DataBlk;
+ out_msg.writeMask.orMask(in_msg.writeMask);
+ }
+ }
+ }
+
+ // Write the TCC's own dirty bytes back to the NB (used on eviction /
+ // downgrade); WTRequestor is this TCC, so the WBAck returns here.
+ action(wb_writeBack, "wb", desc="write back data") {
+ enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) {
+ out_msg.addr := address;
+ out_msg.Requestor := machineID;
+ out_msg.WTRequestor := machineID;
+ out_msg.Destination.add(getPeer(machineID));
+ out_msg.MessageSize := MessageSizeType:Data;
+ out_msg.Type := CoherenceRequestType:WriteThrough;
+ out_msg.Dirty := true;
+ out_msg.DataBlk := cache_entry.DataBlk;
+ out_msg.writeMask.orMask(cache_entry.writeMask);
+ }
+ }
+
+ // Forward an atomic operation past the TCC to the NB/directory; the TCC
+ // does not execute atomics locally. The issuing TCP is preserved in
+ // WTRequestor so ar_sendAtomicResponse can route the result back.
+ // Fix: desc was "write back data", copy-pasted from wb_writeBack; that
+ // text appears in generated debug/profiling output and was misleading.
+ action(at_atomicThrough, "at", desc="send atomic through to NB") {
+ peek(coreRequestNetwork_in, CPURequestMsg) {
+ enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) {
+ out_msg.addr := address;
+ out_msg.Requestor := machineID;
+ out_msg.WTRequestor := in_msg.Requestor;
+ out_msg.Destination.add(getPeer(machineID));
+ out_msg.MessageSize := MessageSizeType:Data;
+ out_msg.Type := CoherenceRequestType:Atomic;
+ out_msg.Dirty := true;
+ out_msg.writeMask.orMask(in_msg.writeMask);
+ }
+ }
+ }
+
+ // Respond to an invalidating probe with a dataless ack (Ntsl: not
+ // present at this level).
+ action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") {
+ enqueue(responseToNB_out, ResponseMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp; // TCC, L3 respond in same way to probes
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.Dirty := false;
+ out_msg.Hit := false;
+ out_msg.Ntsl := true;
+ out_msg.State := CoherenceState:NA;
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ }
+ }
+ action(ut_updateTag, "ut", desc="update Tag (i.e. set MRU)") {
+ L2cache.setMRU(address);
+ }
+
+ action(p_popRequestQueue, "p", desc="pop request queue") {
+ coreRequestNetwork_in.dequeue(clockEdge());
+ }
+
+ action(pr_popResponseQueue, "pr", desc="pop response queue") {
+ responseFromNB_in.dequeue(clockEdge());
+ }
+
+ action(pp_popProbeQueue, "pp", desc="pop probe queue") {
+ probeNetwork_in.dequeue(clockEdge());
+ }
+ // Leave the request at the head of the queue and retry it later
+ // (used to stall in transient states).
+ action(zz_recycleRequestQueue, "z", desc="stall"){
+ coreRequestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+ }
+
+
+ action(ina_incrementNumAtomics, "ina", desc="inc num atomics") {
+ tbe.numAtomics := tbe.numAtomics + 1;
+ }
+
+
+ // Decrement the outstanding-atomics count; when it reaches zero, post an
+ // AtomicDone trigger so triggerQueue_in can retire the A state.
+ action(dna_decrementNumAtomics, "dna", desc="dec num atomics") {
+ tbe.numAtomics := tbe.numAtomics - 1;
+ if (tbe.numAtomics==0) {
+ enqueue(triggerQueue_out, TriggerMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := TriggerType:AtomicDone;
+ }
+ }
+ }
+
+ action(ptr_popTriggerQueue, "ptr", desc="pop Trigger") {
+ triggerQueue_in.dequeue(clockEdge());
+ }
+
+ // END ACTIONS
+
+ // BEGIN TRANSITIONS
+ // transitions from base
+ // Assumptions for ArrayRead/Write
+ // TBE checked before tags
+ // Data Read/Write requires Tag Read
+
+ // Transient states stall incoming core requests by recycling them.
+ transition(WI, {RdBlk, WrVicBlk, Atomic, WrVicBlkBack}) {TagArrayRead} {
+ zz_recycleRequestQueue;
+ }
+ transition(A, {RdBlk, WrVicBlk, WrVicBlkBack}) {TagArrayRead} {
+ zz_recycleRequestQueue;
+ }
+ transition(IV, {WrVicBlk, Atomic, WrVicBlkBack}) {TagArrayRead} {
+ zz_recycleRequestQueue;
+ }
+ transition({M, V}, RdBlk) {TagArrayRead, DataArrayRead} {
+ sd_sendData;
+ ut_updateTag;
+ p_popRequestQueue;
+ }
+ // NOTE(review): no p_popRequestQueue here — the RdBlk stays queued and
+ // is recycled in WI until the writeback completes; confirm intended.
+ transition(W, RdBlk, WI) {TagArrayRead, DataArrayRead} {
+ t_allocateTBE;
+ wb_writeBack;
+ }
+
+ transition(I, RdBlk, IV) {TagArrayRead} {
+ t_allocateTBE;
+ rd_requestData;
+ p_popRequestQueue;
+ }
+
+ // Additional readers while a fill is pending merge into the TBE.
+ transition(IV, RdBlk) {
+ t_allocateTBE;
+ rd_requestData;
+ p_popRequestQueue;
+ }
+
+ transition({V, I},Atomic, A) {TagArrayRead} {
+ i_invL2;
+ t_allocateTBE;
+ at_atomicThrough;
+ ina_incrementNumAtomics;
+ p_popRequestQueue;
+ }
+
+ transition(A, Atomic) {
+ at_atomicThrough;
+ ina_incrementNumAtomics;
+ p_popRequestQueue;
+ }
+
+ // NOTE(review): request not popped here either — it is recycled in WI
+ // and re-triggered as an Atomic once the writeback ack arrives.
+ transition({M, W}, Atomic, WI) {TagArrayRead} {
+ t_allocateTBE;
+ wb_writeBack;
+ }
+
+ // Cache block stays in I state which implies
+ // this TCC is a write-no-allocate cache
+ transition(I, WrVicBlk) {TagArrayRead} {
+ wt_writeThrough;
+ p_popRequestQueue;
+ }
+
+ transition(V, WrVicBlk) {TagArrayRead, DataArrayWrite} {
+ ut_updateTag;
+ wdb_writeDirtyBytes;
+ wt_writeThrough;
+ p_popRequestQueue;
+ }
+
+ // WB mode: absorb the victim block locally and ack immediately.
+ transition({V, M}, WrVicBlkBack, M) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+ ut_updateTag;
+ swb_sendWBAck;
+ wdb_writeDirtyBytes;
+ p_popRequestQueue;
+ }
+
+ transition(W, WrVicBlkBack) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+ ut_updateTag;
+ swb_sendWBAck;
+ wdb_writeDirtyBytes;
+ p_popRequestQueue;
+ }
+
+ transition(I, WrVicBlkBack, W) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+ a_allocateBlock;
+ ut_updateTag;
+ swb_sendWBAck;
+ wdb_writeDirtyBytes;
+ p_popRequestQueue;
+ }
+
+ // Replacement of a dirty block: write back first, then invalidate.
+ transition({W, M}, L2_Repl, WI) {TagArrayRead, DataArrayRead} {
+ t_allocateTBE;
+ wb_writeBack;
+ i_invL2;
+ }
+
+ transition({I, V}, L2_Repl, I) {TagArrayRead, TagArrayWrite} {
+ i_invL2;
+ }
+
+ transition({A, IV, WI}, L2_Repl) {
+ i_invL2;
+ }
+
+ transition({I, V}, PrbInv, I) {TagArrayRead, TagArrayWrite} {
+ pi_sendProbeResponseInv;
+ pp_popProbeQueue;
+ }
+
+ transition(M, PrbInv, W) {TagArrayRead, TagArrayWrite} {
+ pi_sendProbeResponseInv;
+ pp_popProbeQueue;
+ }
+
+ transition(W, PrbInv) {TagArrayRead} {
+ pi_sendProbeResponseInv;
+ pp_popProbeQueue;
+ }
+
+ transition({A, IV, WI}, PrbInv) {
+ pi_sendProbeResponseInv;
+ pp_popProbeQueue;
+ }
+
+ // Fill arrived: install the block, respond to all waiting TCPs, and
+ // notify the region buffer.
+ transition(IV, Data, V) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+ a_allocateBlock;
+ ut_updateTag;
+ wcb_writeCacheBlock;
+ sdr_sendDataResponse;
+ sd2rb_sendDone2RegionBuffer;
+ pr_popResponseQueue;
+ dt_deallocateTBE;
+ }
+
+ transition(A, Data) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+ a_allocateBlock;
+ ar_sendAtomicResponse;
+ sd2rb_sendDone2RegionBuffer;
+ dna_decrementNumAtomics;
+ pr_popResponseQueue;
+ }
+
+ transition(A, AtomicDone, I) {TagArrayRead, TagArrayWrite} {
+ dt_deallocateTBE;
+ ptr_popTriggerQueue;
+ }
+
+ transition(A, AtomicNotDone) {TagArrayRead} {
+ ptr_popTriggerQueue;
+ }
+
+ //M,W should not see WBAck as the cache is in WB mode
+ //WBAcks do not need to check tags
+ transition({I, V, IV, A}, WBAck) {
+ w_sendResponseWBAck;
+ sd2rb_sendDone2RegionBuffer;
+ pr_popResponseQueue;
+ }
+
+ transition(WI, WBAck,I) {
+ sd2rb_sendDone2RegionBuffer;
+ dt_deallocateTBE;
+ pr_popResponseQueue;
+ }
+}
diff --git a/src/mem/protocol/GPU_VIPER_Region.slicc b/src/mem/protocol/GPU_VIPER_Region.slicc
new file mode 100644
index 000000000..cbfef9de3
--- /dev/null
+++ b/src/mem/protocol/GPU_VIPER_Region.slicc
@@ -0,0 +1,11 @@
+protocol "GPU_VIPER_Region";
+include "RubySlicc_interfaces.slicc";
+include "MOESI_AMD_Base-msg.sm";
+include "MOESI_AMD_Base-Region-CorePair.sm";
+include "MOESI_AMD_Base-L3cache.sm";
+include "MOESI_AMD_Base-Region-dir.sm";
+include "GPU_VIPER_Region-TCC.sm";
+include "GPU_VIPER-TCP.sm";
+include "GPU_VIPER-SQC.sm";
+include "MOESI_AMD_Base-RegionDir.sm";
+include "MOESI_AMD_Base-RegionBuffer.sm";
diff --git a/src/mem/protocol/MOESI_AMD_Base-CorePair.sm b/src/mem/protocol/MOESI_AMD_Base-CorePair.sm
new file mode 100644
index 000000000..76fe77230
--- /dev/null
+++ b/src/mem/protocol/MOESI_AMD_Base-CorePair.sm
@@ -0,0 +1,2904 @@
+/*
+ * Copyright (c) 2010-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Lisa Hsu
+ */
+
+machine(MachineType:CorePair, "CP-like Core Coherence")
+ : Sequencer * sequencer;
+ Sequencer * sequencer1;
+ CacheMemory * L1Icache;
+ CacheMemory * L1D0cache;
+ CacheMemory * L1D1cache;
+ CacheMemory * L2cache; // func mem logic looks in this CacheMemory
+ bool send_evictions := "False";
+ Cycles issue_latency := 5; // time to send data down to NB
+ Cycles l2_hit_latency := 18;
+
+ // BEGIN Core Buffers
+
+ // To the Network
+ MessageBuffer * requestFromCore, network="To", virtual_network="0", vnet_type="request";
+ MessageBuffer * responseFromCore, network="To", virtual_network="2", vnet_type="response";
+ MessageBuffer * unblockFromCore, network="To", virtual_network="4", vnet_type="unblock";
+
+ // From the Network
+ MessageBuffer * probeToCore, network="From", virtual_network="0", vnet_type="request";
+ MessageBuffer * responseToCore, network="From", virtual_network="2", vnet_type="response";
+
+ MessageBuffer * mandatoryQueue;
+
+ MessageBuffer * triggerQueue, ordered="true";
+
+ // END Core Buffers
+
+{
+ // BEGIN STATES
+ state_declaration(State, desc="Cache states", default="CorePair_State_I") {
+
+ // Base States
+ I, AccessPermission:Invalid, desc="Invalid";
+ S, AccessPermission:Read_Only, desc="Shared";
+ E0, AccessPermission:Read_Write, desc="Exclusive with Cluster 0 ownership";
+ E1, AccessPermission:Read_Write, desc="Exclusive with Cluster 1 ownership";
+ Es, AccessPermission:Read_Write, desc="Exclusive in core";
+ O, AccessPermission:Read_Only, desc="Owner state in core, both clusters and other cores may be sharing line";
+ Ms, AccessPermission:Read_Write, desc="Modified in core, both clusters may be sharing line";
+ M0, AccessPermission:Read_Write, desc="Modified with cluster ownership";
+ M1, AccessPermission:Read_Write, desc="Modified with cluster ownership";
+
+ // Transient States
+ I_M0, AccessPermission:Busy, desc="Invalid, issued RdBlkM, have not seen response yet";
+ I_M1, AccessPermission:Busy, desc="Invalid, issued RdBlkM, have not seen response yet";
+ I_M0M1, AccessPermission:Busy, desc="Was in I_M0, got a store request from other cluster as well";
+ I_M1M0, AccessPermission:Busy, desc="Was in I_M1, got a store request from other cluster as well";
+ I_M0Ms, AccessPermission:Busy, desc="Was in I_M0, got a load request from other cluster as well";
+ I_M1Ms, AccessPermission:Busy, desc="Was in I_M1, got a load request from other cluster as well";
+ I_E0S, AccessPermission:Busy, desc="Invalid, issued RdBlk, have not seen response yet";
+ I_E1S, AccessPermission:Busy, desc="Invalid, issued RdBlk, have not seen response yet";
+ I_ES, AccessPermission:Busy, desc="S_F got hit by invalidating probe, RdBlk response needs to go to both clusters";
+
+ IF_E0S, AccessPermission:Busy, desc="something got hit with Probe Invalidate, now just I_E0S but expecting a L2_to_L1D0 trigger, just drop when receive";
+ IF_E1S, AccessPermission:Busy, desc="something got hit with Probe Invalidate, now just I_E1S but expecting a L2_to_L1D1 trigger, just drop when receive";
+ IF_ES, AccessPermission:Busy, desc="same, but waiting for two fills";
+ IF0_ES, AccessPermission:Busy, desc="same, but waiting for two fills, got one";
+ IF1_ES, AccessPermission:Busy, desc="same, but waiting for two fills, got one";
+ F_S0, AccessPermission:Busy, desc="same, but going to S0 when trigger received";
+ F_S1, AccessPermission:Busy, desc="same, but going to S1 when trigger received";
+
+ ES_I, AccessPermission:Read_Only, desc="L2 replacement, waiting for clean writeback ack";
+ MO_I, AccessPermission:Read_Only, desc="L2 replacement, waiting for dirty writeback ack";
+ MO_S0, AccessPermission:Read_Only, desc="M/O got Ifetch Miss, must write back first, then send RdBlkS";
+ MO_S1, AccessPermission:Read_Only, desc="M/O got Ifetch Miss, must write back first, then send RdBlkS";
+ S_F0, AccessPermission:Read_Only, desc="Shared, filling L1";
+ S_F1, AccessPermission:Read_Only, desc="Shared, filling L1";
+ S_F, AccessPermission:Read_Only, desc="Shared, filling L1";
+ O_F0, AccessPermission:Read_Only, desc="Owned, filling L1";
+ O_F1, AccessPermission:Read_Only, desc="Owned, filling L1";
+ O_F, AccessPermission:Read_Only, desc="Owned, filling L1";
+ Si_F0, AccessPermission:Read_Only, desc="Shared, filling icache";
+ Si_F1, AccessPermission:Read_Only, desc="Shared, filling icache";
+ S_M0, AccessPermission:Read_Only, desc="Shared, issued CtoD, have not seen response yet";
+ S_M1, AccessPermission:Read_Only, desc="Shared, issued CtoD, have not seen response yet";
+ O_M0, AccessPermission:Read_Only, desc="Shared, issued CtoD, have not seen response yet";
+ O_M1, AccessPermission:Read_Only, desc="Shared, issued CtoD, have not seen response yet";
+ S0, AccessPermission:Busy, desc="RdBlkS on behalf of cluster 0, waiting for response";
+ S1, AccessPermission:Busy, desc="RdBlkS on behalf of cluster 1, waiting for response";
+
+ Es_F0, AccessPermission:Read_Write, desc="Es, Cluster read, filling";
+ Es_F1, AccessPermission:Read_Write, desc="Es, Cluster read, filling";
+ Es_F, AccessPermission:Read_Write, desc="Es, other cluster read, filling";
+ E0_F, AccessPermission:Read_Write, desc="E0, cluster read, filling";
+ E1_F, AccessPermission:Read_Write, desc="...";
+ E0_Es, AccessPermission:Read_Write, desc="...";
+ E1_Es, AccessPermission:Read_Write, desc="...";
+ Ms_F0, AccessPermission:Read_Write, desc="...";
+ Ms_F1, AccessPermission:Read_Write, desc="...";
+ Ms_F, AccessPermission:Read_Write, desc="...";
+ M0_F, AccessPermission:Read_Write, desc="...";
+ M0_Ms, AccessPermission:Read_Write, desc="...";
+ M1_F, AccessPermission:Read_Write, desc="...";
+ M1_Ms, AccessPermission:Read_Write, desc="...";
+
+ I_C, AccessPermission:Invalid, desc="Invalid, but waiting for WBAck from NB from canceled writeback";
+ S0_C, AccessPermission:Busy, desc="MO_S0 hit by invalidating probe, waiting for WBAck form NB for canceled WB";
+ S1_C, AccessPermission:Busy, desc="MO_S1 hit by invalidating probe, waiting for WBAck form NB for canceled WB";
+ S_C, AccessPermission:Busy, desc="S*_C got NB_AckS, still waiting for WBAck";
+
+ } // END STATES
+
+ // BEGIN EVENTS
+ enumeration(Event, desc="CP Events") {
+ // CP Initiated events
+ C0_Load_L1miss, desc="Cluster 0 load, L1 missed";
+ C0_Load_L1hit, desc="Cluster 0 load, L1 hit";
+ C1_Load_L1miss, desc="Cluster 1 load L1 missed";
+ C1_Load_L1hit, desc="Cluster 1 load L1 hit";
+ Ifetch0_L1hit, desc="Instruction fetch, hit in the L1";
+ Ifetch1_L1hit, desc="Instruction fetch, hit in the L1";
+ Ifetch0_L1miss, desc="Instruction fetch, missed in the L1";
+ Ifetch1_L1miss, desc="Instruction fetch, missed in the L1";
+ C0_Store_L1miss, desc="Cluster 0 store missed in L1";
+ C0_Store_L1hit, desc="Cluster 0 store hit in L1";
+ C1_Store_L1miss, desc="Cluster 1 store missed in L1";
+ C1_Store_L1hit, desc="Cluster 1 store hit in L1";
+ // NB Initiated events
+ NB_AckS, desc="NB Ack to Core Request";
+ NB_AckM, desc="NB Ack to Core Request";
+ NB_AckE, desc="NB Ack to Core Request";
+
+ NB_AckWB, desc="NB Ack for writeback";
+
+ // Memory System initiatied events
+ L1I_Repl, desc="Replace address from L1I"; // Presumed clean
+ L1D0_Repl, desc="Replace address from L1D0"; // Presumed clean
+ L1D1_Repl, desc="Replace address from L1D1"; // Presumed clean
+ L2_Repl, desc="Replace address from L2";
+
+ L2_to_L1D0, desc="L1 fill from L2";
+ L2_to_L1D1, desc="L1 fill from L2";
+ L2_to_L1I, desc="L1 fill from L2";
+
+ // Probe Events
+ PrbInvData, desc="probe, return O or M data";
+ PrbInv, desc="probe, no need for data";
+ PrbShrData, desc="probe downgrade, return O or M data";
+
+ } // END EVENTS
+
+ // Each enumerator names the cache array a transition touches so that
+ // recordRequestType/checkResourceAvailable can charge the right resource.
+ // Fix: every *TagArray* enumerator was described as "the data array"
+ // (copy-paste); the descs now match the arrays the names refer to.
+ enumeration(RequestType, desc="To communicate stats from transitions to recordStats") {
+ L1D0DataArrayRead, desc="Read the data array";
+ L1D0DataArrayWrite, desc="Write the data array";
+ L1D0TagArrayRead, desc="Read the tag array";
+ L1D0TagArrayWrite, desc="Write the tag array";
+ L1D1DataArrayRead, desc="Read the data array";
+ L1D1DataArrayWrite, desc="Write the data array";
+ L1D1TagArrayRead, desc="Read the tag array";
+ L1D1TagArrayWrite, desc="Write the tag array";
+ L1IDataArrayRead, desc="Read the data array";
+ L1IDataArrayWrite, desc="Write the data array";
+ L1ITagArrayRead, desc="Read the tag array";
+ L1ITagArrayWrite, desc="Write the tag array";
+ L2DataArrayRead, desc="Read the data array";
+ L2DataArrayWrite, desc="Write the data array";
+ L2TagArrayRead, desc="Read the tag array";
+ L2TagArrayWrite, desc="Write the tag array";
+ }
+
+
+ // BEGIN STRUCTURE DEFINITIONS
+
+
+ // Cache Entry
+ structure(Entry, desc="...", interface="AbstractCacheEntry") {
+ State CacheState, desc="cache state";
+ bool Dirty, desc="Is the data dirty (diff than memory)?";
+ DataBlock DataBlk, desc="data for the block";
+ bool FromL2, default="false", desc="block just moved from L2";
+ }
+
+ // Per-transaction bookkeeping for outstanding requests/writebacks.
+ structure(TBE, desc="...") {
+ State TBEState, desc="Transient state";
+ DataBlock DataBlk, desc="data for the block, required for concurrent writebacks";
+ bool Dirty, desc="Is the data dirty (different than memory)?";
+ int NumPendingMsgs, desc="Number of acks/data messages that this processor is waiting for";
+ bool Shared, desc="Victim hit by shared probe";
+ }
+
+ structure(TBETable, external="yes") {
+ TBE lookup(Addr);
+ void allocate(Addr);
+ void deallocate(Addr);
+ bool isPresent(Addr);
+ }
+
+ TBETable TBEs, template="<CorePair_TBE>", constructor="m_number_of_TBEs";
+
+ // Prototypes provided by the SLICC-generated controller base class.
+ void set_cache_entry(AbstractCacheEntry b);
+ void unset_cache_entry();
+ void set_tbe(TBE b);
+ void unset_tbe();
+ void wakeUpAllBuffers();
+ void wakeUpBuffers(Addr a);
+ Cycles curCycle();
+
+ // END STRUCTURE DEFINITIONS
+
+ // BEGIN INTERNAL FUNCTIONS
+
+ Tick clockEdge();
+ Tick cyclesToTicks(Cycles c);
+
+ // True if any cache in this core pair (L2 or either L1D / the L1I)
+ // currently holds a tag for the address.
+ bool addressInCore(Addr addr) {
+ return (L2cache.isTagPresent(addr) || L1Icache.isTagPresent(addr) || L1D0cache.isTagPresent(addr) || L1D1cache.isTagPresent(addr));
+ }
+
+ // Canonical entry lookup for this controller: the L2 is the structure
+ // functional-memory logic inspects (see CacheMemory note in the params).
+ Entry getCacheEntry(Addr address), return_by_pointer="yes" {
+ Entry L2cache_entry := static_cast(Entry, "pointer", L2cache.lookup(address));
+ return L2cache_entry;
+ }
+
+ // Return the freshest data for the block: an in-flight TBE copy wins
+ // over the L2 entry.
+ DataBlock getDataBlock(Addr addr), return_by_ref="yes" {
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ return tbe.DataBlk;
+ } else {
+ return getCacheEntry(addr).DataBlk;
+ }
+ }
+
+ // Look up the L1D entry for the given cluster (0 or 1).
+ Entry getL1CacheEntry(Addr addr, int cluster), return_by_pointer="yes" {
+ if (cluster == 0) {
+ Entry L1D0_entry := static_cast(Entry, "pointer", L1D0cache.lookup(addr));
+ return L1D0_entry;
+ } else {
+ Entry L1D1_entry := static_cast(Entry, "pointer", L1D1cache.lookup(addr));
+ return L1D1_entry;
+ }
+ }
+
+ // Look up the shared L1 instruction-cache entry.
+ Entry getICacheEntry(Addr addr), return_by_pointer="yes" {
+ Entry c_entry := static_cast(Entry, "pointer", L1Icache.lookup(addr));
+ return c_entry;
+ }
+
+ // presentOrAvail*: true when the given cache either already holds the
+ // tag or has a free way for it (i.e. a fill would not need an eviction).
+ bool presentOrAvail2(Addr addr) {
+ return L2cache.isTagPresent(addr) || L2cache.cacheAvail(addr);
+ }
+
+ bool presentOrAvailI(Addr addr) {
+ return L1Icache.isTagPresent(addr) || L1Icache.cacheAvail(addr);
+ }
+
+ bool presentOrAvailD0(Addr addr) {
+ return L1D0cache.isTagPresent(addr) || L1D0cache.cacheAvail(addr);
+ }
+
+ bool presentOrAvailD1(Addr addr) {
+ return L1D1cache.isTagPresent(addr) || L1D1cache.cacheAvail(addr);
+ }
+
+ // Current protocol state: TBE (transient) takes precedence over the
+ // cache entry; untracked blocks are Invalid.
+ State getState(TBE tbe, Entry cache_entry, Addr addr) {
+ if(is_valid(tbe)) {
+ return tbe.TBEState;
+ } else if (is_valid(cache_entry)) {
+ return cache_entry.CacheState;
+ }
+ return State:I;
+ }
+
+ // Record the new state in every structure tracking the block.
+ void setState(TBE tbe, Entry cache_entry, Addr addr, State state) {
+ if (is_valid(tbe)) {
+ tbe.TBEState := state;
+ }
+
+ if (is_valid(cache_entry)) {
+ cache_entry.CacheState := state;
+ }
+ }
+
+ // Permission derived from the block's current state; TBE wins over the
+ // cache entry, NotPresent when neither tracks the address.
+ AccessPermission getAccessPermission(Addr addr) {
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ return CorePair_State_to_permission(tbe.TBEState);
+ }
+
+ Entry cache_entry := getCacheEntry(addr);
+ if(is_valid(cache_entry)) {
+ return CorePair_State_to_permission(cache_entry.CacheState);
+ }
+
+ return AccessPermission:NotPresent;
+ }
+
+ // Functional (debug) read: prefer in-flight TBE data, else backing memory.
+ void functionalRead(Addr addr, Packet *pkt) {
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ testAndRead(addr, tbe.DataBlk, pkt);
+ } else {
+ functionalMemoryRead(pkt);
+ }
+ }
+
+ // Functional write: update any TBE copy plus backing memory; returns the
+ // number of locations written.
+ int functionalWrite(Addr addr, Packet *pkt) {
+ int num_functional_writes := 0;
+
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ num_functional_writes := num_functional_writes +
+ testAndWrite(addr, tbe.DataBlk, pkt);
+ }
+
+ num_functional_writes := num_functional_writes + functionalMemoryWrite(pkt);
+ return num_functional_writes;
+ }
+
+ // Push the permission implied by the new state into the cache entry.
+ void setAccessPermission(Entry cache_entry, Addr addr, State state) {
+ if (is_valid(cache_entry)) {
+ cache_entry.changePermission(CorePair_State_to_permission(state));
+ }
+ }
+
+ // Report whether this access was served from data just filled from the
+ // L2 (and clear that flag) or was a genuine L1 hit; used to attribute
+ // the hit to the correct machine level in stats.
+ MachineType testAndClearLocalHit(Entry cache_entry) {
+ assert(is_valid(cache_entry));
+ if (cache_entry.FromL2) {
+ cache_entry.FromL2 := false;
+ return MachineType:L2Cache;
+ } else {
+ return MachineType:L1Cache;
+ }
+ }
+
+ // Charge each array access to the specific cache (L1D0/L1D1/L1I/L2) and
+ // array (tag/data) named by the RequestType enumerator.
+ void recordRequestType(RequestType request_type, Addr addr) {
+ if (request_type == RequestType:L1D0DataArrayRead) {
+ L1D0cache.recordRequestType(CacheRequestType:DataArrayRead, addr);
+ } else if (request_type == RequestType:L1D0DataArrayWrite) {
+ L1D0cache.recordRequestType(CacheRequestType:DataArrayWrite, addr);
+ } else if (request_type == RequestType:L1D0TagArrayRead) {
+ L1D0cache.recordRequestType(CacheRequestType:TagArrayRead, addr);
+ } else if (request_type == RequestType:L1D0TagArrayWrite) {
+ L1D0cache.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+ } else if (request_type == RequestType:L1D1DataArrayRead) {
+ L1D1cache.recordRequestType(CacheRequestType:DataArrayRead, addr);
+ } else if (request_type == RequestType:L1D1DataArrayWrite) {
+ L1D1cache.recordRequestType(CacheRequestType:DataArrayWrite, addr);
+ } else if (request_type == RequestType:L1D1TagArrayRead) {
+ L1D1cache.recordRequestType(CacheRequestType:TagArrayRead, addr);
+ } else if (request_type == RequestType:L1D1TagArrayWrite) {
+ L1D1cache.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+ } else if (request_type == RequestType:L1IDataArrayRead) {
+ L1Icache.recordRequestType(CacheRequestType:DataArrayRead, addr);
+ } else if (request_type == RequestType:L1IDataArrayWrite) {
+ L1Icache.recordRequestType(CacheRequestType:DataArrayWrite, addr);
+ } else if (request_type == RequestType:L1ITagArrayRead) {
+ L1Icache.recordRequestType(CacheRequestType:TagArrayRead, addr);
+ } else if (request_type == RequestType:L1ITagArrayWrite) {
+ L1Icache.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+ } else if (request_type == RequestType:L2DataArrayRead) {
+ L2cache.recordRequestType(CacheRequestType:DataArrayRead, addr);
+ } else if (request_type == RequestType:L2DataArrayWrite) {
+ L2cache.recordRequestType(CacheRequestType:DataArrayWrite, addr);
+ } else if (request_type == RequestType:L2TagArrayRead) {
+ L2cache.recordRequestType(CacheRequestType:TagArrayRead, addr);
+ } else if (request_type == RequestType:L2TagArrayWrite) {
+ L2cache.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+ }
+ }
+
+ // Bandwidth/port model hook: ask the owning cache whether the array a
+ // transition wants to touch has capacity this cycle. Request types not
+ // tied to a modeled array default to "available" (true).
+ bool checkResourceAvailable(RequestType request_type, Addr addr) {
+ if (request_type == RequestType:L2DataArrayRead) {
+ return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:L2DataArrayWrite) {
+ return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:L2TagArrayRead) {
+ return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else if (request_type == RequestType:L2TagArrayWrite) {
+ return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else if (request_type == RequestType:L1D0DataArrayRead) {
+ return L1D0cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:L1D0DataArrayWrite) {
+ return L1D0cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:L1D0TagArrayRead) {
+ return L1D0cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else if (request_type == RequestType:L1D0TagArrayWrite) {
+ return L1D0cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else if (request_type == RequestType:L1D1DataArrayRead) {
+ return L1D1cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:L1D1DataArrayWrite) {
+ return L1D1cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:L1D1TagArrayRead) {
+ return L1D1cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else if (request_type == RequestType:L1D1TagArrayWrite) {
+ return L1D1cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else if (request_type == RequestType:L1IDataArrayRead) {
+ return L1Icache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:L1IDataArrayWrite) {
+ return L1Icache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:L1ITagArrayRead) {
+ return L1Icache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else if (request_type == RequestType:L1ITagArrayWrite) {
+ return L1Icache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+
+ } else {
+ // Unmodeled request types never stall on resources.
+ return true;
+ }
+ }
+
+ // END INTERNAL FUNCTIONS
+
+ // ** OUT_PORTS **
+
+ // requests to the directory, responses (probe acks / WB data), local
+ // self-triggers for delayed L2->L1 fills, and unblock notifications.
+ out_port(requestNetwork_out, CPURequestMsg, requestFromCore);
+ out_port(responseNetwork_out, ResponseMsg, responseFromCore);
+ out_port(triggerQueue_out, TriggerMsg, triggerQueue);
+ out_port(unblockNetwork_out, UnblockMsg, unblockFromCore);
+
+ // ** IN_PORTS **
+
+ // Self-trigger queue: delivers L2->L1 fill completions that the
+ // fi/f0/f1 actions enqueued after l2_hit_latency. Dest selects which
+ // L1 array the fill targets.
+ in_port(triggerQueue_in, TriggerMsg, triggerQueue, block_on="addr") {
+ if (triggerQueue_in.isReady(clockEdge())) {
+ peek(triggerQueue_in, TriggerMsg) {
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ TBE tbe := TBEs.lookup(in_msg.addr);
+
+ if (in_msg.Type == TriggerType:L2_to_L1) {
+ if (in_msg.Dest == CacheId:L1I) {
+ trigger(Event:L2_to_L1I, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.Dest == CacheId:L1D0) {
+ trigger(Event:L2_to_L1D0, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.Dest == CacheId:L1D1) {
+ trigger(Event:L2_to_L1D1, in_msg.addr, cache_entry, tbe);
+ } else {
+ error("unexpected trigger dest");
+ }
+ }
+ }
+ }
+ }
+
+
+ // Probes from the directory/north bridge. PrbInv maps to PrbInvData or
+ // PrbInv depending on whether data return is requested; PrbDowngrade
+ // always expects data back (asserted) and maps to PrbShrData.
+ in_port(probeNetwork_in, NBProbeRequestMsg, probeToCore) {
+ if (probeNetwork_in.isReady(clockEdge())) {
+ peek(probeNetwork_in, NBProbeRequestMsg, block_on="addr") {
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ TBE tbe := TBEs.lookup(in_msg.addr);
+
+ if (in_msg.Type == ProbeRequestType:PrbInv) {
+ if (in_msg.ReturnData) {
+ trigger(Event:PrbInvData, in_msg.addr, cache_entry, tbe);
+ } else {
+ trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe);
+ }
+ } else if (in_msg.Type == ProbeRequestType:PrbDowngrade) {
+ assert(in_msg.ReturnData);
+ trigger(Event:PrbShrData, in_msg.addr, cache_entry, tbe);
+ }
+ }
+ }
+ }
+
+
+ // ResponseNetwork
+ in_port(responseToCore_in, ResponseMsg, responseToCore) {
+ if (responseToCore_in.isReady(clockEdge())) {
+ peek(responseToCore_in, ResponseMsg, block_on="addr") {
+
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ TBE tbe := TBEs.lookup(in_msg.addr);
+
+ if (in_msg.Type == CoherenceResponseType:NBSysResp) {
+ if (in_msg.State == CoherenceState:Modified) {
+ trigger(Event:NB_AckM, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.State == CoherenceState:Shared) {
+ trigger(Event:NB_AckS, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.State == CoherenceState:Exclusive) {
+ trigger(Event:NB_AckE, in_msg.addr, cache_entry, tbe);
+ }
+ } else if (in_msg.Type == CoherenceResponseType:NBSysWBAck) {
+ trigger(Event:NB_AckWB, in_msg.addr, cache_entry, tbe);
+ } else {
+ error("Unexpected Response Message to Core");
+ }
+ }
+ }
+ }
+
+ // Nothing from the Unblock Network
+
+ // Mandatory Queue
+ // Mandatory queue: demand requests from the two cores of the pair.
+ // Core selection uses mod(contextId, 2): even -> cluster 0 (L1D0 /
+ // Ifetch0), odd -> cluster 1 (L1D1 / Ifetch1). The general shape of
+ // every path is: L1 hit -> *_L1hit; L1 miss with room in L2 and the
+ // relevant L1 -> *_L1miss; no room in the L1 -> L1*_Repl on the L1
+ // victim; no room in L2 -> L2_Repl on the L2 victim. Stores must also
+ // verify L2 space even on an L1 hit because L1 writes through to L2.
+ in_port(mandatoryQueue_in, RubyRequest, mandatoryQueue, desc="...") {
+ if (mandatoryQueue_in.isReady(clockEdge())) {
+ peek(mandatoryQueue_in, RubyRequest, block_on="LineAddress") {
+
+ Entry cache_entry := getCacheEntry(in_msg.LineAddress);
+ TBE tbe := TBEs.lookup(in_msg.LineAddress);
+
+ if (in_msg.Type == RubyRequestType:IFETCH) {
+ // FETCH ACCESS
+
+ if (L1Icache.isTagPresent(in_msg.LineAddress)) {
+ if (mod(in_msg.contextId, 2) == 0) {
+ trigger(Event:Ifetch0_L1hit, in_msg.LineAddress, cache_entry, tbe);
+ } else {
+ trigger(Event:Ifetch1_L1hit, in_msg.LineAddress, cache_entry, tbe);
+ }
+ } else {
+ if (presentOrAvail2(in_msg.LineAddress)) {
+ if (presentOrAvailI(in_msg.LineAddress)) {
+ if (mod(in_msg.contextId, 2) == 0) {
+ trigger(Event:Ifetch0_L1miss, in_msg.LineAddress, cache_entry,
+ tbe);
+ } else {
+ trigger(Event:Ifetch1_L1miss, in_msg.LineAddress, cache_entry,
+ tbe);
+ }
+ } else {
+ // L1I is full: evict its victim before handling the fetch.
+ Addr victim := L1Icache.cacheProbe(in_msg.LineAddress);
+ trigger(Event:L1I_Repl, victim,
+ getCacheEntry(victim), TBEs.lookup(victim));
+ }
+ } else { // Not present or avail in L2
+ Addr victim := L2cache.cacheProbe(in_msg.LineAddress);
+ trigger(Event:L2_Repl, victim, getCacheEntry(victim),
+ TBEs.lookup(victim));
+ }
+ }
+ } else {
+ // DATA ACCESS
+ if (mod(in_msg.contextId, 2) == 1) {
+ // Cluster 1 data path (L1D1).
+ if (L1D1cache.isTagPresent(in_msg.LineAddress)) {
+ if (in_msg.Type == RubyRequestType:LD) {
+ trigger(Event:C1_Load_L1hit, in_msg.LineAddress, cache_entry,
+ tbe);
+ } else {
+ // Stores must write through, make sure L2 avail.
+ if (presentOrAvail2(in_msg.LineAddress)) {
+ trigger(Event:C1_Store_L1hit, in_msg.LineAddress, cache_entry,
+ tbe);
+ } else {
+ Addr victim := L2cache.cacheProbe(in_msg.LineAddress);
+ trigger(Event:L2_Repl, victim, getCacheEntry(victim),
+ TBEs.lookup(victim));
+ }
+ }
+ } else {
+ if (presentOrAvail2(in_msg.LineAddress)) {
+ if (presentOrAvailD1(in_msg.LineAddress)) {
+ if (in_msg.Type == RubyRequestType:LD) {
+ trigger(Event:C1_Load_L1miss, in_msg.LineAddress,
+ cache_entry, tbe);
+ } else {
+ trigger(Event:C1_Store_L1miss, in_msg.LineAddress,
+ cache_entry, tbe);
+ }
+ } else {
+ Addr victim := L1D1cache.cacheProbe(in_msg.LineAddress);
+ trigger(Event:L1D1_Repl, victim,
+ getCacheEntry(victim), TBEs.lookup(victim));
+ }
+ } else { // not present or avail in L2
+ Addr victim := L2cache.cacheProbe(in_msg.LineAddress);
+ trigger(Event:L2_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
+ }
+ }
+ } else {
+ // Cluster 0 data path (L1D0). NOTE(review): this arm tests the
+ // L1 entry via getL1CacheEntry rather than isTagPresent as the
+ // cluster-1 arm does -- presumably equivalent; confirm.
+ Entry L1D0cache_entry := getL1CacheEntry(in_msg.LineAddress, 0);
+ if (is_valid(L1D0cache_entry)) {
+ if (in_msg.Type == RubyRequestType:LD) {
+ trigger(Event:C0_Load_L1hit, in_msg.LineAddress, cache_entry,
+ tbe);
+ } else {
+ if (presentOrAvail2(in_msg.LineAddress)) {
+ trigger(Event:C0_Store_L1hit, in_msg.LineAddress, cache_entry,
+ tbe);
+ } else {
+ Addr victim := L2cache.cacheProbe(in_msg.LineAddress);
+ trigger(Event:L2_Repl, victim, getCacheEntry(victim),
+ TBEs.lookup(victim));
+ }
+ }
+ } else {
+ if (presentOrAvail2(in_msg.LineAddress)) {
+ if (presentOrAvailD0(in_msg.LineAddress)) {
+ if (in_msg.Type == RubyRequestType:LD) {
+ trigger(Event:C0_Load_L1miss, in_msg.LineAddress,
+ cache_entry, tbe);
+ } else {
+ trigger(Event:C0_Store_L1miss, in_msg.LineAddress,
+ cache_entry, tbe);
+ }
+ } else {
+ Addr victim := L1D0cache.cacheProbe(in_msg.LineAddress);
+ trigger(Event:L1D0_Repl, victim, getCacheEntry(victim),
+ TBEs.lookup(victim));
+ }
+ } else {
+ Addr victim := L2cache.cacheProbe(in_msg.LineAddress);
+ trigger(Event:L2_Repl, victim, getCacheEntry(victim),
+ TBEs.lookup(victim));
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+
+ // ACTIONS
+ // Invalidation helpers. Each deallocates the line from the named L1
+ // array(s); all are no-ops when the tag is absent, so transitions can
+ // list them regardless of which L1s actually hold the line.
+ action(ii_invIcache, "ii", desc="invalidate iCache") {
+ if (L1Icache.isTagPresent(address)) {
+ L1Icache.deallocate(address);
+ }
+ }
+
+ action(i0_invCluster, "i0", desc="invalidate cluster 0") {
+ if (L1D0cache.isTagPresent(address)) {
+ L1D0cache.deallocate(address);
+ }
+ }
+
+ action(i1_invCluster, "i1", desc="invalidate cluster 1") {
+ if (L1D1cache.isTagPresent(address)) {
+ L1D1cache.deallocate(address);
+ }
+ }
+
+ action(ib_invBothClusters, "ib", desc="invalidate both clusters") {
+ if (L1D0cache.isTagPresent(address)) {
+ L1D0cache.deallocate(address);
+ }
+ if (L1D1cache.isTagPresent(address)) {
+ L1D1cache.deallocate(address);
+ }
+ }
+
+ action(i2_invL2, "i2", desc="invalidate L2") {
+ if(is_valid(cache_entry)) {
+ L2cache.deallocate(address);
+ }
+ // Drop the transition-scoped entry handle whether or not we freed it.
+ unset_cache_entry();
+ }
+
+ // Replacement-policy touch helpers: mark the line most-recently-used
+ // in the respective array so it is not picked as the next victim.
+ action(mru_setMRU, "mru", desc="Update LRU state") {
+ L2cache.setMRU(address);
+ }
+
+ action(mruD1_setD1cacheMRU, "mruD1", desc="Update LRU state") {
+ L1D1cache.setMRU(address);
+ }
+
+ action(mruD0_setD0cacheMRU, "mruD0", desc="Update LRU state") {
+ L1D0cache.setMRU(address);
+ }
+
+ action(mruI_setIcacheMRU, "mruI", desc="Update LRU state") {
+ L1Icache.setMRU(address);
+ }
+
+ // Miss-request issue helpers: send RdBlk (plain read), RdBlkM (read
+ // for modify) or RdBlkS (shared/instruction read) to the home
+ // directory, stamping InitialRequestTime for latency profiling.
+ action(n_issueRdBlk, "n", desc="Issue RdBlk") {
+ enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceRequestType:RdBlk;
+ out_msg.Requestor := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ DPRINTF(RubySlicc,"%s\n",out_msg.Destination);
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ out_msg.InitialRequestTime := curCycle();
+ }
+ }
+
+ action(nM_issueRdBlkM, "nM", desc="Issue RdBlkM") {
+ enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceRequestType:RdBlkM;
+ out_msg.Requestor := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ out_msg.InitialRequestTime := curCycle();
+ }
+ }
+
+ action(nS_issueRdBlkS, "nS", desc="Issue RdBlkS") {
+ enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceRequestType:RdBlkS;
+ out_msg.Requestor := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ out_msg.InitialRequestTime := curCycle();
+ }
+ }
+
+ // L2 victimization: vd sends VicDirty with the data for dirty (M/O)
+ // lines, vc sends a dataless VicClean for clean (E/S) lines. Shared is
+ // set when the line may have other sharers (O resp. S).
+ action(vd_victim, "vd", desc="Victimize M/O L2 Data") {
+ enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Requestor := machineID;
+ assert(is_valid(cache_entry));
+ out_msg.DataBlk := cache_entry.DataBlk;
+ assert(cache_entry.Dirty);
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ out_msg.Type := CoherenceRequestType:VicDirty;
+ out_msg.InitialRequestTime := curCycle();
+ if (cache_entry.CacheState == State:O) {
+ out_msg.Shared := true;
+ } else {
+ out_msg.Shared := false;
+ }
+ }
+ }
+
+ action(vc_victim, "vc", desc="Victimize E/S L2 Data") {
+ enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Requestor := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ out_msg.Type := CoherenceRequestType:VicClean;
+ out_msg.InitialRequestTime := curCycle();
+ if (cache_entry.CacheState == State:S) {
+ out_msg.Shared := true;
+ } else {
+ out_msg.Shared := false;
+ }
+ }
+ }
+
+ // Allocation helpers. The L1 arrays use allocateVoid (the controller's
+ // tracked cache_entry always refers to the L2 copy); only a2 binds the
+ // new entry via set_cache_entry. All are idempotent.
+ action(a0_allocateL1D, "a0", desc="Allocate L1D0 Block") {
+ if (L1D0cache.isTagPresent(address) == false) {
+ L1D0cache.allocateVoid(address, new Entry);
+ }
+ }
+
+ action(a1_allocateL1D, "a1", desc="Allocate L1D1 Block") {
+ if (L1D1cache.isTagPresent(address) == false) {
+ L1D1cache.allocateVoid(address, new Entry);
+ }
+ }
+
+ action(ai_allocateL1I, "ai", desc="Allocate L1I Block") {
+ if (L1Icache.isTagPresent(address) == false) {
+ L1Icache.allocateVoid(address, new Entry);
+ }
+ }
+
+ action(a2_allocateL2, "a2", desc="Allocate L2 Block") {
+ if (is_invalid(cache_entry)) {
+ set_cache_entry(L2cache.allocate(address, new Entry));
+ }
+ }
+
+ // TBE lifecycle. Allocation snapshots the L2 entry's data/dirty state
+ // (data only used for writebacks) and clears the Shared flag that a
+ // later probe may flip via s_setSharedFlip.
+ action(t_allocateTBE, "t", desc="allocate TBE Entry") {
+ check_allocate(TBEs);
+ assert(is_valid(cache_entry));
+ TBEs.allocate(address);
+ set_tbe(TBEs.lookup(address));
+ tbe.DataBlk := cache_entry.DataBlk; // Data only used for WBs
+ tbe.Dirty := cache_entry.Dirty;
+ tbe.Shared := false;
+ }
+
+ action(d_deallocateTBE, "d", desc="Deallocate TBE") {
+ TBEs.deallocate(address);
+ unset_tbe();
+ }
+
+ // Queue-pop helpers: consume the head message of each in_port once the
+ // triggering transition has committed.
+ action(p_popMandatoryQueue, "pm", desc="Pop Mandatory Queue") {
+ mandatoryQueue_in.dequeue(clockEdge());
+ }
+
+ action(pr_popResponseQueue, "pr", desc="Pop Response Queue") {
+ responseToCore_in.dequeue(clockEdge());
+ }
+
+ action(pt_popTriggerQueue, "pt", desc="Pop Trigger Queue") {
+ triggerQueue_in.dequeue(clockEdge());
+ }
+
+ action(pp_popProbeQueue, "pp", desc="pop probe queue") {
+ probeNetwork_in.dequeue(clockEdge());
+ }
+
+ // Local (cache-hit) load completions. The L1 entry is only used to
+ // classify the hit source via testAndClearLocalHit; the data returned
+ // to the sequencer always comes from the L2 entry so that functional
+ // accesses stay coherent. il*/l* pairs differ only in i- vs d-side L1
+ // and in which cluster's sequencer is called back.
+ action(il0_loadDone, "il0", desc="Cluster 0 i load done") {
+ Entry entry := getICacheEntry(address);
+ Entry l2entry := getCacheEntry(address); // Used for functional accesses
+ assert(is_valid(entry));
+ // L2 supplies data (functional accesses only look in L2, ok because L1
+ // writes through to L2)
+ sequencer.readCallback(address,
+ l2entry.DataBlk,
+ true,
+ testAndClearLocalHit(entry));
+ }
+
+ action(il1_loadDone, "il1", desc="Cluster 1 i load done") {
+ Entry entry := getICacheEntry(address);
+ Entry l2entry := getCacheEntry(address); // Used for functional accesses
+ assert(is_valid(entry));
+ // L2 supplies data (functional accesses only look in L2, ok because L1
+ // writes through to L2)
+ sequencer1.readCallback(address,
+ l2entry.DataBlk,
+ true,
+ testAndClearLocalHit(entry));
+ }
+
+ action(l0_loadDone, "l0", desc="Cluster 0 load done") {
+ Entry entry := getL1CacheEntry(address, 0);
+ Entry l2entry := getCacheEntry(address); // Used for functional accesses
+ assert(is_valid(entry));
+ // L2 supplies data (functional accesses only look in L2, ok because L1
+ // writes through to L2)
+ sequencer.readCallback(address,
+ l2entry.DataBlk,
+ true,
+ testAndClearLocalHit(entry));
+ }
+
+ action(l1_loadDone, "l1", desc="Cluster 1 load done") {
+ Entry entry := getL1CacheEntry(address, 1);
+ Entry l2entry := getCacheEntry(address); // Used for functional accesses
+ assert(is_valid(entry));
+ // L2 supplies data (functional accesses only look in L2, ok because L1
+ // writes through to L2)
+ sequencer1.readCallback(address,
+ l2entry.DataBlk,
+ true,
+ testAndClearLocalHit(entry));
+ }
+
+ // External (miss) load completions: run while peeking the system
+ // response, verify it came from the Directory or L3, and hand the L2
+ // copy plus the response's timing fields to the sequencer (hit flag
+ // false). x{l,i}{0,1} cover d-/i-side for each cluster.
+ action(xl0_loadDone, "xl0", desc="Cluster 0 load done") {
+ peek(responseToCore_in, ResponseMsg) {
+ assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) ||
+ (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache));
+ Entry l2entry := getCacheEntry(address); // Used for functional accesses
+ DPRINTF(ProtocolTrace, "CP Load Done 0 -- address %s, data: %s\n", address, l2entry.DataBlk);
+ // L2 supplies data (functional accesses only look in L2, ok because L1
+ // writes through to L2)
+ sequencer.readCallback(address,
+ l2entry.DataBlk,
+ false,
+ machineIDToMachineType(in_msg.Sender),
+ in_msg.InitialRequestTime,
+ in_msg.ForwardRequestTime,
+ in_msg.ProbeRequestStartTime);
+ }
+ }
+
+ action(xl1_loadDone, "xl1", desc="Cluster 1 load done") {
+ peek(responseToCore_in, ResponseMsg) {
+ assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) ||
+ (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache));
+ Entry l2entry := getCacheEntry(address); // Used for functional accesses
+ // L2 supplies data (functional accesses only look in L2, ok because L1
+ // writes through to L2)
+ sequencer1.readCallback(address,
+ l2entry.DataBlk,
+ false,
+ machineIDToMachineType(in_msg.Sender),
+ in_msg.InitialRequestTime,
+ in_msg.ForwardRequestTime,
+ in_msg.ProbeRequestStartTime);
+ }
+ }
+
+ action(xi0_loadDone, "xi0", desc="Cluster 0 i-load done") {
+ peek(responseToCore_in, ResponseMsg) {
+ assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) ||
+ (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache));
+ Entry l2entry := getCacheEntry(address); // Used for functional accesses
+ // L2 supplies data (functional accesses only look in L2, ok because L1
+ // writes through to L2)
+ sequencer.readCallback(address,
+ l2entry.DataBlk,
+ false,
+ machineIDToMachineType(in_msg.Sender),
+ in_msg.InitialRequestTime,
+ in_msg.ForwardRequestTime,
+ in_msg.ProbeRequestStartTime);
+ }
+ }
+
+ action(xi1_loadDone, "xi1", desc="Cluster 1 i-load done") {
+ peek(responseToCore_in, ResponseMsg) {
+ assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) ||
+ (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache));
+ Entry l2entry := getCacheEntry(address); // Used for functional accesses
+ // L2 supplies data (functional accesses only look in L2, ok because L1
+ // writes through to L2)
+ sequencer1.readCallback(address,
+ l2entry.DataBlk,
+ false,
+ machineIDToMachineType(in_msg.Sender),
+ in_msg.InitialRequestTime,
+ in_msg.ForwardRequestTime,
+ in_msg.ProbeRequestStartTime);
+ }
+ }
+
+ // Store completions. The sequencer writes into the L2 entry's data
+ // block (write-through), which is then mirrored into the L1 entry and
+ // both levels marked dirty. s0/s1 are the local-hit variants; xs0/xs1
+ // run on an external response and pass its timing fields through.
+ action(s0_storeDone, "s0", desc="Cluster 0 store done") {
+ Entry entry := getL1CacheEntry(address, 0);
+ assert(is_valid(entry));
+ assert(is_valid(cache_entry));
+ sequencer.writeCallback(address,
+ cache_entry.DataBlk,
+ true,
+ testAndClearLocalHit(entry));
+ cache_entry.Dirty := true;
+ entry.DataBlk := cache_entry.DataBlk;
+ entry.Dirty := true;
+ DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk);
+ }
+
+ action(s1_storeDone, "s1", desc="Cluster 1 store done") {
+ Entry entry := getL1CacheEntry(address, 1);
+ assert(is_valid(entry));
+ assert(is_valid(cache_entry));
+ sequencer1.writeCallback(address,
+ cache_entry.DataBlk,
+ true,
+ testAndClearLocalHit(entry));
+ cache_entry.Dirty := true;
+ entry.Dirty := true;
+ entry.DataBlk := cache_entry.DataBlk;
+ DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk);
+ }
+
+ action(xs0_storeDone, "xs0", desc="Cluster 0 store done") {
+ peek(responseToCore_in, ResponseMsg) {
+ Entry entry := getL1CacheEntry(address, 0);
+ assert(is_valid(entry));
+ assert(is_valid(cache_entry));
+ assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) ||
+ (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache));
+ sequencer.writeCallback(address,
+ cache_entry.DataBlk,
+ false,
+ machineIDToMachineType(in_msg.Sender),
+ in_msg.InitialRequestTime,
+ in_msg.ForwardRequestTime,
+ in_msg.ProbeRequestStartTime);
+ cache_entry.Dirty := true;
+ entry.Dirty := true;
+ entry.DataBlk := cache_entry.DataBlk;
+ DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk);
+ }
+ }
+
+ action(xs1_storeDone, "xs1", desc="Cluster 1 store done") {
+ peek(responseToCore_in, ResponseMsg) {
+ Entry entry := getL1CacheEntry(address, 1);
+ assert(is_valid(entry));
+ assert(is_valid(cache_entry));
+ assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) ||
+ (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache));
+ sequencer1.writeCallback(address,
+ cache_entry.DataBlk,
+ false,
+ machineIDToMachineType(in_msg.Sender),
+ in_msg.InitialRequestTime,
+ in_msg.ForwardRequestTime,
+ in_msg.ProbeRequestStartTime);
+ cache_entry.Dirty := true;
+ entry.Dirty := true;
+ entry.DataBlk := cache_entry.DataBlk;
+ DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk);
+ }
+ }
+
+ // Notify the owning core's sequencer of an eviction (needed e.g. for
+ // LL/SC monitors); gated by the send_evictions parameter.
+ action(forward_eviction_to_cpu0, "fec0", desc="sends eviction information to processor0") {
+ if (send_evictions) {
+ DPRINTF(RubySlicc, "Sending invalidation for %s to the CPU\n", address);
+ sequencer.evictionCallback(address);
+ }
+ }
+
+ action(forward_eviction_to_cpu1, "fec1", desc="sends eviction information to processor1") {
+ if (send_evictions) {
+ DPRINTF(RubySlicc, "Sending invalidation for %s to the CPU\n", address);
+ sequencer1.evictionCallback(address);
+ }
+ }
+
+ // Fill an already-allocated L1 entry from the L2 copy and set FromL2
+ // so the next hit is attributed to the L2 by testAndClearLocalHit.
+ action(ci_copyL2ToL1, "ci", desc="copy L2 data to L1") {
+ Entry entry := getICacheEntry(address);
+ assert(is_valid(entry));
+ assert(is_valid(cache_entry));
+ entry.Dirty := cache_entry.Dirty;
+ entry.DataBlk := cache_entry.DataBlk;
+ entry.FromL2 := true;
+ }
+
+ action(c0_copyL2ToL1, "c0", desc="copy L2 data to L1") {
+ Entry entry := getL1CacheEntry(address, 0);
+ assert(is_valid(entry));
+ assert(is_valid(cache_entry));
+ entry.Dirty := cache_entry.Dirty;
+ entry.DataBlk := cache_entry.DataBlk;
+ entry.FromL2 := true;
+ }
+
+ action(c1_copyL2ToL1, "c1", desc="copy L2 data to L1") {
+ Entry entry := getL1CacheEntry(address, 1);
+ assert(is_valid(entry));
+ assert(is_valid(cache_entry));
+ entry.Dirty := cache_entry.Dirty;
+ entry.DataBlk := cache_entry.DataBlk;
+ entry.FromL2 := true;
+ }
+
+ // Model the L2 access latency for an L2->L1 fill: enqueue a local
+ // trigger after l2_hit_latency; triggerQueue_in turns it into the
+ // matching L2_to_L1* event for the chosen destination array.
+ action(fi_L2ToL1, "fi", desc="L2 to L1 inst fill") {
+ enqueue(triggerQueue_out, TriggerMsg, l2_hit_latency) {
+ out_msg.addr := address;
+ out_msg.Type := TriggerType:L2_to_L1;
+ out_msg.Dest := CacheId:L1I;
+ }
+ }
+
+ action(f0_L2ToL1, "f0", desc="L2 to L1 data fill") {
+ enqueue(triggerQueue_out, TriggerMsg, l2_hit_latency) {
+ out_msg.addr := address;
+ out_msg.Type := TriggerType:L2_to_L1;
+ out_msg.Dest := CacheId:L1D0;
+ }
+ }
+
+ action(f1_L2ToL1, "f1", desc="L2 to L1 data fill") {
+ enqueue(triggerQueue_out, TriggerMsg, l2_hit_latency) {
+ out_msg.addr := address;
+ out_msg.Type := TriggerType:L2_to_L1;
+ out_msg.Dest := CacheId:L1D1;
+ }
+ }
+
+ // Install response data into an L1 entry and the L2 entry together,
+ // keeping the two levels identical (write-through invariant).
+ action(wi_writeIcache, "wi", desc="write data to icache (and l2)") {
+ peek(responseToCore_in, ResponseMsg) {
+ Entry entry := getICacheEntry(address);
+ assert(is_valid(entry));
+ assert(is_valid(cache_entry));
+ entry.DataBlk := in_msg.DataBlk;
+ entry.Dirty := in_msg.Dirty;
+ cache_entry.DataBlk := in_msg.DataBlk;
+ cache_entry.Dirty := in_msg.Dirty;
+ }
+ }
+
+ action(w0_writeDcache, "w0", desc="write data to dcache 0 (and l2)") {
+ peek(responseToCore_in, ResponseMsg) {
+ Entry entry := getL1CacheEntry(address, 0);
+ assert(is_valid(entry));
+ assert(is_valid(cache_entry));
+ DPRINTF(ProtocolTrace, "CP writeD0: address %s, data: %s\n", address, in_msg.DataBlk);
+ entry.DataBlk := in_msg.DataBlk;
+ entry.Dirty := in_msg.Dirty;
+ cache_entry.DataBlk := in_msg.DataBlk;
+ cache_entry.Dirty := in_msg.Dirty;
+ }
+ }
+
+ action(w1_writeDcache, "w1", desc="write data to dcache 1 (and l2)") {
+ peek(responseToCore_in, ResponseMsg) {
+ Entry entry := getL1CacheEntry(address, 1);
+ assert(is_valid(entry));
+ assert(is_valid(cache_entry));
+ entry.DataBlk := in_msg.DataBlk;
+ entry.Dirty := in_msg.Dirty;
+ cache_entry.DataBlk := in_msg.DataBlk;
+ cache_entry.Dirty := in_msg.Dirty;
+ }
+ }
+
+ // Writeback responses to the directory: ss tells it the line is stale
+ // (nothing to write back); wb sends the TBE's data/dirty snapshot,
+ // with NbReqShared reflecting whether a shared probe hit us mid-flight.
+ action(ss_sendStaleNotification, "ss", desc="stale data; nothing to writeback") {
+ peek(responseToCore_in, ResponseMsg) {
+ enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:StaleNotif;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+ }
+
+ action(wb_data, "wb", desc="write back data") {
+ peek(responseToCore_in, ResponseMsg) {
+ enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUData;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.DataBlk := tbe.DataBlk;
+ out_msg.Dirty := tbe.Dirty;
+ if (tbe.Shared) {
+ out_msg.NbReqShared := true;
+ } else {
+ out_msg.NbReqShared := false;
+ }
+ out_msg.State := CoherenceState:Shared; // faux info
+ out_msg.MessageSize := MessageSizeType:Writeback_Data;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+ }
+
+ // Dataless probe acks. pi/pim: line invalidated here (Ntsl set, miss);
+ // ph: ack a shared probe for a line we are known to hold (asserted);
+ // pb: ack a backprobe, reporting Hit from an actual L1/TBE lookup.
+ action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") {
+ enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket
+ out_msg.Dirty := false;
+ out_msg.Hit := false;
+ out_msg.Ntsl := true;
+ out_msg.State := CoherenceState:NA;
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ }
+ }
+
+ action(pim_sendProbeResponseInvMs, "pim", desc="send probe ack inv, no data") {
+ enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket
+ out_msg.Dirty := false;
+ out_msg.Ntsl := true;
+ out_msg.Hit := false;
+ out_msg.State := CoherenceState:NA;
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ }
+ }
+
+ action(ph_sendProbeResponseHit, "ph", desc="send probe ack PrbShrData, no data") {
+ enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket
+ assert(addressInCore(address) || is_valid(tbe));
+ out_msg.Dirty := false; // only true if sending back data i think
+ out_msg.Hit := true;
+ out_msg.Ntsl := false;
+ out_msg.State := CoherenceState:NA;
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ }
+ }
+
+ action(pb_sendProbeResponseBackprobe, "pb", desc="send probe ack PrbShrData, no data, check for L1 residence") {
+ enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket
+ if (addressInCore(address)) {
+ out_msg.Hit := true;
+ } else {
+ out_msg.Hit := false;
+ }
+ out_msg.Dirty := false; // not sending back data, so def. not dirty
+ out_msg.Ntsl := false;
+ out_msg.State := CoherenceState:NA;
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ }
+ }
+
+ // Probe acks carrying data. pd/pdm read from the (asserted-dirty) L2
+ // entry; pdt reads from the TBE for lines mid-writeback. All report
+ // Dirty and Hit.
+ action(pd_sendProbeResponseData, "pd", desc="send probe ack, with data") {
+ enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+ assert(is_valid(cache_entry));
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket
+ out_msg.DataBlk := cache_entry.DataBlk;
+ assert(cache_entry.Dirty);
+ out_msg.Dirty := true;
+ out_msg.Hit := true;
+ out_msg.State := CoherenceState:NA;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ }
+ }
+
+ action(pdm_sendProbeResponseDataMs, "pdm", desc="send probe ack, with data") {
+ enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+ assert(is_valid(cache_entry));
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket
+ out_msg.DataBlk := cache_entry.DataBlk;
+ assert(cache_entry.Dirty);
+ out_msg.Dirty := true;
+ out_msg.Hit := true;
+ out_msg.State := CoherenceState:NA;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ }
+ }
+
+ action(pdt_sendProbeResponseDataFromTBE, "pdt", desc="send probe ack with data") {
+ enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+ assert(is_valid(tbe));
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.DataBlk := tbe.DataBlk;
+ assert(tbe.Dirty);
+ out_msg.Dirty := true;
+ out_msg.Hit := true;
+ out_msg.State := CoherenceState:NA;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ }
+ }
+
+ // Record that a shared probe hit this in-flight line (consumed later
+ // by wb_data's NbReqShared), and notify the directory that a state
+ // change completed so it can unblock the address.
+ action(s_setSharedFlip, "s", desc="hit by shared probe, status may be different") {
+ assert(is_valid(tbe));
+ tbe.Shared := true;
+ }
+
+ action(uu_sendUnblock, "uu", desc="state changed, unblock") {
+ enqueue(unblockNetwork_out, UnblockMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.MessageSize := MessageSizeType:Unblock_Control;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+
+ // Demand-miss statistics bumps for the L2 and the two L1D arrays.
+ action(l2m_profileMiss, "l2m", desc="l2m miss profile") {
+ ++L2cache.demand_misses;
+ }
+
+ action(l10m_profileMiss, "l10m", desc="l10m miss profile") {
+ ++L1D0cache.demand_misses;
+ }
+
+ action(l11m_profileMiss, "l11m", desc="l11m miss profile") {
+ ++L1D1cache.demand_misses;
+ }
+
+ // Demand-miss statistics bump for the L1I array.
+ // Fix: shorthand was "l1lm" (lowercase L), a typo inconsistent with the
+ // action name and the l2m/l10m/l11m siblings; corrected to "l1im".
+ action(l1im_profileMiss, "l1im", desc="l1im miss profile") {
+ ++L1Icache.demand_misses;
+ }
+
+ // Recycle helpers: rotate the head message to the back of its queue
+ // after recycle_latency, used when a transition cannot be processed in
+ // the current state (stall-and-retry).
+ action(yy_recycleProbeQueue, "yy", desc="recycle probe queue") {
+ probeNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+ }
+
+ action(xx_recycleResponseQueue, "xx", desc="recycle response queue") {
+ responseToCore_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+ }
+
+ action(zz_recycleMandatoryQueue, "\z", desc="recycle mandatory queue") {
+ mandatoryQueue_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+ }
+
+ // END ACTIONS
+
+ // BEGIN TRANSITIONS
+
+ // transitions from base
+ transition(I, C0_Load_L1miss, I_E0S) {L1D0TagArrayRead, L2TagArrayRead} {
+ // track misses, if implemented
+ // since in I state, L2 miss as well
+ l2m_profileMiss;
+ l10m_profileMiss;
+ a0_allocateL1D;
+ a2_allocateL2;
+ i1_invCluster;
+ ii_invIcache;
+ n_issueRdBlk;
+ p_popMandatoryQueue;
+ }
+
+ transition(I, C1_Load_L1miss, I_E1S) {L1D1TagArrayRead, L2TagArrayRead} {
+ // track misses, if implemented
+ // since in I state, L2 miss as well
+ l2m_profileMiss;
+ l11m_profileMiss;
+ a1_allocateL1D;
+ a2_allocateL2;
+ i0_invCluster;
+ ii_invIcache;
+ n_issueRdBlk;
+ p_popMandatoryQueue;
+ }
+
+ transition(I, Ifetch0_L1miss, S0) {L1ITagArrayRead,L2TagArrayRead} {
+ // track misses, if implemented
+ // L2 miss as well
+ l2m_profileMiss;
+ l1im_profileMiss;
+ ai_allocateL1I;
+ a2_allocateL2;
+ ib_invBothClusters;
+ nS_issueRdBlkS;
+ p_popMandatoryQueue;
+ }
+
+ transition(I, Ifetch1_L1miss, S1) {L1ITagArrayRead, L2TagArrayRead} {
+ // track misses, if implemented
+ // L2 miss as well
+ l2m_profileMiss;
+ l1im_profileMiss;
+ ai_allocateL1I;
+ a2_allocateL2;
+ ib_invBothClusters;
+ nS_issueRdBlkS;
+ p_popMandatoryQueue;
+ }
+
+ transition(I, C0_Store_L1miss, I_M0) {L1D0TagArrayRead, L2TagArrayRead} {
+ l2m_profileMiss;
+ l10m_profileMiss;
+ a0_allocateL1D;
+ a2_allocateL2;
+ i1_invCluster;
+ ii_invIcache;
+ nM_issueRdBlkM;
+ p_popMandatoryQueue;
+ }
+
+ // C1 store miss from I: allocate in cluster 1's L1D and the L2, invalidate
+ // cluster 0 and the Icache, and request exclusive data (RdBlkM).
+ // Resource check corrected to the D1 tag array: this is the cluster-1 path
+ // (the C0 twin above correctly checks L1D0TagArrayRead).
+ transition(I, C1_Store_L1miss, I_M1) {L1D1TagArrayRead, L2TagArrayRead} {
+ l2m_profileMiss;
+ l11m_profileMiss;
+ a1_allocateL1D;
+ a2_allocateL2;
+ i0_invCluster;
+ ii_invIcache;
+ nM_issueRdBlkM;
+ p_popMandatoryQueue;
+ }
+
+ transition(S, C0_Load_L1miss, S_F0) {L1D0TagArrayRead, L2TagArrayRead, L2DataArrayRead} {
+ l10m_profileMiss;
+ a0_allocateL1D;
+ f0_L2ToL1;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(S, C1_Load_L1miss, S_F1) {L1D1TagArrayRead,L2TagArrayRead, L2DataArrayRead} {
+ l11m_profileMiss;
+ a1_allocateL1D;
+ f1_L2ToL1;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(S, Ifetch0_L1miss, Si_F0) {L1ITagArrayRead, L2TagArrayRead, L2DataArrayRead} {
+ l1im_profileMiss;
+ ai_allocateL1I;
+ fi_L2ToL1;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(S, Ifetch1_L1miss, Si_F1) {L1ITagArrayRead,L2TagArrayRead, L2DataArrayRead} {
+ l1im_profileMiss;
+ ai_allocateL1I;
+ fi_L2ToL1;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition({S}, {C0_Store_L1hit, C0_Store_L1miss}, S_M0) {L1D0TagArrayRead, L2TagArrayRead} {
+ l2m_profileMiss;
+ l10m_profileMiss;
+ a0_allocateL1D;
+ mruD0_setD0cacheMRU;
+ i1_invCluster;
+ ii_invIcache;
+ nM_issueRdBlkM;
+ p_popMandatoryQueue;
+ }
+
+ transition({S}, {C1_Store_L1hit, C1_Store_L1miss}, S_M1) {L1D1TagArrayRead, L2TagArrayRead} {
+ l2m_profileMiss;
+ l11m_profileMiss;
+ a1_allocateL1D;
+ mruD1_setD1cacheMRU;
+ i0_invCluster;
+ ii_invIcache;
+ nM_issueRdBlkM;
+ p_popMandatoryQueue;
+ }
+
+ transition(Es, C0_Load_L1miss, Es_F0) {L1D0TagArrayRead, L2TagArrayRead, L2DataArrayRead} { // can this be folded with S_F?
+ a0_allocateL1D;
+ l10m_profileMiss;
+ f0_L2ToL1;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(Es, C1_Load_L1miss, Es_F1) {L1D1TagArrayRead, L2TagArrayRead, L2DataArrayRead} { // can this be folded with S_F?
+ l11m_profileMiss;
+ a1_allocateL1D;
+ f1_L2ToL1;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(Es, Ifetch0_L1miss, S0) {L1ITagArrayRead, L1ITagArrayWrite, L2TagArrayRead, L2TagArrayWrite} {
+ l1im_profileMiss;
+ i2_invL2;
+ ai_allocateL1I;
+ a2_allocateL2;
+ ib_invBothClusters;
+ nS_issueRdBlkS;
+ p_popMandatoryQueue;
+ }
+
+ // Ifetch miss from Es on core 1: drop the L2 copy, refill I-side as Shared.
+ // Resource list brought in line with the Ifetch0 twin above: both perform
+ // the same L1I/L2 allocations and L2 invalidate, so both tag writes are
+ // declared here as well.
+ transition(Es, Ifetch1_L1miss, S1) {L1ITagArrayRead, L1ITagArrayWrite, L2TagArrayRead, L2TagArrayWrite} {
+ l1im_profileMiss;
+ i2_invL2;
+ ai_allocateL1I;
+ a2_allocateL2;
+ ib_invBothClusters;
+ nS_issueRdBlkS;
+ p_popMandatoryQueue;
+ }
+
+ // THESE SHOULD NOT BE INSTANTANEOUS BUT OH WELL FOR NOW
+ transition(Es, {C0_Store_L1hit, C0_Store_L1miss}, M0) {L1D0TagArrayRead, L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayRead, L2TagArrayWrite, L2DataArrayWrite} {
+ a0_allocateL1D;
+ i1_invCluster;
+ s0_storeDone; // instantaneous L1/L2 dirty - no writethrough delay
+ mruD0_setD0cacheMRU;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(Es, {C1_Store_L1hit, C1_Store_L1miss}, M1) {L1D1TagArrayRead, L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayRead, L2TagArrayWrite, L2DataArrayWrite} {
+ a1_allocateL1D;
+ i0_invCluster;
+ s1_storeDone;
+ mruD1_setD1cacheMRU;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(E0, C0_Load_L1miss, E0_F) {L1D0TagArrayRead,L2TagArrayRead, L2DataArrayRead} {
+ l10m_profileMiss;
+ a0_allocateL1D;
+ f0_L2ToL1;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(E0, C1_Load_L1miss, E0_Es) {L1D1TagArrayRead, L2TagArrayRead, L2DataArrayRead} {
+ l11m_profileMiss;
+ a1_allocateL1D;
+ f1_L2ToL1;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(E0, Ifetch0_L1miss, S0) {L2TagArrayRead, L1ITagArrayRead} {
+ l2m_profileMiss; // permissions miss, still issue RdBlkS
+ l1im_profileMiss;
+ i2_invL2;
+ ai_allocateL1I;
+ a2_allocateL2;
+ i0_invCluster;
+ nS_issueRdBlkS;
+ p_popMandatoryQueue;
+ }
+
+ transition(E0, Ifetch1_L1miss, S1) {L2TagArrayRead, L1ITagArrayRead} {
+ l2m_profileMiss; // permissions miss, still issue RdBlkS
+ l1im_profileMiss;
+ i2_invL2;
+ ai_allocateL1I;
+ a2_allocateL2;
+ i0_invCluster;
+ nS_issueRdBlkS;
+ p_popMandatoryQueue;
+ }
+
+ transition(E0, {C0_Store_L1hit, C0_Store_L1miss}, M0) {L1D0TagArrayRead, L1D0DataArrayWrite, L1D0TagArrayWrite, L2TagArrayRead, L2DataArrayWrite, L2TagArrayWrite} {
+ a0_allocateL1D;
+ s0_storeDone;
+ mruD0_setD0cacheMRU;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ // Cross-cluster store while exclusive in cluster 0: move the line to M1.
+ // Resource list had L1D1TagArrayWrite listed twice; by symmetry with the
+ // E1 -> M0 twin below, the duplicate should be L1D1DataArrayWrite (the
+ // store writes D1's data array via s1_storeDone).
+ transition(E0, C1_Store_L1miss, M1) {L1D1TagArrayRead, L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayRead, L2TagArrayWrite, L2DataArrayWrite} {
+ l11m_profileMiss;
+ a1_allocateL1D;
+ i0_invCluster;
+ s1_storeDone;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(E1, C1_Load_L1miss, E1_F) {L1D1TagArrayRead, L2TagArrayRead, L2DataArrayRead} {
+ l11m_profileMiss;
+ a1_allocateL1D;
+ f1_L2ToL1;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ // Cross-cluster load while exclusive in cluster 1: fill cluster 0's L1D
+ // from L2 and go to the both-clusters-exclusive fill state.
+ // Profile counter corrected: this is a D0 miss (a0_allocateL1D/f0_L2ToL1),
+ // so it must bump l10m, not l11m (cf. the E0/C1 twin, which uses l11m).
+ transition(E1, C0_Load_L1miss, E1_Es) {L1D0TagArrayRead, L2TagArrayRead, L2DataArrayRead} {
+ l10m_profileMiss;
+ a0_allocateL1D;
+ f0_L2ToL1;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(E1, Ifetch1_L1miss, S1) {L2TagArrayRead, L1ITagArrayRead} {
+ l2m_profileMiss; // permissions miss, still issue RdBlkS
+ l1im_profileMiss;
+ i2_invL2;
+ ai_allocateL1I;
+ a2_allocateL2;
+ i1_invCluster;
+ nS_issueRdBlkS;
+ p_popMandatoryQueue;
+ }
+
+ transition(E1, Ifetch0_L1miss, S0) {L2TagArrayRead, L1ITagArrayRead} {
+ l2m_profileMiss; // permissions miss, still issue RdBlkS
+ l1im_profileMiss;
+ i2_invL2;
+ ai_allocateL1I;
+ a2_allocateL2;
+ i1_invCluster;
+ nS_issueRdBlkS;
+ p_popMandatoryQueue;
+ }
+
+ transition(E1, {C1_Store_L1hit, C1_Store_L1miss}, M1) {L1D1TagArrayRead, L2TagArrayRead, L2DataArrayWrite, L1D1TagArrayWrite, L2TagArrayWrite} {
+ a1_allocateL1D;
+ s1_storeDone;
+ mruD1_setD1cacheMRU;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(E1, C0_Store_L1miss, M0) {L1D0TagArrayRead, L2TagArrayRead, L2TagArrayWrite, L1D0TagArrayWrite, L1D0DataArrayWrite, L2DataArrayWrite} {
+ l10m_profileMiss;
+ a0_allocateL1D;
+ i1_invCluster;
+ s0_storeDone;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition({O}, {C0_Store_L1hit, C0_Store_L1miss}, O_M0) {L1D0TagArrayRead,L2TagArrayRead} {
+ l2m_profileMiss; // permissions miss, still issue RdBlkM (not CtoD in this protocol)
+ l10m_profileMiss;
+ a0_allocateL1D;
+ mruD0_setD0cacheMRU;
+ i1_invCluster;
+ ii_invIcache;
+ nM_issueRdBlkM;
+ p_popMandatoryQueue;
+ }
+
+ transition({O}, {C1_Store_L1hit, C1_Store_L1miss}, O_M1) {L1D1TagArrayRead, L2TagArrayRead} {
+ l2m_profileMiss; // permissions miss, still issue RdBlkM (comment previously said RdBlkS)
+ l11m_profileMiss;
+ a1_allocateL1D;
+ mruD1_setD1cacheMRU;
+ i0_invCluster;
+ ii_invIcache;
+ nM_issueRdBlkM;
+ p_popMandatoryQueue;
+ }
+
+ transition(O, C0_Load_L1miss, O_F0) {L2TagArrayRead, L2DataArrayRead, L1D0TagArrayRead} {
+ l10m_profileMiss;
+ a0_allocateL1D;
+ f0_L2ToL1;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(O, C1_Load_L1miss, O_F1) {L2TagArrayRead, L2DataArrayRead, L1D1TagArrayRead} {
+ l11m_profileMiss;
+ a1_allocateL1D;
+ f1_L2ToL1;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(Ms, C0_Load_L1miss, Ms_F0) {L2TagArrayRead, L2DataArrayRead, L1D0TagArrayRead} {
+ l10m_profileMiss;
+ a0_allocateL1D;
+ f0_L2ToL1;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(Ms, C1_Load_L1miss, Ms_F1) {L2TagArrayRead, L2DataArrayRead, L1D1TagArrayRead} {
+ l11m_profileMiss;
+ a1_allocateL1D;
+ f1_L2ToL1;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition({Ms, M0, M1, O}, Ifetch0_L1miss, MO_S0) {L1ITagArrayRead, L2DataArrayRead, L2TagArrayRead} {
+ l2m_profileMiss; // permissions miss
+ l1im_profileMiss;
+ ai_allocateL1I;
+ t_allocateTBE;
+ ib_invBothClusters;
+ vd_victim;
+// i2_invL2;
+ p_popMandatoryQueue;
+ }
+
+ transition({Ms, M0, M1, O}, Ifetch1_L1miss, MO_S1) {L1ITagArrayRead, L2TagArrayRead, L2DataArrayRead } {
+ l2m_profileMiss; // permissions miss
+ l1im_profileMiss;
+ ai_allocateL1I;
+ t_allocateTBE;
+ ib_invBothClusters;
+ vd_victim;
+// i2_invL2;
+ p_popMandatoryQueue;
+ }
+
+ transition(Ms, {C0_Store_L1hit, C0_Store_L1miss}, M0) {L1D0TagArrayRead, L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayRead, L2DataArrayWrite, L2TagArrayWrite} {
+ a0_allocateL1D;
+ i1_invCluster;
+ s0_storeDone;
+ mruD0_setD0cacheMRU;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(Ms, {C1_Store_L1hit, C1_Store_L1miss}, M1) {L1D1TagArrayRead, L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayRead, L2DataArrayWrite, L2TagArrayWrite} {
+ a1_allocateL1D;
+ i0_invCluster;
+ s1_storeDone;
+ mruD1_setD1cacheMRU;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(M0, C0_Load_L1miss, M0_F) {L1D0TagArrayRead, L2TagArrayRead, L2DataArrayRead} {
+ l10m_profileMiss;
+ a0_allocateL1D;
+ f0_L2ToL1;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(M0, C1_Load_L1miss, M0_Ms) {L2TagArrayRead, L2DataArrayRead,L1D0TagArrayRead} {
+ l11m_profileMiss;
+ a1_allocateL1D;
+ f1_L2ToL1;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(M0, {C0_Store_L1hit, C0_Store_L1miss}) {L1D0TagArrayRead,L1D0DataArrayWrite, L2DataArrayWrite, L2TagArrayRead} {
+ a0_allocateL1D;
+ s0_storeDone;
+ mruD0_setD0cacheMRU;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ // Cross-cluster store while modified in cluster 0: hand the line to M1.
+ // The store completes in cluster 1 (s1_storeDone, D1 tag write), so the
+ // data-array write belongs to D1 — corrected from L1D0DataArrayWrite.
+ transition(M0, {C1_Store_L1hit, C1_Store_L1miss}, M1) {L1D1TagArrayRead, L1D1TagArrayWrite, L1D1DataArrayWrite, L2DataArrayWrite, L2TagArrayRead, L2TagArrayWrite} {
+ a1_allocateL1D;
+ i0_invCluster;
+ s1_storeDone;
+ mruD1_setD1cacheMRU;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(M1, C0_Load_L1miss, M1_Ms) {L2TagArrayRead, L2DataArrayRead, L1D0TagArrayRead} {
+ l10m_profileMiss;
+ a0_allocateL1D;
+ f0_L2ToL1;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(M1, C1_Load_L1miss, M1_F) {L1D1TagArrayRead,L2TagArrayRead, L2DataArrayRead} {
+ a1_allocateL1D;
+ f1_L2ToL1;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(M1, {C0_Store_L1hit, C0_Store_L1miss}, M0) {L1D0TagArrayRead, L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayRead, L2DataArrayWrite, L2TagArrayWrite} {
+ a0_allocateL1D;
+ i1_invCluster;
+ s0_storeDone;
+ mruD0_setD0cacheMRU;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(M1, {C1_Store_L1hit, C1_Store_L1miss}) {L1D1TagArrayRead, L2TagArrayRead, L2DataArrayWrite} {
+ a1_allocateL1D;
+ s1_storeDone;
+ mruD1_setD1cacheMRU;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ // end transitions from base
+
+ // Begin simple hit transitions
+ transition({S, Es, E0, O, Ms, M0, O_F1, S_F1, Si_F0, Si_F1, Es_F1, E0_Es,
+ Ms_F1, M0_Ms}, C0_Load_L1hit) {L1D0TagArrayRead, L1D0DataArrayRead} {
+ // track hits, if implemented
+ l0_loadDone;
+ mruD0_setD0cacheMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition({S, Es, E1, O, Ms, M1, O_F0, S_F0, Si_F0, Si_F1, Es_F0, E1_Es,
+ Ms_F0, M1_Ms}, C1_Load_L1hit) {L1D1TagArrayRead, L1D1DataArrayRead} {
+ // track hits, if implemented
+ l1_loadDone;
+ mruD1_setD1cacheMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition({S, S_C, S_F0, S_F1, S_F}, Ifetch0_L1hit) {L1ITagArrayRead, L1IDataArrayRead} {
+ // track hits, if implemented
+ il0_loadDone;
+ mruI_setIcacheMRU;
+ p_popMandatoryQueue;
+ }
+
+ // Core-1 instruction-fetch hit in the L1I: a pure read path, so the data
+ // array access is a read — corrected from L1IDataArrayWrite to match the
+ // Ifetch0_L1hit twin above.
+ transition({S, S_C, S_F0, S_F1, S_F}, Ifetch1_L1hit) {L1ITagArrayRead, L1IDataArrayRead} {
+ // track hits, if implemented
+ il1_loadDone;
+ mruI_setIcacheMRU;
+ p_popMandatoryQueue;
+ }
+
+ // end simple hit transitions
+
+ // Transitions from transient states
+
+ // recycles
+ transition({I_M0, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E0S, I_ES, IF_E0S, IF_ES,
+ IF0_ES, IF1_ES, S_F0, S_F, O_F0, O_F, S_M0, O_M0, Es_F0, Es_F, E0_F,
+ E1_Es, Ms_F0, Ms_F, M0_F, M1_Ms}, C0_Load_L1hit) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({IF_E1S, F_S0, F_S1, ES_I, MO_I, MO_S0, MO_S1, Si_F0, Si_F1, S_M1,
+ O_M1, S0, S1, I_C, S0_C, S1_C, S_C}, C0_Load_L1miss) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({I_M1, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E1S, I_ES, IF_E1S, IF_ES,
+ IF0_ES, IF1_ES, S_F1, S_F, O_F1, O_F, S_M1, O_M1, Es_F1, Es_F, E1_F,
+ E0_Es, Ms_F1, Ms_F, M0_Ms, M1_F}, C1_Load_L1hit) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({IF_E0S, F_S0, F_S1, ES_I, MO_I, MO_S0, MO_S1, Si_F0, Si_F1, S_M0,
+ O_M0, S0, S1, I_C, S0_C, S1_C, S_C}, C1_Load_L1miss) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({F_S0, F_S1, MO_S0, MO_S1, Si_F0, Si_F1, S0, S1, S0_C, S1_C}, {Ifetch0_L1hit, Ifetch1_L1hit}) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({I_M0, I_M1, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E0S, I_E1S, I_ES,
+ IF_E0S, IF_E1S, IF_ES, IF0_ES, IF1_ES, ES_I, MO_I, S_F0, S_F1, S_F,
+ O_F0, O_F1, O_F, S_M0, S_M1, O_M0, O_M1, Es_F0, Es_F1, Es_F, E0_F,
+ E1_F, E0_Es, E1_Es, Ms_F0, Ms_F1, Ms_F, M0_F, M0_Ms, M1_F, M1_Ms, I_C,
+ S_C}, {Ifetch0_L1miss, Ifetch1_L1miss}) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({I_E1S, IF_E1S, F_S0, F_S1, ES_I, MO_I, MO_S0, MO_S1, S_F1, O_F1,
+ Si_F0, Si_F1, S_M1, O_M1, S0, S1, Es_F1, E1_F, E0_Es, Ms_F1, M0_Ms,
+ M1_F, I_C, S0_C, S1_C, S_C}, {C0_Store_L1miss}) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ // Recycle C1 store misses that arrive while these transient/incompatible
+ // states are pending. Missing comma between MO_S1 and S_F0 fixed.
+ transition({I_E0S, IF_E0S, F_S0, F_S1, ES_I, MO_I, MO_S0, MO_S1, S_F0, O_F0,
+ Si_F0, Si_F1, S_M0, O_M0, S0, S1, Es_F0, E0_F, E1_Es, Ms_F0, M0_F,
+ M1_Ms, I_C, S0_C, S1_C, S_C}, {C1_Store_L1miss}) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({I_M0, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E0S, I_ES, IF_E0S, IF_ES,
+ IF0_ES, IF1_ES, S_F0, S_F1, S_F, O_F0, O_F1, O_F, Si_F0, Si_F1, S_M0, O_M0, Es_F0, Es_F1, Es_F, E0_F, E0_Es, E1_Es, Ms_F0, Ms_F1, Ms_F, M0_F, M0_Ms, M1_Ms}, {C0_Store_L1hit}) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({I_M1, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E1S, I_ES, IF_E1S, IF_ES,
+ IF0_ES, IF1_ES, S_F0, S_F1, S_F, O_F0, O_F1, O_F, Si_F0, Si_F1, S_M1,
+ O_M1, Es_F0, Es_F1, Es_F, E1_F, E0_Es, E1_Es, Ms_F0, Ms_F1, Ms_F,
+ M0_Ms, M1_F, M1_Ms}, {C1_Store_L1hit}) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({I_M0, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E0S, I_ES, IF_E0S, IF_ES,
+ IF0_ES, IF1_ES, S_F0, S_F, O_F0, O_F, S_M0, O_M0, Es_F0, Es_F, E0_F,
+ E1_Es, Ms_F0, Ms_F, M0_F, M1_Ms}, L1D0_Repl) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({I_M1, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E1S, I_ES, IF_E1S, IF_ES,
+ IF0_ES, IF1_ES, S_F1, S_F, O_F1, O_F, S_M1, O_M1, Es_F1, Es_F, E1_F,
+ E0_Es, Ms_F1, Ms_F, M0_Ms, M1_F}, L1D1_Repl) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({F_S0, F_S1, MO_S0, MO_S1, Si_F0, Si_F1, S0, S1, S0_C, S1_C}, L1I_Repl) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({S_C, S0_C, S1_C, S0, S1, Si_F0, Si_F1, I_M0, I_M1, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E0S, I_E1S, I_ES, S_F0, S_F1, S_F, O_F0, O_F1, O_F, S_M0, O_M0, S_M1, O_M1, Es_F0, Es_F1, Es_F, E0_F, E1_F, E0_Es, E1_Es, Ms_F0, Ms_F1, Ms_F, M0_F, M0_Ms, M1_F, M1_Ms, MO_S0, MO_S1, IF_E0S, IF_E1S, IF_ES, IF0_ES, IF1_ES, F_S0, F_S1}, L2_Repl) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({IF_E0S, IF_E1S, IF_ES, IF0_ES, IF1_ES, F_S0, F_S1}, {NB_AckS,
+ PrbInvData, PrbInv, PrbShrData}) {} {
+ yy_recycleProbeQueue; // these should be resolved soon, but I didn't want to add more states, though technically they could be solved now, and probes really could be solved but i don't think it's really necessary.
+ }
+
+ transition({IF_E0S, IF_E1S, IF_ES, IF0_ES, IF1_ES}, NB_AckE) {} {
+ xx_recycleResponseQueue; // these should be resolved soon, but I didn't want to add more states, though technically they could be solved now, and probes really could be solved but i don't think it's really necessary.
+ }
+
+ transition({E0_Es, E1_F, Es_F1}, C0_Load_L1miss, Es_F) {L2DataArrayRead} {
+ l10m_profileMiss;
+ a0_allocateL1D;
+ f0_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition(S_F1, C0_Load_L1miss, S_F) {L2DataArrayRead} {
+ l10m_profileMiss;
+ a0_allocateL1D;
+ f0_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition(O_F1, C0_Load_L1miss, O_F) {L2DataArrayRead} {
+ l10m_profileMiss;
+ a0_allocateL1D;
+ f0_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition({Ms_F1, M0_Ms, M1_F}, C0_Load_L1miss, Ms_F) {L2DataArrayRead} {
+ l10m_profileMiss;
+ a0_allocateL1D;
+ f0_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition(I_M0, C1_Load_L1miss, I_M0Ms) {} {
+ l2m_profileMiss;
+ l11m_profileMiss;
+ a1_allocateL1D;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(I_M1, C0_Load_L1miss, I_M1Ms) {} {
+ l2m_profileMiss;
+ l10m_profileMiss;
+ a0_allocateL1D;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(I_M0, C1_Store_L1miss, I_M0M1) {} {
+ l2m_profileMiss;
+ l11m_profileMiss;
+ a1_allocateL1D;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(I_M1, C0_Store_L1miss, I_M1M0) {} {
+ l2m_profileMiss;
+ l10m_profileMiss;
+ a0_allocateL1D;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(I_E0S, C1_Load_L1miss, I_ES) {} {
+ l2m_profileMiss;
+ l11m_profileMiss;
+ a1_allocateL1D;
+ p_popMandatoryQueue;
+ }
+
+ transition(I_E1S, C0_Load_L1miss, I_ES) {} {
+ l2m_profileMiss;
+ l10m_profileMiss;
+ a0_allocateL1D;
+ p_popMandatoryQueue;
+ }
+
+ transition({E1_Es, E0_F, Es_F0}, C1_Load_L1miss, Es_F) {L2DataArrayRead} {
+ l11m_profileMiss;
+ a1_allocateL1D;
+ f1_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition(S_F0, C1_Load_L1miss, S_F) {L2DataArrayRead} {
+ l11m_profileMiss;
+ a1_allocateL1D;
+ f1_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition(O_F0, C1_Load_L1miss, O_F) {L2DataArrayRead} {
+ l11m_profileMiss;
+ a1_allocateL1D;
+ f1_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition({Ms_F0, M1_Ms, M0_F}, C1_Load_L1miss, Ms_F) { L2DataArrayRead} {
+ l11m_profileMiss;
+ a1_allocateL1D;
+ f1_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition({S, Es, E0, O, Ms, M0, O_F1, S_F1, Si_F0, Si_F1, Es_F1, E0_Es, Ms_F1, M0_Ms}, L1D0_Repl) {L1D0TagArrayRead} {
+ i0_invCluster;
+ }
+
+ transition({S, Es, E1, O, Ms, M1, O_F0, S_F0, Si_F0, Si_F1, Es_F0, E1_Es, Ms_F0, M1_Ms}, L1D1_Repl) {L1D1TagArrayRead} {
+ i1_invCluster;
+ }
+
+ transition({S, S_C, S_F0, S_F1}, L1I_Repl) {L1ITagArrayRead} {
+ ii_invIcache;
+ }
+
+ transition({S, E0, E1, Es}, L2_Repl, ES_I) {L2TagArrayRead, L2DataArrayRead, L1D0TagArrayRead, L1D1TagArrayRead} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ t_allocateTBE;
+ vc_victim;
+ ib_invBothClusters;
+ i2_invL2;
+ ii_invIcache;
+ }
+
+ transition({Ms, M0, M1, O}, L2_Repl, MO_I) {L2TagArrayRead, L2DataArrayRead, L1D0TagArrayRead, L1D1TagArrayRead} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ t_allocateTBE;
+ vd_victim;
+ i2_invL2;
+ ib_invBothClusters; // nothing will happen for D0 on M1, vice versa
+ }
+
+ transition(S0, NB_AckS, S) {L1D0DataArrayWrite, L1D0TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} {
+ wi_writeIcache;
+ xi0_loadDone;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition(S1, NB_AckS, S) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} {
+ wi_writeIcache;
+ xi1_loadDone;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition(S0_C, NB_AckS, S_C) {L1D0DataArrayWrite,L2DataArrayWrite} {
+ wi_writeIcache;
+ xi0_loadDone;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition(S1_C, NB_AckS, S_C) {L1D1DataArrayWrite, L2DataArrayWrite} {
+ wi_writeIcache;
+ xi1_loadDone;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition(I_M0, NB_AckM, M0) {L1D0DataArrayWrite, L1D0TagArrayWrite,L2DataArrayWrite, L2TagArrayWrite} {
+ w0_writeDcache;
+ xs0_storeDone;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition(I_M1, NB_AckM, M1) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} {
+ w1_writeDcache;
+ xs1_storeDone;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ // THESE MO->M1 should not be instantaneous but oh well for now.
+ transition(I_M0M1, NB_AckM, M1) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} {
+ w0_writeDcache;
+ xs0_storeDone;
+ uu_sendUnblock;
+ i0_invCluster;
+ s1_storeDone;
+ pr_popResponseQueue;
+ }
+
+ transition(I_M1M0, NB_AckM, M0) {L1D0DataArrayWrite, L1D0TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} {
+ w1_writeDcache;
+ xs1_storeDone;
+ uu_sendUnblock;
+ i1_invCluster;
+ s0_storeDone;
+ pr_popResponseQueue;
+ }
+
+ // The transitions above should be more like this, which has some latency to xfer to L1
+ transition(I_M0Ms, NB_AckM, M0_Ms) {L1D0DataArrayWrite,L2DataArrayWrite} {
+ w0_writeDcache;
+ xs0_storeDone;
+ uu_sendUnblock;
+ f1_L2ToL1;
+ pr_popResponseQueue;
+ }
+
+ transition(I_M1Ms, NB_AckM, M1_Ms) {L1D1DataArrayWrite, L2DataArrayWrite} {
+ w1_writeDcache;
+ xs1_storeDone;
+ uu_sendUnblock;
+ f0_L2ToL1;
+ pr_popResponseQueue;
+ }
+
+ transition(I_E0S, NB_AckE, E0) {L1D0DataArrayWrite, L1D0TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} {
+ w0_writeDcache;
+ xl0_loadDone;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition(I_E1S, NB_AckE, E1) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} {
+ w1_writeDcache;
+ xl1_loadDone;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition(I_ES, NB_AckE, Es) {L1D1DataArrayWrite, L1D1TagArrayWrite, L1D0DataArrayWrite, L1D0TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite } {
+ w0_writeDcache;
+ xl0_loadDone;
+ w1_writeDcache;
+ xl1_loadDone;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition(I_E0S, NB_AckS, S) {L1D0DataArrayWrite, L1D0TagArrayWrite,L2DataArrayWrite, L2TagArrayWrite} {
+ w0_writeDcache;
+ xl0_loadDone;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition(I_E1S, NB_AckS, S) {L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayWrite, L2DataArrayWrite} {
+ w1_writeDcache;
+ xl1_loadDone;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition(I_ES, NB_AckS, S) {L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayWrite, L2DataArrayWrite} {
+ w0_writeDcache;
+ xl0_loadDone;
+ w1_writeDcache;
+ xl1_loadDone;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition(S_F0, L2_to_L1D0, S) {L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ c0_copyL2ToL1;
+ mru_setMRU;
+ l0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(S_F1, L2_to_L1D1, S) {L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ c1_copyL2ToL1;
+ mru_setMRU;
+ l1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(Si_F0, L2_to_L1I, S) {L1ITagArrayWrite, L1IDataArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ ci_copyL2ToL1;
+ mru_setMRU;
+ il0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(Si_F1, L2_to_L1I, S) {L1ITagArrayWrite, L1IDataArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ ci_copyL2ToL1;
+ mru_setMRU;
+ il1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(S_F, L2_to_L1D0, S_F1) { L1D0DataArrayWrite, L2DataArrayRead} {
+ c0_copyL2ToL1;
+ mru_setMRU;
+ l0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(S_F, L2_to_L1D1, S_F0) { L1D1DataArrayWrite, L2DataArrayRead} {
+ c1_copyL2ToL1;
+ mru_setMRU;
+ l1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(O_F0, L2_to_L1D0, O) { L1D0DataArrayWrite, L1D0TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ c0_copyL2ToL1;
+ mru_setMRU;
+ l0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(O_F1, L2_to_L1D1, O) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ c1_copyL2ToL1;
+ mru_setMRU;
+ l1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(O_F, L2_to_L1D0, O_F1) { L1D0DataArrayWrite, L2DataArrayRead} {
+ c0_copyL2ToL1;
+ mru_setMRU;
+ l0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(O_F, L2_to_L1D1, O_F0) { L1D1DataArrayWrite, L2DataArrayRead} {
+ c1_copyL2ToL1;
+ mru_setMRU;
+ l1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(M1_F, L2_to_L1D1, M1) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ c1_copyL2ToL1;
+ mru_setMRU;
+ l1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(M0_F, L2_to_L1D0, M0) {L1D0DataArrayWrite, L1D0TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ c0_copyL2ToL1;
+ mru_setMRU;
+ l0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(Ms_F0, L2_to_L1D0, Ms) {L1D0DataArrayWrite, L1D0TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ c0_copyL2ToL1;
+ mru_setMRU;
+ l0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(Ms_F1, L2_to_L1D1, Ms) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ c1_copyL2ToL1;
+ mru_setMRU;
+ l1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(Ms_F, L2_to_L1D0, Ms_F1) {L1D0DataArrayWrite, L2DataArrayRead} {
+ c0_copyL2ToL1;
+ mru_setMRU;
+ l0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ // Second half of the dual fill from Ms_F: copy L2 data into cluster 1's
+ // L1 *data* cache. The resource was wrongly the instruction cache's data
+ // array (L1IDataArrayWrite); corrected to L1D1DataArrayWrite to match the
+ // L2_to_L1D0 twin above, which declares L1D0DataArrayWrite.
+ transition(Ms_F, L2_to_L1D1, Ms_F0) {L1D1DataArrayWrite, L2DataArrayRead} {
+ c1_copyL2ToL1;
+ mru_setMRU;
+ l1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(M1_Ms, L2_to_L1D0, Ms) {L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ c0_copyL2ToL1;
+ mru_setMRU;
+ l0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(M0_Ms, L2_to_L1D1, Ms) {L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ c1_copyL2ToL1;
+ mru_setMRU;
+ l1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(Es_F0, L2_to_L1D0, Es) {L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ c0_copyL2ToL1;
+ mru_setMRU;
+ l0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(Es_F1, L2_to_L1D1, Es) {L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ c1_copyL2ToL1;
+ mru_setMRU;
+ l1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(Es_F, L2_to_L1D0, Es_F1) {L2TagArrayRead, L2DataArrayRead} {
+ c0_copyL2ToL1;
+ mru_setMRU;
+ l0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(Es_F, L2_to_L1D1, Es_F0) {L2TagArrayRead, L2DataArrayRead} {
+ c1_copyL2ToL1;
+ mru_setMRU;
+ l1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(E0_F, L2_to_L1D0, E0) {L2TagArrayRead, L2DataArrayRead} {
+ c0_copyL2ToL1;
+ mru_setMRU;
+ l0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(E1_F, L2_to_L1D1, E1) {L2TagArrayRead, L2DataArrayRead} {
+ c1_copyL2ToL1;
+ mru_setMRU;
+ l1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(E1_Es, L2_to_L1D0, Es) {L2TagArrayRead, L2DataArrayRead} {
+ c0_copyL2ToL1;
+ mru_setMRU;
+ l0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(E0_Es, L2_to_L1D1, Es) {L2TagArrayRead, L2DataArrayRead} {
+ c1_copyL2ToL1;
+ mru_setMRU;
+ l1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(IF_E0S, L2_to_L1D0, I_E0S) {} {
+ pt_popTriggerQueue;
+ }
+
+ transition(IF_E1S, L2_to_L1D1, I_E1S) {} {
+ pt_popTriggerQueue;
+ }
+
+ transition(IF_ES, L2_to_L1D0, IF1_ES) {} {
+ pt_popTriggerQueue;
+ }
+
+ transition(IF_ES, L2_to_L1D1, IF0_ES) {} {
+ pt_popTriggerQueue;
+ }
+
+ transition(IF0_ES, L2_to_L1D0, I_ES) {} {
+ pt_popTriggerQueue;
+ }
+
+ transition(IF1_ES, L2_to_L1D1, I_ES) {} {
+ pt_popTriggerQueue;
+ }
+
+ transition(F_S0, L2_to_L1I, S0) {} {
+ pt_popTriggerQueue;
+ }
+
+ transition(F_S1, L2_to_L1I, S1) {} {
+ pt_popTriggerQueue;
+ }
+
+ transition({S_M0, O_M0}, NB_AckM, M0) {L1D0TagArrayWrite, L1D0DataArrayWrite, L2DataArrayWrite, L2TagArrayWrite} {
+ mru_setMRU;
+ xs0_storeDone;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition({S_M1, O_M1}, NB_AckM, M1) {L1D1TagArrayWrite, L1D1DataArrayWrite, L2DataArrayWrite, L2TagArrayWrite} {
+ mru_setMRU;
+ xs1_storeDone;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition(MO_I, NB_AckWB, I) {L2TagArrayWrite} {
+ wb_data;
+ d_deallocateTBE;
+ pr_popResponseQueue;
+ }
+
+ transition(ES_I, NB_AckWB, I) {L2TagArrayWrite} {
+ wb_data;
+ d_deallocateTBE;
+ pr_popResponseQueue;
+ }
+
+ transition(MO_S0, NB_AckWB, S0) {L2TagArrayWrite} {
+ wb_data;
+ i2_invL2;
+ a2_allocateL2;
+ d_deallocateTBE; // FOO
+ nS_issueRdBlkS;
+ pr_popResponseQueue;
+ }
+
+ transition(MO_S1, NB_AckWB, S1) {L2TagArrayWrite} {
+ wb_data;
+ i2_invL2;
+ a2_allocateL2;
+ d_deallocateTBE; // FOO
+ nS_issueRdBlkS;
+ pr_popResponseQueue;
+ }
+
+ // Writeback cancel "ack"
+ transition(I_C, NB_AckWB, I) {L2TagArrayWrite} {
+ ss_sendStaleNotification;
+ d_deallocateTBE;
+ pr_popResponseQueue;
+ }
+
+ transition(S0_C, NB_AckWB, S0) {L2TagArrayWrite} {
+ ss_sendStaleNotification;
+ pr_popResponseQueue;
+ }
+
+ transition(S1_C, NB_AckWB, S1) {L2TagArrayWrite} {
+ ss_sendStaleNotification;
+ pr_popResponseQueue;
+ }
+
+ transition(S_C, NB_AckWB, S) {L2TagArrayWrite} {
+ ss_sendStaleNotification;
+ pr_popResponseQueue;
+ }
+
+ // Begin Probe Transitions
+
+ transition({Ms, M0, M1, O}, PrbInvData, I) {L2TagArrayRead, L2TagArrayWrite, L2DataArrayRead} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pd_sendProbeResponseData;
+ i2_invL2;
+ ib_invBothClusters;
+ pp_popProbeQueue;
+ }
+
+ transition({Es, E0, E1, S, I}, PrbInvData, I) {L2TagArrayRead, L2TagArrayWrite} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pi_sendProbeResponseInv;
+ i2_invL2;
+ ib_invBothClusters;
+ ii_invIcache; // only relevant for S
+ pp_popProbeQueue;
+ }
+
+ transition(S_C, PrbInvData, I_C) {L2TagArrayWrite} {
+ t_allocateTBE;
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pi_sendProbeResponseInv;
+ i2_invL2;
+ ib_invBothClusters;
+ ii_invIcache;
+ pp_popProbeQueue;
+ }
+
+ transition(I_C, PrbInvData, I_C) {} {
+ pi_sendProbeResponseInv;
+ ib_invBothClusters;
+ pp_popProbeQueue;
+ }
+
+ transition({Ms, M0, M1, O, Es, E0, E1, S, I}, PrbInv, I) {L2TagArrayRead, L2TagArrayWrite} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pi_sendProbeResponseInv;
+ i2_invL2; // nothing will happen in I
+ ib_invBothClusters;
+ ii_invIcache;
+ pp_popProbeQueue;
+ }
+
+ transition(S_C, PrbInv, I_C) {L2TagArrayWrite} {
+ t_allocateTBE;
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pi_sendProbeResponseInv;
+ i2_invL2;
+ ib_invBothClusters;
+ ii_invIcache;
+ pp_popProbeQueue;
+ }
+
+ transition(I_C, PrbInv, I_C) {} {
+ pi_sendProbeResponseInv;
+ ib_invBothClusters;
+ ii_invIcache;
+ pp_popProbeQueue;
+ }
+
+ transition({Ms, M0, M1, O}, PrbShrData, O) {L2TagArrayRead, L2TagArrayWrite, L2DataArrayRead} {
+ pd_sendProbeResponseData;
+ pp_popProbeQueue;
+ }
+
+ transition({Es, E0, E1, S}, PrbShrData, S) {L2TagArrayRead, L2TagArrayWrite} {
+ ph_sendProbeResponseHit;
+ pp_popProbeQueue;
+ }
+
+ transition(S_C, PrbShrData) {} {
+ ph_sendProbeResponseHit;
+ pp_popProbeQueue;
+ }
+
+ transition({I, I_C}, PrbShrData) {L2TagArrayRead} {
+ pb_sendProbeResponseBackprobe;
+ pp_popProbeQueue;
+ }
+
+ transition({I_M0, I_E0S}, {PrbInv, PrbInvData}) {} {
+ pi_sendProbeResponseInv;
+ ib_invBothClusters; // must invalidate current data (only relevant for I_M0)
+ a0_allocateL1D; // but make sure there is room for incoming data when it arrives
+ pp_popProbeQueue;
+ }
+
+ transition({I_M1, I_E1S}, {PrbInv, PrbInvData}) {} {
+ pi_sendProbeResponseInv;
+ ib_invBothClusters; // must invalidate current data (only relevant for I_M1)
+ a1_allocateL1D; // but make sure there is room for incoming data when it arrives
+ pp_popProbeQueue;
+ }
+
+ transition({I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_ES}, {PrbInv, PrbInvData, PrbShrData}) {} {
+ pi_sendProbeResponseInv;
+ ib_invBothClusters;
+ a0_allocateL1D;
+ a1_allocateL1D;
+ pp_popProbeQueue;
+ }
+
+ transition({I_M0, I_E0S, I_M1, I_E1S}, PrbShrData) {} {
+ pb_sendProbeResponseBackprobe;
+ pp_popProbeQueue;
+ }
+
+  // --- Probes racing with an in-flight victim writeback (ES_I / MO_I) ---
+  // The block's data lives in the TBE while the writeback is pending, so
+  // dirty responses come from the TBE, and the line itself is invalidated.
+
+  transition(ES_I, PrbInvData, I_C) {} {
+    pi_sendProbeResponseInv;
+    ib_invBothClusters;
+    ii_invIcache;
+    pp_popProbeQueue;
+  }
+
+  transition(MO_I, PrbInvData, I_C) {} {
+    pdt_sendProbeResponseDataFromTBE; // dirty data must come from the TBE copy
+    ib_invBothClusters;
+    ii_invIcache;
+    pp_popProbeQueue;
+  }
+
+  transition(MO_I, PrbInv, I_C) {} {
+    pi_sendProbeResponseInv;
+    ib_invBothClusters;
+    ii_invIcache;
+    pp_popProbeQueue;
+  }
+
+  transition(ES_I, PrbInv, I_C) {} {
+    pi_sendProbeResponseInv;
+    ib_invBothClusters;
+    ii_invIcache;
+    pp_popProbeQueue;
+  }
+
+  // Downgrade probes only flip the Shared bit on the pending victim.
+  transition(ES_I, PrbShrData, ES_I) {} {
+    ph_sendProbeResponseHit;
+    s_setSharedFlip;
+    pp_popProbeQueue;
+  }
+
+  transition(MO_I, PrbShrData, MO_I) {} {
+    pdt_sendProbeResponseDataFromTBE;
+    s_setSharedFlip;
+    pp_popProbeQueue;
+  }
+
+  // --- Probes racing with an Ifetch-triggered writeback (MO_S0 / MO_S1) ---
+  // The probe invalidates everything, then the pending RdBlkS is reissued.
+
+  transition(MO_S0, PrbInvData, S0_C) {L2TagArrayWrite} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pdt_sendProbeResponseDataFromTBE;
+    i2_invL2;
+    a2_allocateL2;
+    d_deallocateTBE;
+    nS_issueRdBlkS; // restart the instruction fetch that caused the WB
+    pp_popProbeQueue;
+  }
+
+  transition(MO_S1, PrbInvData, S1_C) {L2TagArrayWrite} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pdt_sendProbeResponseDataFromTBE;
+    i2_invL2;
+    a2_allocateL2;
+    d_deallocateTBE;
+    nS_issueRdBlkS;
+    pp_popProbeQueue;
+  }
+
+  transition(MO_S0, PrbInv, S0_C) {L2TagArrayWrite} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pi_sendProbeResponseInv;
+    i2_invL2;
+    a2_allocateL2;
+    d_deallocateTBE;
+    nS_issueRdBlkS;
+    pp_popProbeQueue;
+  }
+
+  transition(MO_S1, PrbInv, S1_C) {L2TagArrayWrite} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pi_sendProbeResponseInv;
+    i2_invL2;
+    a2_allocateL2;
+    d_deallocateTBE;
+    nS_issueRdBlkS;
+    pp_popProbeQueue;
+  }
+
+  // Downgrade probe: answer with TBE data and mark the victim shared.
+  transition({MO_S0, MO_S1}, PrbShrData) {} {
+    pdt_sendProbeResponseDataFromTBE;
+    s_setSharedFlip;
+    pp_popProbeQueue;
+  }
+
+  // --- Invalidating probes against clean fill-in-progress states ---
+  // Drop everything, re-reserve the caches the fill will need, and reissue
+  // the read so the requester still gets its data after the invalidation.
+
+  transition({S_F0, Es_F0, E0_F, E1_Es}, {PrbInvData, PrbInv}, IF_E0S) {} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pi_sendProbeResponseInv;
+    // invalidate everything you've got
+    ib_invBothClusters;
+    ii_invIcache;
+    i2_invL2;
+    // but make sure you have room for what you need from the fill
+    a0_allocateL1D;
+    a2_allocateL2;
+    n_issueRdBlk;
+    pp_popProbeQueue;
+  }
+
+  transition({S_F1, Es_F1, E1_F, E0_Es}, {PrbInvData, PrbInv}, IF_E1S) {} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pi_sendProbeResponseInv;
+    // invalidate everything you've got
+    ib_invBothClusters;
+    ii_invIcache;
+    i2_invL2;
+    // but make sure you have room for what you need from the fill
+    a1_allocateL1D;
+    a2_allocateL2;
+    n_issueRdBlk;
+    pp_popProbeQueue;
+  }
+
+  transition({S_F, Es_F}, {PrbInvData, PrbInv}, IF_ES) {} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pi_sendProbeResponseInv;
+    // invalidate everything you've got
+    ib_invBothClusters;
+    ii_invIcache;
+    i2_invL2;
+    // but make sure you have room for what you need from the fill
+    a0_allocateL1D;
+    a1_allocateL1D;
+    a2_allocateL2;
+    n_issueRdBlk;
+    pp_popProbeQueue;
+  }
+
+  // Instruction-fetch fills reissue as RdBlkS and reserve the L1I instead.
+  transition(Si_F0, {PrbInvData, PrbInv}, F_S0) {} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pi_sendProbeResponseInv;
+    ib_invBothClusters;
+    ii_invIcache;
+    i2_invL2;
+    ai_allocateL1I;
+    a2_allocateL2;
+    nS_issueRdBlkS;
+    pp_popProbeQueue;
+  }
+
+  transition(Si_F1, {PrbInvData, PrbInv}, F_S1) {} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pi_sendProbeResponseInv;
+    ib_invBothClusters;
+    ii_invIcache;
+    i2_invL2;
+    ai_allocateL1I;
+    a2_allocateL2;
+    nS_issueRdBlkS;
+    pp_popProbeQueue;
+  }
+
+  // --- Downgrade probes against clean fill states: demote E-flavored fills
+  // to their S-flavored counterparts; no data owed since the line is clean.
+
+  transition({Es_F0, E0_F, E1_Es}, PrbShrData, S_F0) {} {
+    ph_sendProbeResponseHit;
+    pp_popProbeQueue;
+  }
+
+  transition({Es_F1, E1_F, E0_Es}, PrbShrData, S_F1) {} {
+    ph_sendProbeResponseHit;
+    pp_popProbeQueue;
+  }
+
+  transition(Es_F, PrbShrData, S_F) {} {
+    ph_sendProbeResponseHit;
+    pp_popProbeQueue;
+  }
+
+  // Already-shared fill states keep their state on a downgrade probe.
+  transition({S_F0, S_F1, S_F, Si_F0, Si_F1}, PrbShrData) {} {
+    ph_sendProbeResponseHit;
+    pp_popProbeQueue;
+  }
+
+  // --- Probes racing with an upgrade-to-M (S_M*/O_M*) ---
+  // The upgrade stays pending (target I_M*); local copies are invalidated
+  // and the caches are re-reserved for the modified data when it arrives.
+
+  transition(S_M0, PrbInvData, I_M0) {} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pim_sendProbeResponseInvMs;
+    ib_invBothClusters;
+    ii_invIcache;
+    i2_invL2;
+    a0_allocateL1D;
+    a2_allocateL2;
+    pp_popProbeQueue;
+  }
+
+  transition(O_M0, PrbInvData, I_M0) {L2DataArrayRead} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pdm_sendProbeResponseDataMs; // O owns the data, so it must be returned
+    ib_invBothClusters;
+    ii_invIcache;
+    i2_invL2;
+    a0_allocateL1D;
+    a2_allocateL2;
+    pp_popProbeQueue;
+  }
+
+  transition({S_M0, O_M0}, {PrbInv}, I_M0) {} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pim_sendProbeResponseInvMs;
+    ib_invBothClusters;
+    ii_invIcache;
+    i2_invL2;
+    a0_allocateL1D;
+    a2_allocateL2;
+    pp_popProbeQueue;
+  }
+
+  transition(S_M1, PrbInvData, I_M1) {} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pim_sendProbeResponseInvMs;
+    ib_invBothClusters;
+    ii_invIcache;
+    i2_invL2;
+    a1_allocateL1D;
+    a2_allocateL2;
+    pp_popProbeQueue;
+  }
+
+  transition(O_M1, PrbInvData, I_M1) {} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pdm_sendProbeResponseDataMs;
+    ib_invBothClusters;
+    ii_invIcache;
+    i2_invL2;
+    a1_allocateL1D;
+    a2_allocateL2;
+    pp_popProbeQueue;
+  }
+
+  transition({S_M1, O_M1}, {PrbInv}, I_M1) {} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pim_sendProbeResponseInvMs;
+    ib_invBothClusters;
+    ii_invIcache;
+    i2_invL2;
+    a1_allocateL1D;
+    a2_allocateL2;
+    pp_popProbeQueue;
+  }
+
+  // --- Probes racing with a pending instruction fetch (S0/S1 and _C forms) ---
+  // Invalidate local copies and re-reserve L1I/L2 for the returning Ifetch.
+
+  transition({S0, S0_C}, {PrbInvData, PrbInv}) {} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pi_sendProbeResponseInv;
+    ib_invBothClusters;
+    ii_invIcache;
+    i2_invL2;
+    ai_allocateL1I;
+    a2_allocateL2;
+    pp_popProbeQueue;
+  }
+
+  transition({S1, S1_C}, {PrbInvData, PrbInv}) {} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pi_sendProbeResponseInv;
+    ib_invBothClusters;
+    ii_invIcache;
+    i2_invL2;
+    ai_allocateL1I;
+    a2_allocateL2;
+    pp_popProbeQueue;
+  }
+
+  // Downgrade probes during an upgrade: clean (S_M*) replies hit, dirty
+  // (O_M*) replies with data from the L2 data array.
+  transition({S_M0, S_M1}, PrbShrData) {} {
+    ph_sendProbeResponseHit;
+    pp_popProbeQueue;
+  }
+
+  transition({O_M0, O_M1}, PrbShrData) {L2DataArrayRead} {
+    pd_sendProbeResponseData;
+    pp_popProbeQueue;
+  }
+
+  transition({S0, S1, S0_C, S1_C}, PrbShrData) {} {
+    pb_sendProbeResponseBackprobe;
+    pp_popProbeQueue;
+  }
+
+  // --- Invalidating probes against dirty fill-in-progress states ---
+  // Dirty data is surrendered (PrbInvData) or just dropped (PrbInv), the
+  // caches are re-reserved, and the read is reissued for the waiting core.
+
+  transition({Ms_F0, M0_F, M1_Ms, O_F0}, PrbInvData, IF_E0S) {L2DataArrayRead} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pd_sendProbeResponseData;
+    ib_invBothClusters;
+    i2_invL2;
+    a0_allocateL1D;
+    a2_allocateL2;
+    n_issueRdBlk;
+    pp_popProbeQueue;
+  }
+
+  transition({Ms_F1, M1_F, M0_Ms, O_F1}, PrbInvData, IF_E1S) {L2DataArrayRead} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pd_sendProbeResponseData;
+    ib_invBothClusters;
+    i2_invL2;
+    a1_allocateL1D;
+    a2_allocateL2;
+    n_issueRdBlk;
+    pp_popProbeQueue;
+  }
+
+  transition({Ms_F, O_F}, PrbInvData, IF_ES) {L2DataArrayRead} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pd_sendProbeResponseData;
+    ib_invBothClusters;
+    i2_invL2;
+    a0_allocateL1D;
+    a1_allocateL1D;
+    a2_allocateL2;
+    n_issueRdBlk;
+    pp_popProbeQueue;
+  }
+
+  transition({Ms_F0, M0_F, M1_Ms, O_F0}, PrbInv, IF_E0S) {} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pi_sendProbeResponseInv;
+    ib_invBothClusters;
+    i2_invL2;
+    a0_allocateL1D;
+    a2_allocateL2;
+    n_issueRdBlk;
+    pp_popProbeQueue;
+  }
+
+  transition({Ms_F1, M1_F, M0_Ms, O_F1}, PrbInv, IF_E1S) {} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pi_sendProbeResponseInv;
+    ib_invBothClusters;
+    i2_invL2;
+    a1_allocateL1D;
+    a2_allocateL2;
+    n_issueRdBlk;
+    pp_popProbeQueue;
+  }
+
+  transition({Ms_F, O_F}, PrbInv, IF_ES) {} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pi_sendProbeResponseInv;
+    ib_invBothClusters;
+    i2_invL2;
+    a0_allocateL1D;
+    a1_allocateL1D;
+    a2_allocateL2;
+    n_issueRdBlk;
+    pp_popProbeQueue;
+  }
+
+  // Downgrade probe against a dirty fill: supply data and demote to owned-fill.
+  transition({Ms_F0, M0_F, M1_Ms}, PrbShrData, O_F0) {L2DataArrayRead} {
+    pd_sendProbeResponseData;
+    pp_popProbeQueue;
+  }
+
+  // Downgrade probe against a dirty cluster-1 fill: supply data and demote to
+  // owned-fill, symmetric with the cluster-0 ({Ms_F0, M0_F, M1_Ms}) case above.
+  // BUG FIX: the original body was empty, so the probe response was never sent
+  // and the probe queue was never popped, stalling all subsequent probes and
+  // leaving the directory waiting forever for this responder.
+  transition({Ms_F1, M1_F, M0_Ms}, PrbShrData, O_F1) {L2DataArrayRead} {
+    pd_sendProbeResponseData;
+    pp_popProbeQueue;
+  }
+
+  // Remaining downgrade probes: dirty/owned fill states always supply data.
+  transition({Ms_F}, PrbShrData, O_F) {L2DataArrayRead} {
+    pd_sendProbeResponseData;
+    pp_popProbeQueue;
+  }
+
+  transition({O_F0, O_F1, O_F}, PrbShrData) {L2DataArrayRead} {
+    pd_sendProbeResponseData;
+    pp_popProbeQueue;
+  }
+
+  // END TRANSITIONS
+}
+
+
diff --git a/src/mem/protocol/MOESI_AMD_Base-L3cache.sm b/src/mem/protocol/MOESI_AMD_Base-L3cache.sm
new file mode 100644
index 000000000..479cf4e78
--- /dev/null
+++ b/src/mem/protocol/MOESI_AMD_Base-L3cache.sm
@@ -0,0 +1,1130 @@
+/*
+ * Copyright (c) 2010-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Lisa Hsu
+ */
+
+// L3 cache controller for the MOESI_AMD_Base protocol.
+machine(MachineType:L3Cache, "L3")
+ : CacheMemory * L3cache;          // backing tag/data storage for the L3
+   WireBuffer * reqToDir;          // requests forwarded L3 -> directory
+   WireBuffer * respToDir;         // probe/WB responses L3 -> directory
+   WireBuffer * l3UnblockToDir;    // unblock notifications L3 -> directory
+   WireBuffer * reqToL3;           // CPU/L2 requests arriving at the L3
+   WireBuffer * probeToL3;         // probes arriving at the L3
+   WireBuffer * respToL3;          // memory-side responses (e.g. WBAck)
+   Cycles l3_request_latency := 1;
+   Cycles l3_response_latency := 35;
+
+  // To the general response network
+  MessageBuffer * responseFromL3, network="To", virtual_network="2", ordered="false", vnet_type="response";
+
+  // From the general response network
+  MessageBuffer * responseToL3, network="From", virtual_network="2", ordered="false", vnet_type="response";
+
+{
+  // EVENTS
+  enumeration(Event, desc="L3 Events") {
+    // Requests coming from the Cores
+    RdBlk, desc="CPU RdBlk event";
+    RdBlkM, desc="CPU RdBlkM event";
+    RdBlkS, desc="CPU RdBlkS event";
+    CtoD, desc="Change to Dirty request";
+    // Fixed copy-paste descriptions: the *Shared victims previously carried
+    // the same desc as their non-shared counterparts.
+    WrVicBlk, desc="L2 Victim (dirty)";
+    WrVicBlkShared, desc="L2 Victim (dirty, shared)";
+    ClVicBlk, desc="L2 Victim (clean)";
+    ClVicBlkShared, desc="L2 Victim (clean, shared)";
+
+    CPUData, desc="WB data from CPU";
+    CPUDataShared, desc="WB data from CPU, NBReqShared 1";
+    StaleWB, desc="WB stale; no data";
+
+    L3_Repl, desc="L3 Replacement";
+
+    // Probes
+    PrbInvData, desc="Invalidating probe, return dirty data";
+    PrbInv, desc="Invalidating probe, no need to return data";
+    PrbShrData, desc="Downgrading probe, return data";
+
+    // Coming from Memory Controller
+    WBAck, desc="ack from memory";
+
+    CancelWB, desc="Cancel WB from L2";
+  }
+
+  // STATES
+  // Base States:
+  state_declaration(State, desc="L3 State", default="L3Cache_State_I") {
+    M, AccessPermission:Read_Write, desc="Modified"; // No other cache has copy, memory stale
+    O, AccessPermission:Read_Only, desc="Owned"; // Correct most recent copy, others may exist in S
+    E, AccessPermission:Read_Write, desc="Exclusive"; // Correct, most recent, and only copy (and == Memory)
+    S, AccessPermission:Read_Only, desc="Shared"; // Correct, most recent. If no one in O, then == Memory
+    I, AccessPermission:Invalid, desc="Invalid";
+
+    // Transient victim-writeback states, named <current>_<next>: an Ack has
+    // been sent for an L2 victim and the L3 is waiting for the data.
+    // Descriptions below are corrected to match the transitions that actually
+    // enter each state (see the WrVicBlk/ClVicBlk transitions).
+    I_M, AccessPermission:Busy, desc="Invalid, received WrVicBlk, sent Ack, waiting for Data";
+    I_O, AccessPermission:Busy, desc="Invalid, received WrVicBlkShared, sent Ack, waiting for Data";
+    I_E, AccessPermission:Busy, desc="Invalid, received ClVicBlk, sent Ack, waiting for Data";
+    I_S, AccessPermission:Busy, desc="Invalid, received ClVicBlkShared, sent Ack, waiting for Data";
+    S_M, AccessPermission:Busy, desc="received WrVicBlk, sent Ack, waiting for Data, then go to M";
+    S_O, AccessPermission:Busy, desc="received WrVicBlkShared, sent Ack, waiting for Data, then go to O";
+    S_E, AccessPermission:Busy, desc="Shared, received ClVicBlk, sent Ack, waiting for Data, then go to E";
+    S_S, AccessPermission:Busy, desc="Shared, received ClVicBlkShared, sent Ack, waiting for Data, then go to S";
+    E_M, AccessPermission:Busy, desc="received WrVicBlk, sent Ack, waiting for Data, then go to M";
+    E_O, AccessPermission:Busy, desc="received WrVicBlkShared, sent Ack, waiting for Data, then go to O";
+    E_E, AccessPermission:Busy, desc="received ClVicBlk, sent Ack, waiting for Data, then go to E";
+    E_S, AccessPermission:Busy, desc="received ClVicBlkShared, sent Ack, waiting for Data, then go to S";
+    O_M, AccessPermission:Busy, desc="received WrVicBlk, sent Ack, waiting for Data, then go to M";
+    O_O, AccessPermission:Busy, desc="received WrVicBlkShared, sent Ack, waiting for Data, then go to O";
+    O_E, AccessPermission:Busy, desc="received ClVicBlk, sent Ack, waiting for Data, then go to E";
+    O_S, AccessPermission:Busy, desc="received ClVicBlkShared, sent Ack, waiting for Data, then go to S";
+    M_M, AccessPermission:Busy, desc="received WrVicBlk, sent Ack, waiting for Data, then go to M";
+    M_O, AccessPermission:Busy, desc="received WrVicBlkShared, sent Ack, waiting for Data, then go to O";
+    M_E, AccessPermission:Busy, desc="received ClVicBlk, sent Ack, waiting for Data, then go to E";
+    M_S, AccessPermission:Busy, desc="received ClVicBlkShared, sent Ack, waiting for Data, then go to S";
+    D_I, AccessPermission:Invalid, desc="drop WB data on the floor when received";
+    MOD_I, AccessPermission:Busy, desc="drop WB data on the floor, waiting for WBAck from Mem";
+    MO_I, AccessPermission:Busy, desc="M or O, received L3_Repl, waiting for WBAck from Mem";
+    I_I, AccessPermission:Busy, desc="MO_I received L3_Repl";
+    I_CD, AccessPermission:Busy, desc="I_I received WBAck, now just waiting for CPUData";
+    I_C, AccessPermission:Invalid, desc="sent cancel, just waiting to receive mem wb ack so nothing gets confused";
+  }
+
+  enumeration(RequestType, desc="To communicate stats from transitions to recordStats") {
+    DataArrayRead, desc="Read the data array";
+    DataArrayWrite, desc="Write the data array";
+    // Fixed copy-paste bug: the tag-array entries previously said "data array".
+    TagArrayRead, desc="Read the tag array";
+    TagArrayWrite, desc="Write the tag array";
+  }
+
+  // STRUCTURES
+
+  // One L3 cache line: coherence state plus data.
+  structure(Entry, desc="...", interface="AbstractCacheEntry") {
+    State CacheState, desc="cache state";
+    bool Dirty, desc="Is the data dirty (diff from memory?)";
+    DataBlock DataBlk, desc="Data for the block";
+  }
+
+  // Per-address bookkeeping for in-flight transactions (victims, writebacks).
+  structure(TBE, desc="...") {
+    State TBEState, desc="Transient state";
+    DataBlock DataBlk, desc="data for the block";
+    bool Dirty, desc="Is the data dirty?";
+    bool Shared, desc="Victim hit by shared probe";
+    MachineID From, desc="Waiting for writeback from...";
+  }
+
+  // Implemented externally by the generated controller.
+  structure(TBETable, external="yes") {
+    TBE lookup(Addr);
+    void allocate(Addr);
+    void deallocate(Addr);
+    bool isPresent(Addr);
+  }
+
+  TBETable TBEs, template="<L3Cache_TBE>", constructor="m_number_of_TBEs";
+
+  // Controller-provided helpers (bodies generated by SLICC).
+  void set_cache_entry(AbstractCacheEntry b);
+  void unset_cache_entry();
+  void set_tbe(TBE b);
+  void unset_tbe();
+  void wakeUpAllBuffers();
+  void wakeUpBuffers(Addr a);
+
+
+  // FUNCTION DEFINITIONS
+  Tick clockEdge();
+  Tick cyclesToTicks(Cycles c);
+
+  // Look up the L3 entry for addr; invalid pointer if not cached.
+  Entry getCacheEntry(Addr addr), return_by_pointer="yes" {
+    return static_cast(Entry, "pointer", L3cache.lookup(addr));
+  }
+
+  DataBlock getDataBlock(Addr addr), return_by_ref="yes" {
+    return getCacheEntry(addr).DataBlk;
+  }
+
+  // True when addr already has a tag or a victim-free way is available.
+  bool presentOrAvail(Addr addr) {
+    return L3cache.isTagPresent(addr) || L3cache.cacheAvail(addr);
+  }
+
+  // A pending TBE's transient state takes priority over the cached state.
+  State getState(TBE tbe, Entry cache_entry, Addr addr) {
+    if (is_valid(tbe)) {
+      return tbe.TBEState;
+    } else if (is_valid(cache_entry)) {
+      return cache_entry.CacheState;
+    }
+    return State:I;
+  }
+
+  void setState(TBE tbe, Entry cache_entry, Addr addr, State state) {
+    if (is_valid(tbe)) {
+      tbe.TBEState := state;
+    }
+
+    if (is_valid(cache_entry)) {
+      cache_entry.CacheState := state;
+    }
+  }
+
+  // Functional access: serve from an in-flight TBE copy if one exists,
+  // otherwise fall through to memory.
+  void functionalRead(Addr addr, Packet *pkt) {
+    TBE tbe := TBEs.lookup(addr);
+    if(is_valid(tbe)) {
+      testAndRead(addr, tbe.DataBlk, pkt);
+    } else {
+      functionalMemoryRead(pkt);
+    }
+  }
+
+  int functionalWrite(Addr addr, Packet *pkt) {
+    int num_functional_writes := 0;
+
+    TBE tbe := TBEs.lookup(addr);
+    if(is_valid(tbe)) {
+      num_functional_writes := num_functional_writes +
+        testAndWrite(addr, tbe.DataBlk, pkt);
+    }
+
+    // Always propagate to memory as well so all copies stay consistent.
+    num_functional_writes := num_functional_writes +
+      functionalMemoryWrite(pkt);
+    return num_functional_writes;
+  }
+
+  AccessPermission getAccessPermission(Addr addr) {
+    TBE tbe := TBEs.lookup(addr);
+    if(is_valid(tbe)) {
+      return L3Cache_State_to_permission(tbe.TBEState);
+    }
+
+    Entry cache_entry := getCacheEntry(addr);
+    if(is_valid(cache_entry)) {
+      return L3Cache_State_to_permission(cache_entry.CacheState);
+    }
+
+    return AccessPermission:NotPresent;
+  }
+
+  void setAccessPermission(Entry cache_entry, Addr addr, State state) {
+    if (is_valid(cache_entry)) {
+      cache_entry.changePermission(L3Cache_State_to_permission(state));
+    }
+  }
+
+  // Stats recording intentionally a no-op for this controller.
+  void recordRequestType(RequestType request_type, Addr addr) {
+
+  }
+
+  // Resource checks always succeed; array contention is not modeled here.
+  bool checkResourceAvailable(RequestType request_type, Addr addr) {
+    return true;
+  }
+
+
+ // OUT PORTS
+ out_port(requestNetwork_out, CPURequestMsg, reqToDir);
+ out_port(L3Resp_out, ResponseMsg, respToDir);
+ out_port(responseNetwork_out, ResponseMsg, responseFromL3);
+ out_port(unblockNetwork_out, UnblockMsg, l3UnblockToDir);
+
+ // IN PORTS
+ in_port(NBResponse_in, ResponseMsg, respToL3) {
+ if (NBResponse_in.isReady(clockEdge())) {
+ peek(NBResponse_in, ResponseMsg) {
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ if (in_msg.Type == CoherenceResponseType:NBSysWBAck) {
+ trigger(Event:WBAck, in_msg.addr, cache_entry, tbe);
+ } else {
+ DPRINTF(RubySlicc, "%s\n", in_msg);
+ error("Error on NBResponse Type");
+ }
+ }
+ }
+ }
+
+ // Response Network
+ in_port(responseNetwork_in, ResponseMsg, responseToL3) {
+ if (responseNetwork_in.isReady(clockEdge())) {
+ peek(responseNetwork_in, ResponseMsg) {
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ if (in_msg.Type == CoherenceResponseType:CPUData) {
+ if (in_msg.NbReqShared) {
+ trigger(Event:CPUDataShared, in_msg.addr, cache_entry, tbe);
+ } else {
+ trigger(Event:CPUData, in_msg.addr, cache_entry, tbe);
+ }
+ } else if (in_msg.Type == CoherenceResponseType:StaleNotif) {
+ trigger(Event:StaleWB, in_msg.addr, cache_entry, tbe);
+ } else {
+ DPRINTF(RubySlicc, "%s\n", in_msg);
+ error("Error on NBResponse Type");
+ }
+ }
+ }
+ }
+
+ // probe network
+ in_port(probeNetwork_in, NBProbeRequestMsg, probeToL3) {
+ if (probeNetwork_in.isReady(clockEdge())) {
+ peek(probeNetwork_in, NBProbeRequestMsg) {
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ if (in_msg.Type == ProbeRequestType:PrbInv) {
+ if (in_msg.ReturnData) {
+ trigger(Event:PrbInvData, in_msg.addr, cache_entry, tbe);
+ } else {
+ trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe);
+ }
+ } else if (in_msg.Type == ProbeRequestType:PrbDowngrade) {
+ if (in_msg.ReturnData) {
+ trigger(Event:PrbShrData, in_msg.addr, cache_entry, tbe);
+ } else {
+ error("Don't think I should get any of these");
+ }
+ }
+ }
+ }
+ }
+
+ // Request Network
+ in_port(requestNetwork_in, CPURequestMsg, reqToL3) {
+ if (requestNetwork_in.isReady(clockEdge())) {
+ peek(requestNetwork_in, CPURequestMsg) {
+ assert(in_msg.Destination.isElement(machineID));
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ if (in_msg.Type == CoherenceRequestType:RdBlk) {
+ trigger(Event:RdBlk, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:RdBlkS) {
+ trigger(Event:RdBlkS, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:RdBlkM) {
+ trigger(Event:RdBlkM, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:VicClean) {
+ if (presentOrAvail(in_msg.addr)) {
+ if (in_msg.Shared) {
+ trigger(Event:ClVicBlkShared, in_msg.addr, cache_entry, tbe);
+ } else {
+ trigger(Event:ClVicBlk, in_msg.addr, cache_entry, tbe);
+ }
+ } else {
+ Addr victim := L3cache.cacheProbe(in_msg.addr);
+ trigger(Event:L3_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
+ }
+ } else if (in_msg.Type == CoherenceRequestType:VicDirty) {
+ if (presentOrAvail(in_msg.addr)) {
+ if (in_msg.Shared) {
+ trigger(Event:WrVicBlkShared, in_msg.addr, cache_entry, tbe);
+ } else {
+ trigger(Event:WrVicBlk, in_msg.addr, cache_entry, tbe);
+ }
+ } else {
+ Addr victim := L3cache.cacheProbe(in_msg.addr);
+ trigger(Event:L3_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
+ }
+ } else if (in_msg.Type == CoherenceRequestType:WrCancel) {
+ if (is_valid(tbe) && tbe.From == in_msg.Requestor) {
+ trigger(Event:CancelWB, in_msg.addr, cache_entry, tbe);
+ } else {
+ requestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+ }
+ }
+ }
+ }
+ }
+
+  // BEGIN ACTIONS
+
+  action(i_invL3, "i", desc="invalidate L3 cache block") {
+    if (is_valid(cache_entry)) {
+      L3cache.deallocate(address);
+    }
+    unset_cache_entry();
+  }
+
+  // Reply to the requestor granting Modified ownership of the cached data.
+  action(rm_sendResponseM, "rm", desc="send Modified response") {
+    peek(requestNetwork_in, CPURequestMsg) {
+      enqueue(responseNetwork_out, ResponseMsg, l3_response_latency) {
+        out_msg.addr := address;
+        out_msg.Type := CoherenceResponseType:NBSysResp;
+        out_msg.Sender := machineID;
+        out_msg.Destination.add(in_msg.Requestor);
+        out_msg.DataBlk := cache_entry.DataBlk;
+        out_msg.MessageSize := MessageSizeType:Response_Data;
+        out_msg.Dirty := cache_entry.Dirty;
+        out_msg.State := CoherenceState:Modified;
+        DPRINTF(RubySlicc, "%s\n", out_msg);
+      }
+    }
+  }
+
+  // Reply to the requestor with a Shared copy of the cached data.
+  action(rs_sendResponseS, "rs", desc="send Shared response") {
+    peek(requestNetwork_in, CPURequestMsg) {
+      enqueue(responseNetwork_out, ResponseMsg, l3_response_latency) {
+        out_msg.addr := address;
+        out_msg.Type := CoherenceResponseType:NBSysResp;
+        out_msg.Sender := machineID;
+        out_msg.Destination.add(in_msg.Requestor);
+        out_msg.DataBlk := cache_entry.DataBlk;
+        out_msg.MessageSize := MessageSizeType:Response_Data;
+        out_msg.Dirty := cache_entry.Dirty;
+        out_msg.State := CoherenceState:Shared;
+        DPRINTF(RubySlicc, "%s\n", out_msg);
+      }
+    }
+  }
+
+
+  // Forward a request the L3 cannot satisfy to the directory unchanged.
+  action(r_requestToMem, "r", desc="Miss in L3, pass on") {
+    peek(requestNetwork_in, CPURequestMsg) {
+      enqueue(requestNetwork_out, CPURequestMsg, l3_request_latency) {
+        out_msg.addr := address;
+        out_msg.Type := in_msg.Type;
+        out_msg.Requestor := in_msg.Requestor;
+        out_msg.Destination.add(map_Address_to_Directory(address));
+        out_msg.Shared := false; // unneeded for this request
+        out_msg.MessageSize := in_msg.MessageSize;
+        DPRINTF(RubySlicc, "%s\n", out_msg);
+      }
+    }
+  }
+
+  action(t_allocateTBE, "t", desc="allocate TBE Entry") {
+    TBEs.allocate(address);
+    set_tbe(TBEs.lookup(address));
+    if (is_valid(cache_entry)) {
+      tbe.DataBlk := cache_entry.DataBlk; // Data only for WBs
+      tbe.Dirty := cache_entry.Dirty;
+    }
+    tbe.From := machineID; // sentinel: no specific writeback expected yet
+  }
+
+  action(dt_deallocateTBE, "dt", desc="deallocate TBE Entry") {
+    TBEs.deallocate(address);
+    unset_tbe();
+  }
+
+  action(vd_vicDirty, "vd", desc="Victimize dirty L3 data") {
+    enqueue(requestNetwork_out, CPURequestMsg, l3_request_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceRequestType:VicDirty;
+      out_msg.Requestor := machineID;
+      out_msg.Destination.add(map_Address_to_Directory(address));
+      out_msg.MessageSize := MessageSizeType:Request_Control;
+    }
+  }
+
+  action(w_sendResponseWBAck, "w", desc="send WB Ack") {
+    peek(requestNetwork_in, CPURequestMsg) {
+      enqueue(responseNetwork_out, ResponseMsg, l3_response_latency) {
+        out_msg.addr := address;
+        out_msg.Type := CoherenceResponseType:NBSysWBAck;
+        out_msg.Destination.add(in_msg.Requestor);
+        out_msg.Sender := machineID;
+        out_msg.MessageSize := MessageSizeType:Writeback_Control;
+      }
+    }
+  }
+
+  // Probe responses: Hit/Ntsl/Dirty flags encode what this L3 held.
+  action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") {
+    enqueue(L3Resp_out, ResponseMsg, l3_request_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes
+      out_msg.Sender := machineID;
+      out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket
+      out_msg.Dirty := false;
+      out_msg.Hit := false;
+      out_msg.Ntsl := true;
+      out_msg.State := CoherenceState:NA;
+      out_msg.MessageSize := MessageSizeType:Response_Control;
+    }
+  }
+
+  action(ph_sendProbeResponseHit, "ph", desc="send probe ack, no data") {
+    enqueue(L3Resp_out, ResponseMsg, l3_request_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes
+      out_msg.Sender := machineID;
+      out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket
+      out_msg.Dirty := false;
+      out_msg.Hit := true;
+      out_msg.Ntsl := false;
+      out_msg.State := CoherenceState:NA;
+      out_msg.MessageSize := MessageSizeType:Response_Control;
+    }
+  }
+
+  action(pm_sendProbeResponseMiss, "pm", desc="send probe ack, no data") {
+    enqueue(L3Resp_out, ResponseMsg, l3_request_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes
+      out_msg.Sender := machineID;
+      out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket
+      out_msg.Dirty := false;
+      out_msg.Hit := false;
+      out_msg.Ntsl := false;
+      out_msg.State := CoherenceState:NA;
+      out_msg.MessageSize := MessageSizeType:Response_Control;
+    }
+  }
+
+  action(pd_sendProbeResponseData, "pd", desc="send probe ack, with data") {
+    enqueue(L3Resp_out, ResponseMsg, l3_request_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes
+      out_msg.Sender := machineID;
+      out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket
+      out_msg.DataBlk := cache_entry.DataBlk;
+      assert(cache_entry.Dirty); // only dirty lines owe data on a probe
+      out_msg.Dirty := true;
+      out_msg.Hit := true;
+      out_msg.State := CoherenceState:NA;
+      out_msg.MessageSize := MessageSizeType:Response_Data;
+    }
+  }
+
+  // Same as pd_* but sourced from the TBE when the line is mid-writeback.
+  action(pdt_sendProbeResponseDataFromTBE, "pdt", desc="send probe ack with data") {
+    enqueue(L3Resp_out, ResponseMsg, l3_request_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceResponseType:CPUPrbResp;
+      out_msg.Sender := machineID;
+      out_msg.Destination.add(map_Address_to_Directory(address));
+      out_msg.DataBlk := tbe.DataBlk;
+      assert(tbe.Dirty);
+      out_msg.Dirty := true;
+      out_msg.Hit := true;
+      out_msg.MessageSize := MessageSizeType:Response_Data;
+      out_msg.State := CoherenceState:NA;
+      DPRINTF(RubySlicc, "%s\n", out_msg);
+    }
+  }
+
+  action(mc_cancelMemWriteback, "mc", desc="send writeback cancel to memory") {
+    enqueue(requestNetwork_out, CPURequestMsg, l3_request_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceRequestType:WrCancel;
+      out_msg.Requestor := machineID;
+      out_msg.Destination.add(map_Address_to_Directory(address));
+      out_msg.MessageSize := MessageSizeType:Request_Control;
+    }
+  }
+
+  action(a_allocateBlock, "a", desc="allocate L3 block") {
+    if (is_invalid(cache_entry)) {
+      set_cache_entry(L3cache.allocate(address, new Entry));
+    }
+  }
+
+  action(d_writeData, "d", desc="write data to L3") {
+    peek(responseNetwork_in, ResponseMsg) {
+      // Never clear an existing Dirty bit with a clean writeback.
+      if (in_msg.Dirty) {
+        cache_entry.Dirty := in_msg.Dirty;
+      }
+      cache_entry.DataBlk := in_msg.DataBlk;
+      DPRINTF(RubySlicc, "Writing to L3: %s\n", in_msg);
+    }
+  }
+
+  action(rd_copyDataFromRequest, "rd", desc="write data to L3") {
+    peek(requestNetwork_in, CPURequestMsg) {
+      cache_entry.DataBlk := in_msg.DataBlk;
+      cache_entry.Dirty := true;
+    }
+  }
+
+  // Record which L2 the pending writeback data must come from, so a
+  // WrCancel from a different requestor can be told apart (see CancelWB).
+  action(f_setFrom, "f", desc="set who WB is expected to come from") {
+    peek(requestNetwork_in, CPURequestMsg) {
+      tbe.From := in_msg.Requestor;
+    }
+  }
+
+  action(rf_resetFrom, "rf", desc="reset From") {
+    tbe.From := machineID; // machineID doubles as the "no one" sentinel
+  }
+
+  action(wb_data, "wb", desc="write back data") {
+    enqueue(L3Resp_out, ResponseMsg, l3_request_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceResponseType:CPUData;
+      out_msg.Sender := machineID;
+      out_msg.Destination.add(map_Address_to_Directory(address));
+      out_msg.DataBlk := tbe.DataBlk;
+      out_msg.Dirty := tbe.Dirty;
+      if (tbe.Shared) {
+        out_msg.NbReqShared := true;
+      } else {
+        out_msg.NbReqShared := false;
+      }
+      out_msg.State := CoherenceState:Shared; // faux info
+      out_msg.MessageSize := MessageSizeType:Writeback_Data;
+      DPRINTF(RubySlicc, "%s\n", out_msg);
+    }
+  }
+
+  action(wt_writeDataToTBE, "wt", desc="write WB data to TBE") {
+    peek(responseNetwork_in, ResponseMsg) {
+      tbe.DataBlk := in_msg.DataBlk;
+      tbe.Dirty := in_msg.Dirty;
+    }
+  }
+
+  action(uu_sendUnblock, "uu", desc="state changed, unblock") {
+    enqueue(unblockNetwork_out, UnblockMsg, l3_request_latency) {
+      out_msg.addr := address;
+      out_msg.Destination.add(map_Address_to_Directory(address));
+      out_msg.MessageSize := MessageSizeType:Unblock_Control;
+      DPRINTF(RubySlicc, "%s\n", out_msg);
+    }
+  }
+
+  action(ut_updateTag, "ut", desc="update Tag (i.e. set MRU)") {
+    L3cache.setMRU(address);
+  }
+
+  action(p_popRequestQueue, "p", desc="pop request queue") {
+    requestNetwork_in.dequeue(clockEdge());
+  }
+
+  action(pr_popResponseQueue, "pr", desc="pop response queue") {
+    responseNetwork_in.dequeue(clockEdge());
+  }
+
+  action(pn_popNBResponseQueue, "pn", desc="pop NB response queue") {
+    NBResponse_in.dequeue(clockEdge());
+  }
+
+  action(pp_popProbeQueue, "pp", desc="pop probe queue") {
+    probeNetwork_in.dequeue(clockEdge());
+  }
+
+  action(zz_recycleRequestQueue, "\z", desc="recycle request queue") {
+    requestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+  }
+
+ // BEGIN TRANSITIONS
+
+ // transitions from base
+
+ transition({I, I_C}, {RdBlk, RdBlkS, RdBlkM, CtoD}) {TagArrayRead} {
+ r_requestToMem;
+ p_popRequestQueue;
+ }
+
+ transition(O, RdBlk ) {TagArrayRead, DataArrayRead} {
+ rs_sendResponseS;
+ ut_updateTag;
+ p_popRequestQueue;
+ }
+ transition(M, RdBlk, O) {TagArrayRead, DataArrayRead, TagArrayWrite} {
+ rs_sendResponseS;
+ ut_updateTag;
+ p_popRequestQueue;
+ }
+
+ transition(S, RdBlk) {TagArrayRead, DataArrayRead} {
+ rs_sendResponseS;
+ ut_updateTag;
+ p_popRequestQueue;
+ }
+ transition(E, RdBlk, S) {TagArrayRead, DataArrayRead, TagArrayWrite} {
+ rs_sendResponseS;
+ ut_updateTag;
+ p_popRequestQueue;
+ }
+
+ transition({M, O}, RdBlkS, O) {TagArrayRead, DataArrayRead, TagArrayWrite} {
+ rs_sendResponseS;
+ ut_updateTag;
+ p_popRequestQueue;
+ }
+
+ transition({E, S}, RdBlkS, S) {TagArrayRead, DataArrayRead, TagArrayWrite} {
+ rs_sendResponseS;
+ ut_updateTag;
+ p_popRequestQueue;
+ }
+
+ transition(M, RdBlkM, I) {TagArrayRead, TagArrayWrite, DataArrayRead} {
+ rm_sendResponseM;
+ i_invL3;
+ p_popRequestQueue;
+ }
+
+ transition({O, S}, {RdBlkM, CtoD}) {TagArrayRead} {
+ r_requestToMem; // can't handle this, just forward
+ p_popRequestQueue;
+ }
+
+ transition(E, RdBlkM, I) {TagArrayRead, TagArrayWrite, DataArrayRead} {
+ rm_sendResponseM;
+ i_invL3;
+ p_popRequestQueue;
+ }
+
+  transition({I}, WrVicBlk, I_M) {TagArrayRead, TagArrayWrite} { // dirty victim arrives at an empty L3: allocate, ack, wait for data in I_M
+    a_allocateBlock;
+    t_allocateTBE;
+    f_setFrom; // remember which requester the victim came from
+// rd_copyDataFromRequest;
+    w_sendResponseWBAck;
+    p_popRequestQueue;
+  }
+
+  transition(I_C, {WrVicBlk, WrVicBlkShared, ClVicBlk, ClVicBlkShared}) {} { // still waiting on a canceled-WB ack: retry the victim later
+    zz_recycleRequestQueue;
+  }
+
+  transition({I}, WrVicBlkShared, I_O) {TagArrayRead, TagArrayWrite} { // dirty victim with sharers: will land in O once data arrives
+    a_allocateBlock;
+    t_allocateTBE;
+    f_setFrom;
+// rd_copyDataFromRequest;
+    w_sendResponseWBAck;
+    p_popRequestQueue;
+  }
+
+  transition(S, WrVicBlkShared, S_O) {TagArrayRead, TagArrayWrite} { // victim hits an existing Shared copy: upgrade toward O
+// rd_copyDataFromRequest;
+    t_allocateTBE;
+    f_setFrom;
+    w_sendResponseWBAck;
+    p_popRequestQueue;
+  }
+
+  transition(S, WrVicBlk, S_M) {TagArrayRead, TagArrayWrite} { // should be technically not possible, but assume the data comes back with shared bit flipped
+// rd_copyDataFromRequest;
+    t_allocateTBE;
+    f_setFrom;
+    w_sendResponseWBAck;
+    p_popRequestQueue;
+  }
+
+  transition(E, WrVicBlk, E_M) {TagArrayRead, TagArrayWrite} { // remaining cases: victim hits a valid copy; TBE tracks the pending fill
+    t_allocateTBE;
+    f_setFrom;
+    w_sendResponseWBAck;
+    p_popRequestQueue;
+  }
+
+  transition(E, WrVicBlkShared, E_O) {TagArrayRead, TagArrayWrite} {
+    t_allocateTBE;
+    f_setFrom;
+    w_sendResponseWBAck;
+    p_popRequestQueue;
+  }
+
+  transition(O, WrVicBlk, O_M) {TagArrayRead, TagArrayWrite} {
+    t_allocateTBE;
+    f_setFrom;
+    w_sendResponseWBAck;
+    p_popRequestQueue;
+  }
+
+  transition(O, WrVicBlkShared, O_O) {TagArrayRead, TagArrayWrite} {
+    t_allocateTBE;
+    f_setFrom;
+    w_sendResponseWBAck;
+    p_popRequestQueue;
+  }
+
+  transition(M, WrVicBlk, M_M) {TagArrayRead, TagArrayWrite} {
+    t_allocateTBE;
+    f_setFrom;
+    w_sendResponseWBAck;
+    p_popRequestQueue;
+  }
+
+  transition(M, WrVicBlkShared, M_O) {TagArrayRead, TagArrayWrite} {
+    t_allocateTBE;
+    f_setFrom;
+    w_sendResponseWBAck;
+    p_popRequestQueue;
+  }
+
+  transition({I}, ClVicBlk, I_E) {TagArrayRead, TagArrayWrite} { // clean victim at empty L3: allocate and wait for data, target E
+    t_allocateTBE;
+    f_setFrom;
+    a_allocateBlock;
+    w_sendResponseWBAck;
+    p_popRequestQueue;
+  }
+
+  transition({I}, ClVicBlkShared, I_S) {TagArrayRead, TagArrayWrite} { // clean victim with sharers: target S
+    t_allocateTBE;
+    f_setFrom;
+    a_allocateBlock;
+    w_sendResponseWBAck;
+    p_popRequestQueue;
+  }
+
+  transition(S, ClVicBlk, S_E) {TagArrayRead, TagArrayWrite} { // technically impossible, assume data comes back with shared bit flipped
+    t_allocateTBE;
+    f_setFrom;
+    w_sendResponseWBAck;
+    p_popRequestQueue;
+  }
+
+  transition(S, ClVicBlkShared, S_S) {TagArrayRead, TagArrayWrite} { // clean victim onto existing valid copy: ack and wait for data
+    t_allocateTBE;
+    f_setFrom;
+    w_sendResponseWBAck;
+    p_popRequestQueue;
+  }
+
+  transition(E, ClVicBlk, E_E) {TagArrayRead, TagArrayWrite} {
+    t_allocateTBE;
+    f_setFrom;
+    w_sendResponseWBAck;
+    p_popRequestQueue;
+  }
+
+  transition(E, ClVicBlkShared, E_S) {TagArrayRead, TagArrayWrite} {
+    t_allocateTBE;
+    f_setFrom;
+    w_sendResponseWBAck;
+    p_popRequestQueue;
+  }
+
+  transition(O, ClVicBlk, O_E) {TagArrayRead, TagArrayWrite} { // technically impossible, but assume data comes back with shared bit flipped
+    t_allocateTBE;
+    f_setFrom;
+    w_sendResponseWBAck;
+    p_popRequestQueue;
+  }
+
+  transition(O, ClVicBlkShared, O_S) {TagArrayRead, TagArrayWrite} {
+    t_allocateTBE;
+    f_setFrom;
+    w_sendResponseWBAck;
+    p_popRequestQueue;
+  }
+
+  transition(M, ClVicBlk, M_E) {TagArrayRead, TagArrayWrite} {
+    t_allocateTBE;
+    f_setFrom;
+    w_sendResponseWBAck;
+    p_popRequestQueue;
+  }
+
+  transition(M, ClVicBlkShared, M_S) {TagArrayRead, TagArrayWrite} {
+    t_allocateTBE;
+    f_setFrom;
+    w_sendResponseWBAck;
+    p_popRequestQueue;
+  }
+
+  transition({MO_I}, {RdBlk, RdBlkS, RdBlkM, CtoD}) {} { // line is mid-writeback: forward reads to memory instead of serving stale data
+    r_requestToMem;
+    p_popRequestQueue;
+  }
+
+  transition(MO_I, {WrVicBlkShared, WrVicBlk, ClVicBlk, ClVicBlkShared}, MOD_I) {TagArrayWrite} { // victim races with in-flight writeback: track via MOD_I
+    f_setFrom;
+    w_sendResponseWBAck;
+    p_popRequestQueue;
+  }
+
+  transition(I_M, CPUData, M) {DataArrayWrite, TagArrayWrite} { // writeback data arrives: unblock, write it, settle in M
+    uu_sendUnblock;
+    dt_deallocateTBE;
+    d_writeData;
+    pr_popResponseQueue;
+  }
+
+  transition(I_M, CPUDataShared, O) {DataArrayWrite, TagArrayWrite} { // data arrived shared: settle in O instead of M
+    uu_sendUnblock;
+    dt_deallocateTBE;
+    d_writeData;
+    pr_popResponseQueue;
+  }
+
+  transition(I_O, {CPUData, CPUDataShared}, O) {DataArrayWrite, TagArrayWrite} {
+    uu_sendUnblock;
+    dt_deallocateTBE;
+    d_writeData;
+    pr_popResponseQueue;
+  }
+
+  transition(I_E, CPUData, E) {DataArrayWrite, TagArrayWrite} { // clean victim data: settle in E (or S if it came back shared)
+    uu_sendUnblock;
+    dt_deallocateTBE;
+    d_writeData;
+    pr_popResponseQueue;
+  }
+
+  transition(I_E, CPUDataShared, S) {DataArrayWrite, TagArrayWrite} {
+    uu_sendUnblock;
+    dt_deallocateTBE;
+    d_writeData;
+    pr_popResponseQueue;
+  }
+
+  transition(I_S, {CPUData, CPUDataShared}, S) {DataArrayWrite, TagArrayWrite} {
+    uu_sendUnblock;
+    dt_deallocateTBE;
+    d_writeData;
+    pr_popResponseQueue;
+  }
+
+  transition(S_M, CPUDataShared, O) {DataArrayWrite, TagArrayWrite} { // victim data landing on an existing copy: also refresh LRU
+    uu_sendUnblock;
+    dt_deallocateTBE;
+    d_writeData;
+    ut_updateTag; // update tag on writeback hits.
+    pr_popResponseQueue;
+  }
+
+  transition(S_O, {CPUData, CPUDataShared}, O) {DataArrayWrite, TagArrayWrite} {
+    uu_sendUnblock;
+    dt_deallocateTBE;
+    d_writeData;
+    ut_updateTag; // update tag on writeback hits.
+    pr_popResponseQueue;
+  }
+
+  transition(S_E, CPUDataShared, S) {DataArrayWrite, TagArrayWrite} {
+    uu_sendUnblock;
+    dt_deallocateTBE;
+    d_writeData;
+    ut_updateTag; // update tag on writeback hits.
+    pr_popResponseQueue;
+  }
+
+  transition(S_S, {CPUData, CPUDataShared}, S) {DataArrayWrite, TagArrayWrite} {
+    uu_sendUnblock;
+    dt_deallocateTBE;
+    d_writeData;
+    ut_updateTag; // update tag on writeback hits.
+    pr_popResponseQueue;
+  }
+
+  transition(O_E, CPUDataShared, O) {DataArrayWrite, TagArrayWrite} {
+    uu_sendUnblock;
+    dt_deallocateTBE;
+    d_writeData;
+    ut_updateTag; // update tag on writeback hits.
+    pr_popResponseQueue;
+  }
+
+  transition(O_S, {CPUData, CPUDataShared}, O) {DataArrayWrite, TagArrayWrite} { // NOTE(review): ends in O, unlike S_S/S — looks deliberate (line stays owned); confirm against protocol spec
+    uu_sendUnblock;
+    dt_deallocateTBE;
+    d_writeData;
+    ut_updateTag; // update tag on writeback hits.
+    pr_popResponseQueue;
+  }
+
+  transition({D_I}, {CPUData, CPUDataShared}, I) {TagArrayWrite} { // data already written back to memory: just retire the TBE
+    uu_sendUnblock;
+    dt_deallocateTBE;
+    pr_popResponseQueue;
+  }
+
+  transition(MOD_I, {CPUData, CPUDataShared}, MO_I) {TagArrayWrite} { // racing victim data consumed; resume the original writeback
+    uu_sendUnblock;
+    rf_resetFrom;
+    pr_popResponseQueue;
+  }
+
+  transition(I_I, {CPUData, CPUDataShared}, MO_I) {TagArrayWrite, DataArrayRead} { // capture data into the TBE since no cache block is allocated
+    uu_sendUnblock;
+    wt_writeDataToTBE;
+    rf_resetFrom;
+    pr_popResponseQueue;
+  }
+
+  transition(I_CD, {CPUData, CPUDataShared}, I) {DataArrayRead, TagArrayWrite} { // WBAck already seen: write data straight through and finish
+    uu_sendUnblock;
+    wt_writeDataToTBE;
+    wb_data;
+    dt_deallocateTBE;
+    pr_popResponseQueue;
+  }
+
+  transition({M, O}, L3_Repl, MO_I) {TagArrayRead, TagArrayWrite} { // dirty replacement: start victim writeback, wait in MO_I
+    t_allocateTBE;
+    vd_vicDirty;
+    i_invL3;
+  }
+
+  transition({E, S,}, L3_Repl, I) {TagArrayRead, TagArrayWrite} { // clean replacement: drop silently. NOTE(review): stray trailing comma in the state list
+    i_invL3;
+  }
+
+  transition({I_M, I_O, S_M, S_O, E_M, E_O}, L3_Repl) {} { // replacement hits a block mid-transaction: stall by recycling
+    zz_recycleRequestQueue;
+  }
+
+  transition({O_M, O_O, O_E, O_S, M_M, M_O, M_E, M_S}, L3_Repl) {} {
+    zz_recycleRequestQueue;
+  }
+
+  transition({I_E, I_S, S_E, S_S, E_E, E_S}, L3_Repl) {} {
+    zz_recycleRequestQueue;
+  }
+
+  transition({M, O}, PrbInvData, I) {TagArrayRead, TagArrayWrite, DataArrayRead} { // invalidating probe wants data: dirty states supply it
+    pd_sendProbeResponseData;
+    i_invL3;
+    pp_popProbeQueue;
+  }
+
+  transition({E, S, I}, PrbInvData, I) {TagArrayRead, TagArrayWrite} { // clean/invalid states ack the invalidate without data
+    pi_sendProbeResponseInv;
+    i_invL3; // nothing will happen in I
+    pp_popProbeQueue;
+  }
+
+  transition({M, O, E, S, I}, PrbInv, I) {TagArrayRead, TagArrayWrite} { // probe needs no data: invalidate everywhere
+    pi_sendProbeResponseInv;
+    i_invL3; // nothing will happen in I
+    pp_popProbeQueue;
+  }
+
+  transition({M, O}, PrbShrData, O) {TagArrayRead, DataArrayRead, TagArrayWrite} { // downgrade probe: supply data, remain Owner
+    pd_sendProbeResponseData;
+    pp_popProbeQueue;
+  }
+
+  transition({E, S}, PrbShrData, S) {TagArrayRead, TagArrayWrite} { // clean copy: hit response only, no data transfer
+    ph_sendProbeResponseHit;
+    pp_popProbeQueue;
+  }
+
+  transition(I, PrbShrData) {TagArrayRead} {
+    pm_sendProbeResponseMiss;
+    pp_popProbeQueue;
+  }
+
+  transition(MO_I, PrbInvData, I_C) {TagArrayWrite, DataArrayRead} { // probe beats our writeback: answer from TBE, cancel the WB
+    pdt_sendProbeResponseDataFromTBE;
+    mc_cancelMemWriteback;
+    pp_popProbeQueue;
+  }
+
+  transition(MO_I, PrbInv, I_C) {TagArrayWrite} {
+    pi_sendProbeResponseInv;
+    mc_cancelMemWriteback;
+    pp_popProbeQueue;
+  }
+
+  transition(MO_I, PrbShrData) {DataArrayRead} { // downgrade probe mid-writeback: data comes from the TBE
+    pdt_sendProbeResponseDataFromTBE;
+    pp_popProbeQueue;
+  }
+
+  transition(I_C, {PrbInvData, PrbInv}) {} { // writeback already canceled: nothing left to give
+    pi_sendProbeResponseInv;
+    pp_popProbeQueue;
+  }
+
+  transition(I_C, PrbShrData) {} {
+    pm_sendProbeResponseMiss;
+    pp_popProbeQueue;
+  }
+
+  transition(I_I, {WBAck}, I_CD) {TagArrayWrite} { // WBAck before victim data: remember it, still waiting for CPUData
+    pn_popNBResponseQueue;
+  }
+
+  transition(MOD_I, WBAck, D_I) {DataArrayRead} { // ack for the original writeback: push data out, racing victim still pending
+    wb_data;
+    pn_popNBResponseQueue;
+  }
+
+  transition(MO_I, WBAck, I) {DataArrayRead, TagArrayWrite} { // normal writeback completion: send data and retire the TBE
+    wb_data;
+    dt_deallocateTBE;
+    pn_popNBResponseQueue;
+  }
+
+  transition(I_C, {WBAck}, I) {TagArrayWrite} { // ack for a canceled writeback: just clean up
+    dt_deallocateTBE;
+    pn_popNBResponseQueue;
+  }
+
+  transition({I_M, I_O, I_E, I_S}, CancelWB, I) {TagArrayWrite} { // requester aborts the victim: undo the allocation made for it
+    uu_sendUnblock;
+    dt_deallocateTBE;
+    i_invL3;
+    p_popRequestQueue;
+  }
+
+  transition({S_S, S_O, S_M, S_E}, CancelWB, S) {TagArrayWrite} { // abort on a pre-existing copy: fall back to the prior stable state
+    uu_sendUnblock;
+    dt_deallocateTBE;
+    p_popRequestQueue;
+  }
+
+  transition({E_M, E_O, E_E, E_S}, CancelWB, E) {TagArrayWrite} {
+    uu_sendUnblock;
+    dt_deallocateTBE;
+    p_popRequestQueue;
+  }
+
+  transition({O_M, O_O, O_E, O_S}, CancelWB, O) {TagArrayWrite} {
+    uu_sendUnblock;
+    dt_deallocateTBE;
+    p_popRequestQueue;
+  }
+
+  transition({M_M, M_O, M_E, M_S}, CancelWB, M) {TagArrayWrite} {
+    uu_sendUnblock;
+    dt_deallocateTBE;
+    p_popRequestQueue;
+  }
+
+  transition(D_I, CancelWB, I) {TagArrayWrite} { // original data already flushed: cancel just retires state
+    uu_sendUnblock;
+    dt_deallocateTBE;
+    p_popRequestQueue;
+  }
+
+  transition(MOD_I, CancelWB, MO_I) {TagArrayWrite} { // racing victim canceled: resume the original writeback
+    uu_sendUnblock;
+    rf_resetFrom;
+    p_popRequestQueue;
+  }
+
+  transition(I_I, CancelWB, I_C) {TagArrayWrite} { // nothing left to write back: cancel the memory writeback too
+    uu_sendUnblock;
+    rf_resetFrom;
+    mc_cancelMemWriteback;
+    p_popRequestQueue;
+  }
+
+  transition(I_CD, CancelWB, I) {TagArrayWrite} {
+    uu_sendUnblock;
+    dt_deallocateTBE;
+    mc_cancelMemWriteback;
+    p_popRequestQueue;
+  }
+
+} // end machine
diff --git a/src/mem/protocol/MOESI_AMD_Base-Region-CorePair.sm b/src/mem/protocol/MOESI_AMD_Base-Region-CorePair.sm
new file mode 100644
index 000000000..fd84447a2
--- /dev/null
+++ b/src/mem/protocol/MOESI_AMD_Base-Region-CorePair.sm
@@ -0,0 +1,3009 @@
+/*
+ * Copyright (c) 2010-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Lisa Hsu
+ */
+
+machine(MachineType:CorePair, "CP-like Core Coherence") // two-cluster core-pair cache controller (L1I + two L1Ds + shared L2)
+ : Sequencer * sequencer;   // presumably cluster-0 sequencer (contextId mod 2 == 0 below) — confirm
+   Sequencer * sequencer1;  // presumably cluster-1 sequencer — confirm
+   CacheMemory * L1Icache;
+   CacheMemory * L1D0cache; // cluster-0 L1 data cache
+   CacheMemory * L1D1cache; // cluster-1 L1 data cache
+   CacheMemory * L2cache;   // shared second level; getCacheEntry() resolves against this
+   int regionBufferNum;     // machine ID of this pair's RegionBuffer (used by getPeer)
+   bool send_evictions := "False";
+   Cycles issue_latency := 5;     // delay for issuing requests
+   Cycles l2_hit_latency := 18;   // delay for L2 hits
+
+  // BEGIN Core Buffers
+
+  // To the Network
+  MessageBuffer * requestFromCore, network="To", virtual_network="0", ordered="true", vnet_type="request";
+  MessageBuffer * responseFromCore, network="To", virtual_network="2", ordered="false", vnet_type="response";
+  MessageBuffer * unblockFromCore, network="To", virtual_network="4", ordered="false", vnet_type="unblock";
+
+  // From the Network
+  MessageBuffer * probeToCore, network="From", virtual_network="0", ordered="false", vnet_type="request";
+  MessageBuffer * responseToCore, network="From", virtual_network="2", ordered="false", vnet_type="response";
+
+  MessageBuffer * mandatoryQueue, ordered="false"; // CPU-side requests from the sequencers
+  MessageBuffer * triggerQueue, ordered="true";    // internal L2-to-L1 fill triggers
+
+  // END Core Buffers
+
+{
+ // BEGIN STATES
+ state_declaration(State, desc="Cache states", default="CorePair_State_I") {
+
+ I, AccessPermission:Invalid, desc="Invalid";
+ S, AccessPermission:Read_Only, desc="Shared";
+ E0, AccessPermission:Read_Write, desc="Exclusive with Cluster 0 ownership";
+ E1, AccessPermission:Read_Write, desc="Exclusive with Cluster 1 ownership";
+ Es, AccessPermission:Read_Write, desc="Exclusive in core";
+ O, AccessPermission:Read_Only, desc="Owner state in core, both clusters and other cores may be sharing line";
+ Ms, AccessPermission:Read_Write, desc="Modified in core, both clusters may be sharing line";
+ M0, AccessPermission:Read_Write, desc="Modified with cluster ownership";
+ M1, AccessPermission:Read_Write, desc="Modified with cluster ownership";
+
+ // Transient States
+ I_M0, AccessPermission:Busy, desc="Invalid, issued RdBlkM, have not seen response yet";
+ I_M1, AccessPermission:Busy, desc="Invalid, issued RdBlkM, have not seen response yet";
+ I_M0M1, AccessPermission:Busy, desc="Was in I_M0, got a store request from other cluster as well";
+ I_M1M0, AccessPermission:Busy, desc="Was in I_M1, got a store request from other cluster as well";
+ I_M0Ms, AccessPermission:Busy, desc="Was in I_M0, got a load request from other cluster as well";
+ I_M1Ms, AccessPermission:Busy, desc="Was in I_M1, got a load request from other cluster as well";
+ I_E0S, AccessPermission:Busy, desc="Invalid, issued RdBlk, have not seen response yet";
+ I_E1S, AccessPermission:Busy, desc="Invalid, issued RdBlk, have not seen response yet";
+ I_ES, AccessPermission:Busy, desc="S_F got hit by invalidating probe, RdBlk response needs to go to both clusters";
+
+ IF_E0S, AccessPermission:Busy, desc="something got hit with Probe Invalidate, now just I_E0S but expecting a L2_to_L1D0 trigger, just drop when receive";
+ IF_E1S, AccessPermission:Busy, desc="something got hit with Probe Invalidate, now just I_E1S but expecting a L2_to_L1D1 trigger, just drop when receive";
+ IF_ES, AccessPermission:Busy, desc="same, but waiting for two fills";
+ IF0_ES, AccessPermission:Busy, desc="same, but waiting for two fills, got one";
+ IF1_ES, AccessPermission:Busy, desc="same, but waiting for two fills, got one";
+ F_S0, AccessPermission:Busy, desc="same, but going to S0 when trigger received";
+ F_S1, AccessPermission:Busy, desc="same, but going to S1 when trigger received";
+
+ ES_I, AccessPermission:Read_Only, desc="L2 replacement, waiting for clean writeback ack";
+ MO_I, AccessPermission:Read_Only, desc="L2 replacement, waiting for dirty writeback ack";
+ MO_S0, AccessPermission:Read_Only, desc="M/O got Ifetch Miss, must write back first, then send RdBlkS";
+ MO_S1, AccessPermission:Read_Only, desc="M/O got Ifetch Miss, must write back first, then send RdBlkS";
+ S_F0, AccessPermission:Read_Only, desc="Shared, filling L1";
+ S_F1, AccessPermission:Read_Only, desc="Shared, filling L1";
+ S_F, AccessPermission:Read_Only, desc="Shared, filling L1";
+ O_F0, AccessPermission:Read_Only, desc="Owned, filling L1";
+ O_F1, AccessPermission:Read_Only, desc="Owned, filling L1";
+ O_F, AccessPermission:Read_Only, desc="Owned, filling L1";
+ Si_F0, AccessPermission:Read_Only, desc="Shared, filling icache";
+ Si_F1, AccessPermission:Read_Only, desc="Shared, filling icache";
+ S_M0, AccessPermission:Read_Only, desc="Shared, issued CtoD, have not seen response yet";
+ S_M1, AccessPermission:Read_Only, desc="Shared, issued CtoD, have not seen response yet";
+ O_M0, AccessPermission:Read_Only, desc="Shared, issued CtoD, have not seen response yet";
+ O_M1, AccessPermission:Read_Only, desc="Shared, issued CtoD, have not seen response yet";
+ S0, AccessPermission:Busy, desc="RdBlkS on behalf of cluster 0, waiting for response";
+ S1, AccessPermission:Busy, desc="RdBlkS on behalf of cluster 1, waiting for response";
+
+ Es_F0, AccessPermission:Read_Write, desc="Es, Cluster read, filling";
+ Es_F1, AccessPermission:Read_Write, desc="Es, Cluster read, filling";
+ Es_F, AccessPermission:Read_Write, desc="Es, other cluster read, filling";
+ E0_F, AccessPermission:Read_Write, desc="E0, cluster read, filling";
+ E1_F, AccessPermission:Read_Write, desc="...";
+ E0_Es, AccessPermission:Read_Write, desc="...";
+ E1_Es, AccessPermission:Read_Write, desc="...";
+ Ms_F0, AccessPermission:Read_Write, desc="...";
+ Ms_F1, AccessPermission:Read_Write, desc="...";
+ Ms_F, AccessPermission:Read_Write, desc="...";
+ M0_F, AccessPermission:Read_Write, desc="...";
+ M0_Ms, AccessPermission:Read_Write, desc="...";
+ M1_F, AccessPermission:Read_Write, desc="...";
+ M1_Ms, AccessPermission:Read_Write, desc="...";
+
+ I_C, AccessPermission:Invalid, desc="Invalid, but waiting for WBAck from NB from canceled writeback";
+ S0_C, AccessPermission:Busy, desc="MO_S0 hit by invalidating probe, waiting for WBAck form NB for canceled WB";
+ S1_C, AccessPermission:Busy, desc="MO_S1 hit by invalidating probe, waiting for WBAck form NB for canceled WB";
+ S_C, AccessPermission:Busy, desc="S*_C got NB_AckS, still waiting for WBAck";
+
+ } // END STATES
+
+ // BEGIN EVENTS
+ enumeration(Event, desc="CP Events") {
+ // CP Initiated events
+ C0_Load_L1miss, desc="Cluster 0 load, L1 missed";
+ C0_Load_L1hit, desc="Cluster 0 load, L1 hit";
+ C1_Load_L1miss, desc="Cluster 1 load L1 missed";
+ C1_Load_L1hit, desc="Cluster 1 load L1 hit";
+ Ifetch0_L1hit, desc="Instruction fetch, hit in the L1";
+ Ifetch1_L1hit, desc="Instruction fetch, hit in the L1";
+ Ifetch0_L1miss, desc="Instruction fetch, missed in the L1";
+ Ifetch1_L1miss, desc="Instruction fetch, missed in the L1";
+ C0_Store_L1miss, desc="Cluster 0 store missed in L1";
+ C0_Store_L1hit, desc="Cluster 0 store hit in L1";
+ C1_Store_L1miss, desc="Cluster 1 store missed in L1";
+ C1_Store_L1hit, desc="Cluster 1 store hit in L1";
+ // NB Initiated events
+ NB_AckS, desc="NB Ack to Core Request";
+ NB_AckM, desc="NB Ack to Core Request";
+ NB_AckE, desc="NB Ack to Core Request";
+
+ NB_AckWB, desc="NB Ack for writeback";
+
+ // Memory System initiatied events
+ L1I_Repl, desc="Replace address from L1I"; // Presumed clean
+ L1D0_Repl, desc="Replace address from L1D0"; // Presumed clean
+ L1D1_Repl, desc="Replace address from L1D1"; // Presumed clean
+ L2_Repl, desc="Replace address from L2";
+
+ L2_to_L1D0, desc="L1 fill from L2";
+ L2_to_L1D1, desc="L1 fill from L2";
+ L2_to_L1I, desc="L1 fill from L2";
+
+ // Probe Events
+ PrbInvData, desc="probe, return O or M data";
+ PrbInvDataDemand, desc="probe, return O or M data. Demand request";
+ PrbInv, desc="probe, no need for data";
+ PrbShrData, desc="probe downgrade, return O or M data";
+ PrbShrDataDemand, desc="probe downgrade, return O or M data. Demand request";
+ ForceRepl, desc="probe from r-buf. Act as though a repl";
+ ForceDowngrade, desc="probe from r-buf. Act as though a repl";
+
+ } // END EVENTS
+
+  enumeration(RequestType, desc="To communicate stats from transitions to recordStats") { // one enumerator per cache array access kind, per cache
+    L1D0DataArrayRead, desc="Read the data array";
+    L1D0DataArrayWrite, desc="Write the data array";
+    L1D0TagArrayRead, desc="Read the tag array";
+    L1D0TagArrayWrite, desc="Write the tag array";
+    L1D1DataArrayRead, desc="Read the data array";
+    L1D1DataArrayWrite, desc="Write the data array";
+    L1D1TagArrayRead, desc="Read the tag array";
+    L1D1TagArrayWrite, desc="Write the tag array";
+    L1IDataArrayRead, desc="Read the data array";
+    L1IDataArrayWrite, desc="Write the data array";
+    L1ITagArrayRead, desc="Read the tag array";
+    L1ITagArrayWrite, desc="Write the tag array";
+    L2DataArrayRead, desc="Read the data array";
+    L2DataArrayWrite, desc="Write the data array";
+    L2TagArrayRead, desc="Read the tag array";
+    L2TagArrayWrite, desc="Write the tag array";
+  }
+
+
+ // BEGIN STRUCTURE DEFINITIONS
+
+
+ // Cache Entry
+ structure(Entry, desc="...", interface="AbstractCacheEntry") {
+ State CacheState, desc="cache state";
+ bool Dirty, desc="Is the data dirty (diff than memory)?";
+ DataBlock DataBlk, desc="data for the block";
+ bool FromL2, default="false", desc="block just moved from L2";
+ }
+
+ structure(TBE, desc="...") {
+ State TBEState, desc="Transient state";
+ DataBlock DataBlk, desc="data for the block, required for concurrent writebacks";
+ bool Dirty, desc="Is the data dirty (different than memory)?";
+ int NumPendingMsgs, desc="Number of acks/data messages that this processor is waiting for";
+ bool Shared, desc="Victim hit by shared probe";
+ bool AckNeeded, desc="True if need to ack r-dir";
+ }
+
+ structure(TBETable, external="yes") {
+ TBE lookup(Addr);
+ void allocate(Addr);
+ void deallocate(Addr);
+ bool isPresent(Addr);
+ }
+
+  TBETable TBEs, template="<CorePair_TBE>", constructor="m_number_of_TBEs"; // per-controller TBE pool
+
+  Tick clockEdge();          // external helpers provided by the generated controller
+  Tick cyclesToTicks(Cycles c);
+
+  void set_cache_entry(AbstractCacheEntry b);
+  void unset_cache_entry();
+  void set_tbe(TBE b);
+  void unset_tbe();
+  void wakeUpAllBuffers();
+  void wakeUpBuffers(Addr a);
+  Cycles curCycle();
+
+  // END STRUCTURE DEFINITIONS
+
+ // BEGIN INTERNAL FUNCTIONS
+
+  MachineID getPeer(MachineID mach) { // NOTE(review): the 'mach' parameter is unused; the peer is fixed by regionBufferNum
+    return createMachineID(MachineType:RegionBuffer, intToID(regionBufferNum));
+  }
+
+  bool addressInCore(Addr addr) { // true if any cache level in this core pair holds the tag
+    return (L2cache.isTagPresent(addr) || L1Icache.isTagPresent(addr) || L1D0cache.isTagPresent(addr) || L1D1cache.isTagPresent(addr));
+  }
+
+  Entry getCacheEntry(Addr address), return_by_pointer="yes" { // canonical entry lives in the L2
+    Entry L2cache_entry := static_cast(Entry, "pointer", L2cache.lookup(address));
+    return L2cache_entry;
+  }
+
+  DataBlock getDataBlock(Addr addr), return_by_ref="yes" { // TBE data takes precedence over the cached copy
+    TBE tbe := TBEs.lookup(addr);
+    if(is_valid(tbe)) {
+      return tbe.DataBlk;
+    } else {
+      return getCacheEntry(addr).DataBlk;
+    }
+  }
+
+  Entry getL1CacheEntry(Addr addr, int cluster), return_by_pointer="yes" { // cluster selects L1D0 (0) vs L1D1 (non-zero)
+    if (cluster == 0) {
+      Entry L1D0_entry := static_cast(Entry, "pointer", L1D0cache.lookup(addr));
+      return L1D0_entry;
+    } else {
+      Entry L1D1_entry := static_cast(Entry, "pointer", L1D1cache.lookup(addr));
+      return L1D1_entry;
+    }
+  }
+
+  Entry getICacheEntry(Addr addr), return_by_pointer="yes" {
+    Entry c_entry := static_cast(Entry, "pointer", L1Icache.lookup(addr));
+    return c_entry;
+  }
+
+  bool presentOrAvail2(Addr addr) { // tag already present, or a way is free (no replacement needed)
+    return L2cache.isTagPresent(addr) || L2cache.cacheAvail(addr);
+  }
+
+  bool presentOrAvailI(Addr addr) {
+    return L1Icache.isTagPresent(addr) || L1Icache.cacheAvail(addr);
+  }
+
+  bool presentOrAvailD0(Addr addr) {
+    return L1D0cache.isTagPresent(addr) || L1D0cache.cacheAvail(addr);
+  }
+
+  bool presentOrAvailD1(Addr addr) {
+    return L1D1cache.isTagPresent(addr) || L1D1cache.cacheAvail(addr);
+  }
+
+  State getState(TBE tbe, Entry cache_entry, Addr addr) { // TBE state wins over cache state; absent both means Invalid
+    if(is_valid(tbe)) {
+      return tbe.TBEState;
+    } else if (is_valid(cache_entry)) {
+      return cache_entry.CacheState;
+    }
+    return State:I;
+  }
+
+  void setState(TBE tbe, Entry cache_entry, Addr addr, State state) { // keep TBE and cache entry state in sync
+    if (is_valid(tbe)) {
+      tbe.TBEState := state;
+    }
+
+    if (is_valid(cache_entry)) {
+      cache_entry.CacheState := state;
+    }
+  }
+
+  AccessPermission getAccessPermission(Addr addr) { // permission derived from TBE first, then cache entry
+    TBE tbe := TBEs.lookup(addr);
+    if(is_valid(tbe)) {
+      return CorePair_State_to_permission(tbe.TBEState);
+    }
+
+    Entry cache_entry := getCacheEntry(addr);
+    if(is_valid(cache_entry)) {
+      return CorePair_State_to_permission(cache_entry.CacheState);
+    }
+
+    return AccessPermission:NotPresent;
+  }
+
+  void functionalRead(Addr addr, Packet *pkt) { // NOTE(review): reads the TBE or falls through to memory; never reads the cache entry — confirm intended
+    TBE tbe := TBEs.lookup(addr);
+    if(is_valid(tbe)) {
+      testAndRead(addr, tbe.DataBlk, pkt);
+    } else {
+      functionalMemoryRead(pkt);
+    }
+  }
+
+  int functionalWrite(Addr addr, Packet *pkt) { // returns how many locations were updated (TBE copy plus memory)
+    int num_functional_writes := 0;
+
+    TBE tbe := TBEs.lookup(addr);
+    if(is_valid(tbe)) {
+      num_functional_writes := num_functional_writes +
+        testAndWrite(addr, tbe.DataBlk, pkt);
+    }
+
+    num_functional_writes := num_functional_writes + functionalMemoryWrite(pkt);
+    return num_functional_writes;
+  }
+
+  bool isValid(Addr addr) { // valid means readable: not NotPresent/Invalid/Busy
+    AccessPermission perm := getAccessPermission(addr);
+    if (perm == AccessPermission:NotPresent ||
+        perm == AccessPermission:Invalid ||
+        perm == AccessPermission:Busy) {
+      return false;
+    } else {
+      return true;
+    }
+  }
+
+  void setAccessPermission(Entry cache_entry, Addr addr, State state) { // propagate the SLICC state onto the entry's permission
+    if (is_valid(cache_entry)) {
+      cache_entry.changePermission(CorePair_State_to_permission(state));
+    }
+  }
+
+  MachineType testAndClearLocalHit(Entry cache_entry) { // report whether a hit was really an L2 fill; clears the one-shot FromL2 flag
+    assert(is_valid(cache_entry));
+    if (cache_entry.FromL2) {
+      cache_entry.FromL2 := false; // consume the flag so the next hit counts as L1
+      return MachineType:L2Cache;
+    } else {
+      return MachineType:L1Cache;
+    }
+  }
+
+  void recordRequestType(RequestType request_type, Addr addr) { // dispatch each stat event to the cache/array it names
+    if (request_type == RequestType:L1D0DataArrayRead) {
+      L1D0cache.recordRequestType(CacheRequestType:DataArrayRead, addr);
+    } else if (request_type == RequestType:L1D0DataArrayWrite) {
+      L1D0cache.recordRequestType(CacheRequestType:DataArrayWrite, addr);
+    } else if (request_type == RequestType:L1D0TagArrayRead) {
+      L1D0cache.recordRequestType(CacheRequestType:TagArrayRead, addr);
+    } else if (request_type == RequestType:L1D0TagArrayWrite) {
+      L1D0cache.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+    } else if (request_type == RequestType:L1D1DataArrayRead) {
+      L1D1cache.recordRequestType(CacheRequestType:DataArrayRead, addr);
+    } else if (request_type == RequestType:L1D1DataArrayWrite) {
+      L1D1cache.recordRequestType(CacheRequestType:DataArrayWrite, addr);
+    } else if (request_type == RequestType:L1D1TagArrayRead) {
+      L1D1cache.recordRequestType(CacheRequestType:TagArrayRead, addr);
+    } else if (request_type == RequestType:L1D1TagArrayWrite) {
+      L1D1cache.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+    } else if (request_type == RequestType:L1IDataArrayRead) {
+      L1Icache.recordRequestType(CacheRequestType:DataArrayRead, addr);
+    } else if (request_type == RequestType:L1IDataArrayWrite) {
+      L1Icache.recordRequestType(CacheRequestType:DataArrayWrite, addr);
+    } else if (request_type == RequestType:L1ITagArrayRead) {
+      L1Icache.recordRequestType(CacheRequestType:TagArrayRead, addr);
+    } else if (request_type == RequestType:L1ITagArrayWrite) {
+      L1Icache.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+    } else if (request_type == RequestType:L2DataArrayRead) {
+      L2cache.recordRequestType(CacheRequestType:DataArrayRead, addr);
+    } else if (request_type == RequestType:L2DataArrayWrite) {
+      L2cache.recordRequestType(CacheRequestType:DataArrayWrite, addr);
+    } else if (request_type == RequestType:L2TagArrayRead) {
+      L2cache.recordRequestType(CacheRequestType:TagArrayRead, addr);
+    } else if (request_type == RequestType:L2TagArrayWrite) {
+      L2cache.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+    } // unknown types are silently ignored
+  }
+
+  bool checkResourceAvailable(RequestType request_type, Addr addr) { // gate transitions on array port availability; unknown types are always available
+    if (request_type == RequestType:L2DataArrayRead) {
+      return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+    } else if (request_type == RequestType:L2DataArrayWrite) {
+      return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+    } else if (request_type == RequestType:L2TagArrayRead) {
+      return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+    } else if (request_type == RequestType:L2TagArrayWrite) {
+      return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+    } else if (request_type == RequestType:L1D0DataArrayRead) {
+      return L1D0cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+    } else if (request_type == RequestType:L1D0DataArrayWrite) {
+      return L1D0cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+    } else if (request_type == RequestType:L1D0TagArrayRead) {
+      return L1D0cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+    } else if (request_type == RequestType:L1D0TagArrayWrite) {
+      return L1D0cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+    } else if (request_type == RequestType:L1D1DataArrayRead) {
+      return L1D1cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+    } else if (request_type == RequestType:L1D1DataArrayWrite) {
+      return L1D1cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+    } else if (request_type == RequestType:L1D1TagArrayRead) {
+      return L1D1cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+    } else if (request_type == RequestType:L1D1TagArrayWrite) {
+      return L1D1cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+    } else if (request_type == RequestType:L1IDataArrayRead) {
+      return L1Icache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+    } else if (request_type == RequestType:L1IDataArrayWrite) {
+      return L1Icache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+    } else if (request_type == RequestType:L1ITagArrayRead) {
+      return L1Icache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+    } else if (request_type == RequestType:L1ITagArrayWrite) {
+      return L1Icache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+    } else {
+      return true; // non-array request types never block
+    }
+  }
+
+ // END INTERNAL FUNCTIONS
+
+ // ** OUT_PORTS **
+
+  out_port(requestNetwork_out, CPURequestMsg, requestFromCore);   // requests toward the NB/directory
+  out_port(responseNetwork_out, ResponseMsg, responseFromCore);   // probe/data responses
+  out_port(triggerQueue_out, TriggerMsg, triggerQueue);           // internal L2-to-L1 fill triggers (looped back below)
+  out_port(unblockNetwork_out, UnblockMsg, unblockFromCore);      // unblocks after completed transactions
+
+ // ** IN_PORTS **
+
+  in_port(triggerQueue_in, TriggerMsg, triggerQueue, block_on="addr") { // self-posted L2-to-L1 fill triggers; Dest picks which L1 to fill
+    if (triggerQueue_in.isReady(clockEdge())) {
+      peek(triggerQueue_in, TriggerMsg) {
+        Entry cache_entry := getCacheEntry(in_msg.addr);
+        TBE tbe := TBEs.lookup(in_msg.addr);
+
+        if (in_msg.Type == TriggerType:L2_to_L1) {
+          if (in_msg.Dest == CacheId:L1I) {
+            trigger(Event:L2_to_L1I, in_msg.addr, cache_entry, tbe);
+          } else if (in_msg.Dest == CacheId:L1D0) {
+            trigger(Event:L2_to_L1D0, in_msg.addr, cache_entry, tbe);
+          } else if (in_msg.Dest == CacheId:L1D1) {
+            trigger(Event:L2_to_L1D1, in_msg.addr, cache_entry, tbe);
+          } else {
+            error("unexpected trigger dest");
+          }
+        } // other trigger types are left on the queue — presumably none occur; confirm
+      }
+    }
+  }
+
+
+  in_port(probeNetwork_in, NBProbeRequestMsg, probeToCore) { // probes from the NB; DemandRequest/ReturnData flags refine the event
+    if (probeNetwork_in.isReady(clockEdge())) {
+      peek(probeNetwork_in, NBProbeRequestMsg, block_on="addr") {
+        Entry cache_entry := getCacheEntry(in_msg.addr);
+        TBE tbe := TBEs.lookup(in_msg.addr);
+
+        if (in_msg.Type == ProbeRequestType:PrbInv) {
+          if (in_msg.DemandRequest) {
+            trigger(Event:PrbInvDataDemand, in_msg.addr, cache_entry, tbe);
+          } else if (in_msg.ReturnData) {
+            trigger(Event:PrbInvData, in_msg.addr, cache_entry, tbe);
+          } else {
+            trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe);
+          }
+        } else if (in_msg.Type == ProbeRequestType:PrbDowngrade) {
+          if (in_msg.DemandRequest) {
+            trigger(Event:PrbShrDataDemand, in_msg.addr, cache_entry, tbe);
+          } else {
+            assert(in_msg.ReturnData); // a non-demand downgrade must want data
+            trigger(Event:PrbShrData, in_msg.addr, cache_entry, tbe);
+          }
+        } else if (in_msg.Type == ProbeRequestType:PrbRepl) { // region-buffer probes mapped to forced actions
+          trigger(Event:ForceRepl, in_msg.addr, cache_entry, tbe);
+        } else if (in_msg.Type == ProbeRequestType:PrbRegDowngrade) {
+          trigger(Event:ForceDowngrade, in_msg.addr, cache_entry, tbe);
+        } else {
+          error("Unknown probe request");
+        }
+      }
+    }
+  }
+
+
+ // ResponseNetwork
+ in_port(responseToCore_in, ResponseMsg, responseToCore) {
+ if (responseToCore_in.isReady(clockEdge())) {
+ peek(responseToCore_in, ResponseMsg, block_on="addr") {
+
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ TBE tbe := TBEs.lookup(in_msg.addr);
+
+ if (in_msg.Type == CoherenceResponseType:NBSysResp) {
+ if (in_msg.State == CoherenceState:Modified) {
+ trigger(Event:NB_AckM, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.State == CoherenceState:Shared) {
+ trigger(Event:NB_AckS, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.State == CoherenceState:Exclusive) {
+ trigger(Event:NB_AckE, in_msg.addr, cache_entry, tbe);
+ }
+ } else if (in_msg.Type == CoherenceResponseType:NBSysWBAck) {
+ trigger(Event:NB_AckWB, in_msg.addr, cache_entry, tbe);
+ } else {
+ error("Unexpected Response Message to Core");
+ }
+ }
+ }
+ }
+
+ // Nothing from the Unblock Network
+
+ // Mandatory Queue
+  // Mandatory Queue: demand requests from the two cores of this CorePair.
+  // contextId parity selects the cluster: even -> cluster 0, odd -> cluster 1.
+  // L1s write through to L2, so a store also needs an L2 slot to be present
+  // or available; when a required L1/L2 slot is unavailable, the victim's
+  // replacement event is triggered instead of the demand event, and the
+  // demand request is retried after the replacement completes.
+  in_port(mandatoryQueue_in, RubyRequest, mandatoryQueue, desc="...") {
+    if (mandatoryQueue_in.isReady(clockEdge())) {
+      peek(mandatoryQueue_in, RubyRequest, block_on="LineAddress") {
+
+        Entry cache_entry := getCacheEntry(in_msg.LineAddress);
+        TBE tbe := TBEs.lookup(in_msg.LineAddress);
+
+        if (in_msg.Type == RubyRequestType:IFETCH) {
+          // FETCH ACCESS
+
+          if (L1Icache.isTagPresent(in_msg.LineAddress)) {
+            // L1I hit; dispatch on requesting cluster
+            if (mod(in_msg.contextId, 2) == 0) {
+              trigger(Event:Ifetch0_L1hit, in_msg.LineAddress, cache_entry, tbe);
+            } else {
+              trigger(Event:Ifetch1_L1hit, in_msg.LineAddress, cache_entry, tbe);
+            }
+          } else {
+            // L1I miss: need both an L2 slot (inclusive fill) and an L1I slot
+            if (presentOrAvail2(in_msg.LineAddress)) {
+              if (presentOrAvailI(in_msg.LineAddress)) {
+                if (mod(in_msg.contextId, 2) == 0) {
+                  trigger(Event:Ifetch0_L1miss, in_msg.LineAddress, cache_entry,
+                          tbe);
+                } else {
+                  trigger(Event:Ifetch1_L1miss, in_msg.LineAddress, cache_entry,
+                          tbe);
+                }
+              } else {
+                // no L1I slot: evict an L1I victim first
+                Addr victim := L1Icache.cacheProbe(in_msg.LineAddress);
+                trigger(Event:L1I_Repl, victim,
+                        getCacheEntry(victim), TBEs.lookup(victim));
+              }
+            } else { // Not present or avail in L2
+              Addr victim := L2cache.cacheProbe(in_msg.LineAddress);
+              DPRINTF(RubySlicc, "Victim for %s L2_Repl(0) is %s\n", in_msg.LineAddress, victim);
+              trigger(Event:L2_Repl, victim, getCacheEntry(victim),
+                      TBEs.lookup(victim));
+            }
+          }
+        } else {
+          // DATA ACCESS
+          if (mod(in_msg.contextId, 2) == 1) {
+            // cluster 1 (L1D1)
+            if (L1D1cache.isTagPresent(in_msg.LineAddress)) {
+              if (in_msg.Type == RubyRequestType:LD) {
+                trigger(Event:C1_Load_L1hit, in_msg.LineAddress, cache_entry,
+                        tbe);
+              } else {
+                // Stores must write through, make sure L2 avail.
+                if (presentOrAvail2(in_msg.LineAddress)) {
+                  trigger(Event:C1_Store_L1hit, in_msg.LineAddress, cache_entry,
+                          tbe);
+                } else {
+                  Addr victim := L2cache.cacheProbe(in_msg.LineAddress);
+                  DPRINTF(RubySlicc, "Victim for %s L2_Repl(1) is %s\n", in_msg.LineAddress, victim);
+                  trigger(Event:L2_Repl, victim, getCacheEntry(victim),
+                          TBEs.lookup(victim));
+                }
+              }
+            } else {
+              // L1D1 miss: need L2 and L1D1 slots
+              if (presentOrAvail2(in_msg.LineAddress)) {
+                if (presentOrAvailD1(in_msg.LineAddress)) {
+                  if (in_msg.Type == RubyRequestType:LD) {
+                    trigger(Event:C1_Load_L1miss, in_msg.LineAddress,
+                            cache_entry, tbe);
+                  } else {
+                    trigger(Event:C1_Store_L1miss, in_msg.LineAddress,
+                            cache_entry, tbe);
+                  }
+                } else {
+                  Addr victim := L1D1cache.cacheProbe(in_msg.LineAddress);
+                  DPRINTF(RubySlicc, "Victim for %s L1D1_Repl is %s\n", in_msg.LineAddress, victim);
+                  trigger(Event:L1D1_Repl, victim,
+                          getCacheEntry(victim), TBEs.lookup(victim));
+                }
+              } else { // not present or avail in L2
+                Addr victim := L2cache.cacheProbe(in_msg.LineAddress);
+                DPRINTF(RubySlicc, "Victim for %s L2_Repl(2) is %s\n", in_msg.LineAddress, victim);
+                trigger(Event:L2_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
+              }
+            }
+          } else {
+            // cluster 0 (L1D0); note this path checks the entry rather than
+            // the tag (getL1CacheEntry/is_valid vs isTagPresent above)
+            Entry L1D0cache_entry := getL1CacheEntry(in_msg.LineAddress, 0);
+            if (is_valid(L1D0cache_entry)) {
+              if (in_msg.Type == RubyRequestType:LD) {
+                trigger(Event:C0_Load_L1hit, in_msg.LineAddress, cache_entry,
+                        tbe);
+              } else {
+                // store hit still needs an L2 slot (write-through)
+                if (presentOrAvail2(in_msg.LineAddress)) {
+                  trigger(Event:C0_Store_L1hit, in_msg.LineAddress, cache_entry,
+                          tbe);
+                } else {
+                  Addr victim := L2cache.cacheProbe(in_msg.LineAddress);
+                  DPRINTF(RubySlicc, "Victim for %s L2_Repl(3) is %s\n", in_msg.LineAddress, victim);
+                  trigger(Event:L2_Repl, victim, getCacheEntry(victim),
+                          TBEs.lookup(victim));
+                }
+              }
+            } else {
+              if (presentOrAvail2(in_msg.LineAddress)) {
+                if (presentOrAvailD0(in_msg.LineAddress)) {
+                  if (in_msg.Type == RubyRequestType:LD) {
+                    trigger(Event:C0_Load_L1miss, in_msg.LineAddress,
+                            cache_entry, tbe);
+                  } else {
+                    trigger(Event:C0_Store_L1miss, in_msg.LineAddress,
+                            cache_entry, tbe);
+                  }
+                } else {
+                  Addr victim := L1D0cache.cacheProbe(in_msg.LineAddress);
+                  DPRINTF(RubySlicc, "Victim for %s L1D0_Repl is %s\n", in_msg.LineAddress, victim);
+                  trigger(Event:L1D0_Repl, victim, getCacheEntry(victim),
+                          TBEs.lookup(victim));
+                }
+              } else {
+                Addr victim := L2cache.cacheProbe(in_msg.LineAddress);
+                DPRINTF(RubySlicc, "Victim for %s L2_Repl(4) is %s\n", in_msg.LineAddress, victim);
+                trigger(Event:L2_Repl, victim, getCacheEntry(victim),
+                        TBEs.lookup(victim));
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+
+  // ACTIONS
+
+  // Drop the line from the L1 instruction cache if it is resident.
+  action(ii_invIcache, "ii", desc="invalidate iCache") {
+    if (L1Icache.isTagPresent(address)) {
+      L1Icache.deallocate(address);
+    }
+  }
+
+  // Drop the line from cluster 0's L1 data cache if resident.
+  action(i0_invCluster, "i0", desc="invalidate cluster 0") {
+    if (L1D0cache.isTagPresent(address)) {
+      L1D0cache.deallocate(address);
+    }
+  }
+
+  // Drop the line from cluster 1's L1 data cache if resident.
+  action(i1_invCluster, "i1", desc="invalidate cluster 1") {
+    if (L1D1cache.isTagPresent(address)) {
+      L1D1cache.deallocate(address);
+    }
+  }
+
+  // Drop the line from both clusters' L1 data caches.
+  action(ib_invBothClusters, "ib", desc="invalidate both clusters") {
+    if (L1D0cache.isTagPresent(address)) {
+      L1D0cache.deallocate(address);
+    }
+    if (L1D1cache.isTagPresent(address)) {
+      L1D1cache.deallocate(address);
+    }
+  }
+
+  // Drop the L2 copy (keyed on the transition's cache_entry) and clear the
+  // cache_entry binding for the remainder of the transition.
+  action(i2_invL2, "i2", desc="invalidate L2") {
+    if(is_valid(cache_entry)) {
+      L2cache.deallocate(address);
+    }
+    unset_cache_entry();
+  }
+
+  // Send a plain read-block request to our peer (the NB/directory).
+  action(n_issueRdBlk, "n", desc="Issue RdBlk") {
+    enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceRequestType:RdBlk;
+      out_msg.Requestor := machineID;
+      out_msg.Destination.add(getPeer(machineID));
+      out_msg.MessageSize := MessageSizeType:Request_Control;
+      out_msg.InitialRequestTime := curCycle();  // for latency accounting
+    }
+  }
+
+  // Read-for-ownership (store miss) request.
+  action(nM_issueRdBlkM, "nM", desc="Issue RdBlkM") {
+    enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceRequestType:RdBlkM;
+      out_msg.Requestor := machineID;
+      out_msg.Destination.add(getPeer(machineID));
+      out_msg.MessageSize := MessageSizeType:Request_Control;
+      out_msg.InitialRequestTime := curCycle();
+    }
+  }
+
+  // Re-issued RdBlkM after a change-to-dirty was sunk by a racing probe.
+  // NOTE(review): unlike n/nM/nS, the Sinked variants do not set
+  // InitialRequestTime — confirm this is intentional (retry keeps no
+  // first-issue timestamp).
+  action(nMs_issueRdBlkMSinked, "nMs", desc="Issue RdBlkM with CtoDSinked") {
+    enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceRequestType:RdBlkM;
+      out_msg.Requestor := machineID;
+      out_msg.Destination.add(getPeer(machineID));
+      out_msg.MessageSize := MessageSizeType:Request_Control;
+      out_msg.CtoDSinked := true;
+    }
+  }
+
+  // Shared read-block request (instruction fetches).
+  action(nS_issueRdBlkS, "nS", desc="Issue RdBlkS") {
+    enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceRequestType:RdBlkS;
+      out_msg.Requestor := machineID;
+      out_msg.Destination.add(getPeer(machineID));
+      out_msg.MessageSize := MessageSizeType:Request_Control;
+      out_msg.InitialRequestTime := curCycle();
+    }
+  }
+
+  // Re-issued RdBlkS after a sunk change-to-dirty (see nMs above).
+  action(nSs_issueRdBlkSSinked, "nSs", desc="Issue RdBlkS with CtoDSinked") {
+    enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceRequestType:RdBlkS;
+      out_msg.Requestor := machineID;
+      out_msg.Destination.add(getPeer(machineID));
+      out_msg.CtoDSinked := true;
+      out_msg.MessageSize := MessageSizeType:Request_Control;
+    }
+  }
+
+  // Victimize a dirty (M/O) L2 line: send VicDirty with the data.  Shared is
+  // set for O (other sharers may exist) and cleared for M.
+  action(vd_victim, "vd", desc="Victimize M/O L2 Data") {
+    enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Requestor := machineID;
+      assert(is_valid(cache_entry));
+      out_msg.DataBlk := cache_entry.DataBlk;
+      assert(cache_entry.Dirty);  // only dirty lines take the VicDirty path
+      out_msg.Destination.add(getPeer(machineID));
+      out_msg.MessageSize := MessageSizeType:Request_Control;
+      out_msg.Type := CoherenceRequestType:VicDirty;
+      out_msg.InitialRequestTime := curCycle();
+      if (cache_entry.CacheState == State:O) {
+        out_msg.Shared := true;
+      } else {
+        out_msg.Shared := false;
+      }
+    }
+  }
+
+  // Victimize a clean (E/S) L2 line: VicClean, no data needed.  Shared is
+  // set for S and cleared for E.
+  action(vc_victim, "vc", desc="Victimize E/S L2 Data") {
+    enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Requestor := machineID;
+      out_msg.Destination.add(getPeer(machineID));
+      out_msg.MessageSize := MessageSizeType:Request_Control;
+      out_msg.Type := CoherenceRequestType:VicClean;
+      out_msg.InitialRequestTime := curCycle();
+      if (cache_entry.CacheState == State:S) {
+        out_msg.Shared := true;
+      } else {
+        out_msg.Shared := false;
+      }
+    }
+  }
+
+  // Could send these two directly to dir if we made a new out network on channel 0
+  // Forced-replacement variants of vd/vc (probe-initiated); identical except
+  // they mark the message Private.
+  action(vdf_victimForce, "vdf", desc="Victimize M/O L2 Data") {
+    enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Requestor := machineID;
+      assert(is_valid(cache_entry));
+      out_msg.DataBlk := cache_entry.DataBlk;
+      assert(cache_entry.Dirty);
+      out_msg.Destination.add(getPeer(machineID));
+      out_msg.MessageSize := MessageSizeType:Request_Control;
+      out_msg.Type := CoherenceRequestType:VicDirty;
+      out_msg.InitialRequestTime := curCycle();
+      if (cache_entry.CacheState == State:O) {
+        out_msg.Shared := true;
+      } else {
+        out_msg.Shared := false;
+      }
+      out_msg.Private := true;
+    }
+  }
+
+  action(vcf_victimForce, "vcf", desc="Victimize E/S L2 Data") {
+    enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Requestor := machineID;
+      out_msg.Destination.add(getPeer(machineID));
+      out_msg.MessageSize := MessageSizeType:Request_Control;
+      out_msg.Type := CoherenceRequestType:VicClean;
+      out_msg.InitialRequestTime := curCycle();
+      if (cache_entry.CacheState == State:S) {
+        out_msg.Shared := true;
+      } else {
+        out_msg.Shared := false;
+      }
+      out_msg.Private := true;
+    }
+  }
+
+  // Allocate a cluster-0 L1D entry if not already present.  allocateVoid:
+  // the L1 entry is tag/metadata only — the transition's cache_entry stays
+  // bound to the L2 copy.
+  action(a0_allocateL1D, "a0", desc="Allocate L1D0 Block") {
+    if (L1D0cache.isTagPresent(address) == false) {
+      L1D0cache.allocateVoid(address, new Entry);
+    }
+  }
+
+  // Allocate a cluster-1 L1D entry if not already present.
+  action(a1_allocateL1D, "a1", desc="Allocate L1D1 Block") {
+    if (L1D1cache.isTagPresent(address) == false) {
+      L1D1cache.allocateVoid(address, new Entry);
+    }
+  }
+
+  // Allocate an L1I entry if not already present.
+  action(ai_allocateL1I, "ai", desc="Allocate L1I Block") {
+    if (L1Icache.isTagPresent(address) == false) {
+      L1Icache.allocateVoid(address, new Entry);
+    }
+  }
+
+  // Allocate the L2 entry and bind it as this transition's cache_entry.
+  action(a2_allocateL2, "a2", desc="Allocate L2 Block") {
+    if (is_invalid(cache_entry)) {
+      set_cache_entry(L2cache.allocate(address, new Entry));
+    }
+  }
+
+  // Allocate a TBE and snapshot the L2 data/dirty bits into it; the
+  // snapshot is only consumed by writeback paths (wb_data, pdt).
+  action(t_allocateTBE, "t", desc="allocate TBE Entry") {
+    check_allocate(TBEs);  // stall if the TBE table is full
+    assert(is_valid(cache_entry));
+    TBEs.allocate(address);
+    set_tbe(TBEs.lookup(address));
+    tbe.DataBlk := cache_entry.DataBlk; // Data only used for WBs
+    tbe.Dirty := cache_entry.Dirty;
+    tbe.Shared := false;
+  }
+
+  // Release the TBE and clear the transition's tbe binding.
+  action(d_deallocateTBE, "d", desc="Deallocate TBE") {
+    TBEs.deallocate(address);
+    unset_tbe();
+  }
+
+  // Queue-pop actions: consume the message at the head of each in_port once
+  // the triggering transition has used it.
+  action(p_popMandatoryQueue, "pm", desc="Pop Mandatory Queue") {
+    mandatoryQueue_in.dequeue(clockEdge());
+  }
+
+  action(pr_popResponseQueue, "pr", desc="Pop Response Queue") {
+    responseToCore_in.dequeue(clockEdge());
+  }
+
+  action(pt_popTriggerQueue, "pt", desc="Pop Trigger Queue") {
+    triggerQueue_in.dequeue(clockEdge());
+  }
+
+  action(pp_popProbeQueue, "pp", desc="pop probe queue") {
+    probeNetwork_in.dequeue(clockEdge());
+  }
+
+  // Local (L1/L2-hit) load completions.  The data handed to the sequencer
+  // always comes from the L2 entry: functional accesses only look in L2,
+  // which is safe because the L1s write through to L2.  sequencer serves
+  // cluster 0, sequencer1 serves cluster 1.
+  action(il0_loadDone, "il0", desc="Cluster 0 i load done") {
+    Entry entry := getICacheEntry(address);
+    Entry l2entry := getCacheEntry(address); // Used for functional accesses
+    assert(is_valid(entry));
+    // L2 supplies data (functional accesses only look in L2, ok because L1
+    // writes through to L2)
+    sequencer.readCallback(address,
+                           l2entry.DataBlk,
+                           true,
+                           testAndClearLocalHit(entry));
+  }
+
+  action(il1_loadDone, "il1", desc="Cluster 1 i load done") {
+    Entry entry := getICacheEntry(address);
+    Entry l2entry := getCacheEntry(address); // Used for functional accesses
+    assert(is_valid(entry));
+    // L2 supplies data (functional accesses only look in L2, ok because L1
+    // writes through to L2)
+    sequencer1.readCallback(address,
+                            l2entry.DataBlk,
+                            true,
+                            testAndClearLocalHit(entry));
+  }
+
+  action(l0_loadDone, "l0", desc="Cluster 0 load done") {
+    Entry entry := getL1CacheEntry(address, 0);
+    Entry l2entry := getCacheEntry(address); // Used for functional accesses
+    assert(is_valid(entry));
+    // L2 supplies data (functional accesses only look in L2, ok because L1
+    // writes through to L2)
+    sequencer.readCallback(address,
+                           l2entry.DataBlk,
+                           true,
+                           testAndClearLocalHit(entry));
+  }
+
+  action(l1_loadDone, "l1", desc="Cluster 1 load done") {
+    Entry entry := getL1CacheEntry(address, 1);
+    Entry l2entry := getCacheEntry(address); // Used for functional accesses
+    assert(is_valid(entry));
+    // L2 supplies data (functional accesses only look in L2, ok because L1
+    // writes through to L2)
+    sequencer1.readCallback(address,
+                            l2entry.DataBlk,
+                            true,
+                            testAndClearLocalHit(entry));
+  }
+
+  // External (miss) load completions: the fill response is still at the head
+  // of responseToCore_in, so peek it for the sender and the timing fields
+  // that feed the sequencer's miss-latency accounting.  Responses may only
+  // come from the Directory or the L3.
+  action(xl0_loadDone, "xl0", desc="Cluster 0 load done") {
+    peek(responseToCore_in, ResponseMsg) {
+      assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) ||
+             (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache));
+      Entry l2entry := getCacheEntry(address); // Used for functional accesses
+      DPRINTF(ProtocolTrace, "CP Load Done 0 -- address %s, data: %s\n",
+              address, l2entry.DataBlk);
+      // L2 supplies data (functional accesses only look in L2, ok because L1
+      // writes through to L2)
+      assert(is_valid(l2entry));
+      sequencer.readCallback(address,
+                             l2entry.DataBlk,
+                             false,
+                             machineIDToMachineType(in_msg.Sender),
+                             in_msg.InitialRequestTime,
+                             in_msg.ForwardRequestTime,
+                             in_msg.ProbeRequestStartTime);
+    }
+  }
+
+  action(xl1_loadDone, "xl1", desc="Cluster 1 load done") {
+    peek(responseToCore_in, ResponseMsg) {
+      assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) ||
+             (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache));
+      Entry l2entry := getCacheEntry(address); // Used for functional accesses
+      // L2 supplies data (functional accesses only look in L2, ok because L1
+      // writes through to L2)
+      assert(is_valid(l2entry));
+      sequencer1.readCallback(address,
+                              l2entry.DataBlk,
+                              false,
+                              machineIDToMachineType(in_msg.Sender),
+                              in_msg.InitialRequestTime,
+                              in_msg.ForwardRequestTime,
+                              in_msg.ProbeRequestStartTime);
+    }
+  }
+
+  // Instruction-fetch miss completions (same shape as xl0/xl1).
+  action(xi0_loadDone, "xi0", desc="Cluster 0 i-load done") {
+    peek(responseToCore_in, ResponseMsg) {
+      assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) ||
+             (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache));
+      Entry l2entry := getCacheEntry(address); // Used for functional accesses
+      // L2 supplies data (functional accesses only look in L2, ok because L1
+      // writes through to L2)
+      assert(is_valid(l2entry));
+      sequencer.readCallback(address,
+                             l2entry.DataBlk,
+                             false,
+                             machineIDToMachineType(in_msg.Sender),
+                             in_msg.InitialRequestTime,
+                             in_msg.ForwardRequestTime,
+                             in_msg.ProbeRequestStartTime);
+    }
+  }
+
+  action(xi1_loadDone, "xi1", desc="Cluster 1 i-load done") {
+    peek(responseToCore_in, ResponseMsg) {
+      assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) ||
+             (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache));
+      Entry l2entry := getCacheEntry(address); // Used for functional accesses
+      // L2 supplies data (functional accesses only look in L2, ok because L1
+      // writes through to L2)
+      assert(is_valid(l2entry));
+      sequencer1.readCallback(address,
+                              l2entry.DataBlk,
+                              false,
+                              machineIDToMachineType(in_msg.Sender),
+                              in_msg.InitialRequestTime,
+                              in_msg.ForwardRequestTime,
+                              in_msg.ProbeRequestStartTime);
+    }
+  }
+
+  // Store completions.  The sequencer writes the new data into the L2 copy
+  // (cache_entry.DataBlk) via writeCallback; afterwards both L2 and the
+  // per-cluster L1 copy are marked dirty and the L1 copy is refreshed from
+  // L2 (write-through keeps them identical).
+  action(s0_storeDone, "s0", desc="Cluster 0 store done") {
+    Entry entry := getL1CacheEntry(address, 0);
+    assert(is_valid(entry));
+    assert(is_valid(cache_entry));
+    sequencer.writeCallback(address,
+                            cache_entry.DataBlk,
+                            true,
+                            testAndClearLocalHit(entry));
+    cache_entry.Dirty := true;
+    entry.DataBlk := cache_entry.DataBlk;
+    entry.Dirty := true;
+    DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk);
+  }
+
+  action(s1_storeDone, "s1", desc="Cluster 1 store done") {
+    Entry entry := getL1CacheEntry(address, 1);
+    assert(is_valid(entry));
+    assert(is_valid(cache_entry));
+    sequencer1.writeCallback(address,
+                             cache_entry.DataBlk,
+                             true,
+                             testAndClearLocalHit(entry));
+    cache_entry.Dirty := true;
+    entry.Dirty := true;
+    entry.DataBlk := cache_entry.DataBlk;
+    DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk);
+  }
+
+  // External (miss) store completions: peek the fill response for sender and
+  // timing fields, as in the xl*/xi* load actions.
+  action(xs0_storeDone, "xs0", desc="Cluster 0 store done") {
+    peek(responseToCore_in, ResponseMsg) {
+      Entry entry := getL1CacheEntry(address, 0);
+      assert(is_valid(entry));
+      assert(is_valid(cache_entry));
+      assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) ||
+             (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache));
+      sequencer.writeCallback(address,
+                              cache_entry.DataBlk,
+                              false,
+                              machineIDToMachineType(in_msg.Sender),
+                              in_msg.InitialRequestTime,
+                              in_msg.ForwardRequestTime,
+                              in_msg.ProbeRequestStartTime);
+      cache_entry.Dirty := true;
+      entry.Dirty := true;
+      entry.DataBlk := cache_entry.DataBlk;
+      DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk);
+    }
+  }
+
+  action(xs1_storeDone, "xs1", desc="Cluster 1 store done") {
+    peek(responseToCore_in, ResponseMsg) {
+      Entry entry := getL1CacheEntry(address, 1);
+      assert(is_valid(entry));
+      assert(is_valid(cache_entry));
+      assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) ||
+             (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache));
+      sequencer1.writeCallback(address,
+                               cache_entry.DataBlk,
+                               false,
+                               machineIDToMachineType(in_msg.Sender),
+                               in_msg.InitialRequestTime,
+                               in_msg.ForwardRequestTime,
+                               in_msg.ProbeRequestStartTime);
+      cache_entry.Dirty := true;
+      entry.Dirty := true;
+      entry.DataBlk := cache_entry.DataBlk;
+      DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk);
+    }
+  }
+
+  // Notify the cluster-0 core that a line it may have observed is being
+  // invalidated (needed e.g. for LL/SC and x86 speculation squashing);
+  // gated by the send_evictions config parameter.
+  action(forward_eviction_to_cpu0, "fec0", desc="sends eviction information to processor0") {
+    if (send_evictions) {
+      DPRINTF(RubySlicc, "Sending invalidation for %s to the CPU\n", address);
+      sequencer.evictionCallback(address);
+    }
+  }
+
+  // Same, for the cluster-1 core.
+  action(forward_eviction_to_cpu1, "fec1", desc="sends eviction information to processor1") {
+    if (send_evictions) {
+      DPRINTF(RubySlicc, "Sending invalidation for %s to the CPU\n", address);
+      sequencer1.evictionCallback(address);
+    }
+  }
+
+  // Copy the L2 line into the L1I entry; FromL2 marks the copy so the first
+  // subsequent access is not counted as a pure local hit — presumably
+  // consumed by testAndClearLocalHit; confirm against that helper.
+  action(ci_copyL2ToL1, "ci", desc="copy L2 data to L1") {
+    Entry entry := getICacheEntry(address);
+    assert(is_valid(entry));
+    assert(is_valid(cache_entry));
+    entry.Dirty := cache_entry.Dirty;
+    entry.DataBlk := cache_entry.DataBlk;
+    entry.FromL2 := true;
+  }
+
+  // Copy the L2 line into cluster 0's L1D entry.
+  action(c0_copyL2ToL1, "c0", desc="copy L2 data to L1") {
+    Entry entry := getL1CacheEntry(address, 0);
+    assert(is_valid(entry));
+    assert(is_valid(cache_entry));
+    entry.Dirty := cache_entry.Dirty;
+    entry.DataBlk := cache_entry.DataBlk;
+    entry.FromL2 := true;
+  }
+
+  // Tell the directory our copy went stale before the writeback completed,
+  // so it should not expect data from us.
+  action(ss_sendStaleNotification, "ss", desc="stale data; nothing to writeback") {
+    peek(responseToCore_in, ResponseMsg) {
+      enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+        out_msg.addr := address;
+        out_msg.Type := CoherenceResponseType:StaleNotif;
+        out_msg.Sender := machineID;
+        out_msg.Destination.add(map_Address_to_Directory(address));
+        out_msg.MessageSize := MessageSizeType:Response_Control;
+        DPRINTF(RubySlicc, "%s\n", out_msg);
+      }
+    }
+  }
+
+  // Copy the L2 line into cluster 1's L1D entry.
+  action(c1_copyL2ToL1, "c1", desc="copy L2 data to L1") {
+    Entry entry := getL1CacheEntry(address, 1);
+    assert(is_valid(entry));
+    assert(is_valid(cache_entry));
+    entry.Dirty := cache_entry.Dirty;
+    entry.DataBlk := cache_entry.DataBlk;
+    entry.FromL2 := true;
+  }
+
+  // Schedule an L2->L1 fill: enqueue a self-trigger after l2_hit_latency so
+  // the fill completes with L2-hit timing; Dest selects which L1 receives it.
+  action(fi_L2ToL1, "fi", desc="L2 to L1 inst fill") {
+    enqueue(triggerQueue_out, TriggerMsg, l2_hit_latency) {
+      out_msg.addr := address;
+      out_msg.Type := TriggerType:L2_to_L1;
+      out_msg.Dest := CacheId:L1I;
+    }
+  }
+
+  action(f0_L2ToL1, "f0", desc="L2 to L1 data fill") {
+    enqueue(triggerQueue_out, TriggerMsg, l2_hit_latency) {
+      out_msg.addr := address;
+      out_msg.Type := TriggerType:L2_to_L1;
+      out_msg.Dest := CacheId:L1D0;
+    }
+  }
+
+  action(f1_L2ToL1, "f1", desc="L2 to L1 data fill") {
+    enqueue(triggerQueue_out, TriggerMsg, l2_hit_latency) {
+      out_msg.addr := address;
+      out_msg.Type := TriggerType:L2_to_L1;
+      out_msg.Dest := CacheId:L1D1;
+    }
+  }
+
+  // Install fill data from the response at the head of responseToCore_in
+  // into both the chosen L1 entry and the L2 entry (inclusive hierarchy).
+  action(wi_writeIcache, "wi", desc="write data to icache (and l2)") {
+    peek(responseToCore_in, ResponseMsg) {
+      Entry entry := getICacheEntry(address);
+      assert(is_valid(entry));
+      assert(is_valid(cache_entry));
+      entry.DataBlk := in_msg.DataBlk;
+      entry.Dirty := in_msg.Dirty;
+      cache_entry.DataBlk := in_msg.DataBlk;
+      cache_entry.Dirty := in_msg.Dirty;
+    }
+  }
+
+  action(w0_writeDcache, "w0", desc="write data to dcache 0 (and l2)") {
+    peek(responseToCore_in, ResponseMsg) {
+      Entry entry := getL1CacheEntry(address, 0);
+      assert(is_valid(entry));
+      assert(is_valid(cache_entry));
+      entry.DataBlk := in_msg.DataBlk;
+      entry.Dirty := in_msg.Dirty;
+      cache_entry.DataBlk := in_msg.DataBlk;
+      cache_entry.Dirty := in_msg.Dirty;
+    }
+  }
+
+  action(w1_writeDcache, "w1", desc="write data to dcache 1 (and l2)") {
+    peek(responseToCore_in, ResponseMsg) {
+      Entry entry := getL1CacheEntry(address, 1);
+      assert(is_valid(entry));
+      assert(is_valid(cache_entry));
+      entry.DataBlk := in_msg.DataBlk;
+      entry.Dirty := in_msg.Dirty;
+      cache_entry.DataBlk := in_msg.DataBlk;
+      cache_entry.Dirty := in_msg.Dirty;
+    }
+  }
+
+  // Send the writeback data held in the TBE (snapshotted by t_allocateTBE)
+  // to the directory.  NbReqShared echoes whether a shared probe hit us
+  // while the writeback was pending (tbe.Shared, set by s_setSharedFlip).
+  action(wb_data, "wb", desc="write back data") {
+    peek(responseToCore_in, ResponseMsg) {
+      enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+        out_msg.addr := address;
+        out_msg.Type := CoherenceResponseType:CPUData;
+        out_msg.Sender := machineID;
+        out_msg.Destination.add(map_Address_to_Directory(address));
+        out_msg.DataBlk := tbe.DataBlk;
+        out_msg.Dirty := tbe.Dirty;
+        if (tbe.Shared) {
+          out_msg.NbReqShared := true;
+        } else {
+          out_msg.NbReqShared := false;
+        }
+        out_msg.State := CoherenceState:Shared; // faux info
+        out_msg.MessageSize := MessageSizeType:Writeback_Data;
+        DPRINTF(RubySlicc, "%s\n", out_msg);
+      }
+    }
+  }
+
+  // Dataless probe responses.  All of them reuse CPUPrbResp (L3 and CPUs
+  // respond to probes with the same message type) and carry the current
+  // validity of the line via isValid(address).
+
+  // Ack an invalidating probe: no data, line not here (Ntsl set).
+  action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") {
+    enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes
+      out_msg.Sender := machineID;
+      out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket
+      out_msg.Dirty := false;
+      out_msg.Hit := false;
+      out_msg.Ntsl := true;
+      out_msg.State := CoherenceState:NA;
+      out_msg.MessageSize := MessageSizeType:Response_Control;
+      out_msg.isValid := isValid(address);
+    }
+  }
+
+  // Same as pi, used on transitions out of M-ish states ("Setting Ms" is
+  // only a trace annotation).
+  action(pim_sendProbeResponseInvMs, "pim", desc="send probe ack inv, no data") {
+    enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes
+      out_msg.Sender := machineID;
+      out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket
+      out_msg.Dirty := false;
+      out_msg.Ntsl := true;
+      out_msg.Hit := false;
+      APPEND_TRANSITION_COMMENT("Setting Ms");
+      out_msg.State := CoherenceState:NA;
+      out_msg.MessageSize := MessageSizeType:Response_Control;
+      out_msg.isValid := isValid(address);
+    }
+  }
+
+  // Ack a shared probe with Hit set but no data (clean copy retained).
+  action(ph_sendProbeResponseHit, "ph", desc="send probe ack PrbShrData, no data") {
+    enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes
+      out_msg.Sender := machineID;
+      out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket
+      assert(addressInCore(address) || is_valid(tbe));
+      out_msg.Dirty := false; // only true if sending back data i think
+      out_msg.Hit := true;
+      out_msg.Ntsl := false;
+      out_msg.State := CoherenceState:NA;
+      out_msg.MessageSize := MessageSizeType:Response_Control;
+      out_msg.isValid := isValid(address);
+    }
+  }
+
+  // Backprobe ack: Hit reflects whether any L1/L2 in this core pair still
+  // holds the line.
+  action(pb_sendProbeResponseBackprobe, "pb", desc="send probe ack PrbShrData, no data, check for L1 residence") {
+    enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes
+      out_msg.Sender := machineID;
+      out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket
+      if (addressInCore(address)) {
+        out_msg.Hit := true;
+      } else {
+        out_msg.Hit := false;
+      }
+      out_msg.Dirty := false; // not sending back data, so def. not dirty
+      out_msg.Ntsl := false;
+      out_msg.State := CoherenceState:NA;
+      out_msg.MessageSize := MessageSizeType:Response_Control;
+      out_msg.isValid := isValid(address);
+    }
+  }
+
+  // Probe responses carrying data.  pd/pdm source the data from the live L2
+  // entry; pdt sources it from the TBE (line already victimized).  All three
+  // require the copy to be dirty.
+  action(pd_sendProbeResponseData, "pd", desc="send probe ack, with data") {
+    enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+      assert(is_valid(cache_entry));
+      out_msg.addr := address;
+      out_msg.Type := CoherenceResponseType:CPUPrbResp;
+      out_msg.Sender := machineID;
+      out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket
+      out_msg.DataBlk := cache_entry.DataBlk;
+      assert(cache_entry.Dirty);
+      out_msg.Dirty := true;
+      out_msg.Hit := true;
+      out_msg.State := CoherenceState:NA;
+      out_msg.MessageSize := MessageSizeType:Response_Data;
+      out_msg.isValid := isValid(address);
+    }
+  }
+
+  // As pd, with an extra "Setting Ms" trace annotation for M-state paths.
+  action(pdm_sendProbeResponseDataMs, "pdm", desc="send probe ack, with data") {
+    enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+      assert(is_valid(cache_entry));
+      out_msg.addr := address;
+      out_msg.Type := CoherenceResponseType:CPUPrbResp;
+      out_msg.Sender := machineID;
+      out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket
+      out_msg.DataBlk := cache_entry.DataBlk;
+      assert(cache_entry.Dirty);
+      out_msg.Dirty := true;
+      out_msg.Hit := true;
+      APPEND_TRANSITION_COMMENT("Setting Ms");
+      out_msg.State := CoherenceState:NA;
+      out_msg.MessageSize := MessageSizeType:Response_Data;
+      out_msg.isValid := isValid(address);
+    }
+  }
+
+  // Probe ack with data taken from the TBE snapshot (writeback in flight).
+  action(pdt_sendProbeResponseDataFromTBE, "pdt", desc="send probe ack with data") {
+    enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+      assert(is_valid(tbe));
+      out_msg.addr := address;
+      out_msg.Type := CoherenceResponseType:CPUPrbResp;
+      out_msg.Sender := machineID;
+      out_msg.Destination.add(map_Address_to_Directory(address));
+      out_msg.DataBlk := tbe.DataBlk;
+      assert(tbe.Dirty);
+      out_msg.Dirty := true;
+      out_msg.Hit := true;
+      out_msg.State := CoherenceState:NA;
+      out_msg.MessageSize := MessageSizeType:Response_Data;
+      out_msg.isValid := isValid(address);
+    }
+  }
+
+  // Ack a forced replacement back to the region buffer, unless a TBE exists
+  // that has not been marked AckNeeded (in that case the ack is deferred
+  // until the TBE completes — see m_markAckNeeded).
+  action(ra_sendReplAck, "ra", desc="Send ack to r-buf that line is replaced if needed") {
+    if (is_invalid(tbe) || tbe.AckNeeded) {
+      enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+        out_msg.addr := address;
+        out_msg.Type := CoherenceRequestType:InvAck;
+        out_msg.Requestor := machineID;
+        out_msg.Destination.add(getPeer(machineID));
+        out_msg.MessageSize := MessageSizeType:Request_Control;
+      }
+      APPEND_TRANSITION_COMMENT(" Sending ack to r-buf ");
+    } else {
+      APPEND_TRANSITION_COMMENT(" NOT Sending ack to r-buf ");
+    }
+  }
+
+  // Defer the region-buffer ack: record that it must be sent when the
+  // pending TBE is deallocated.
+  action(m_markAckNeeded, "m", desc="Mark TBE to send ack when deallocated") {
+    assert(is_valid(tbe));
+    tbe.AckNeeded := true;
+  }
+
+  // Cancel a pending writeback at the directory/L3.
+  action(mc_cancelWB, "mc", desc="send writeback cancel to L3") {
+    enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceResponseType:CPUCancelWB;
+      out_msg.Destination.add(map_Address_to_Directory(address));
+      out_msg.Sender := machineID;
+      out_msg.MessageSize := MessageSizeType:Response_Control;
+    }
+  }
+
+  // A shared probe hit us mid-transaction; remember it so wb_data reports
+  // NbReqShared.
+  action(s_setSharedFlip, "s", desc="hit by shared probe, status may be different") {
+    assert(is_valid(tbe));
+    tbe.Shared := true;
+  }
+
+  // Tell the directory our state change is complete so it can unblock the
+  // line for other requestors.
+  action(uu_sendUnblock, "uu", desc="state changed, unblock") {
+    enqueue(unblockNetwork_out, UnblockMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Destination.add(map_Address_to_Directory(address));
+      out_msg.MessageSize := MessageSizeType:Unblock_Control;
+      out_msg.wasValid := isValid(address);
+      DPRINTF(RubySlicc, "%s\n", out_msg);
+    }
+  }
+
+  // Done-ack to our peer; the line remains valid here.  Dirty is taken from
+  // the TBE if one exists, else from the cache entry, else false.
+  action(sdv_sendDoneValid, "sdv", desc="Request finished, send done ack") {
+    enqueue(unblockNetwork_out, UnblockMsg, 1) {
+      out_msg.addr := address;
+      out_msg.Destination.add(getPeer(machineID));
+      out_msg.DoneAck := true;
+      out_msg.MessageSize := MessageSizeType:Unblock_Control;
+      if (is_valid(tbe)) {
+        out_msg.Dirty := tbe.Dirty;
+      } else if (is_valid(cache_entry)) {
+        out_msg.Dirty := cache_entry.Dirty;
+      } else {
+        out_msg.Dirty := false;
+      }
+      out_msg.validToInvalid := false;
+      DPRINTF(RubySlicc, "%s\n", out_msg);
+    }
+  }
+
+  // Done-ack to our peer; the line transitioned to invalid here
+  // (validToInvalid := true is the only difference from sdv).
+  action(sdi_sendDoneInvalid, "sdi", desc="Request finished, send done ack") {
+    enqueue(unblockNetwork_out, UnblockMsg, 1) {
+      out_msg.addr := address;
+      out_msg.Destination.add(getPeer(machineID));
+      out_msg.DoneAck := true;
+      out_msg.MessageSize := MessageSizeType:Unblock_Control;
+      if (is_valid(tbe)) {
+        out_msg.Dirty := tbe.Dirty;
+      } else if (is_valid(cache_entry)) {
+        out_msg.Dirty := cache_entry.Dirty;
+      } else {
+        out_msg.Dirty := false;
+      }
+      out_msg.validToInvalid := true;
+      DPRINTF(RubySlicc, "%s\n", out_msg);
+    }
+  }
+
+  // Stat-profiling actions: bump the per-cache demand-miss counter of the
+  // cache named in the action.
+  action(l10m_profileMiss, "l10m", desc="l10m miss profile") {
+    ++L1D0cache.demand_misses;
+  }
+
+  action(l11m_profileMiss, "l11m", desc="l11m miss profile") {
+    ++L1D1cache.demand_misses;
+  }
+
+  // Fix: the shorthand was "l1lm" (lowercase L), a typo inconsistent with
+  // the action name and desc "l1im".  Shorthands appear in transition
+  // traces/tables, so make it match; "l1im" stays unique among shorthands.
+  action(l1im_profileMiss, "l1im", desc="l1im miss profile") {
+    ++L1Icache.demand_misses;
+  }
+
+  action(l2m_profileMiss, "l2m", desc="l2m miss profile") {
+    ++L2cache.demand_misses;
+  }
+
+  // Recycle actions: push the head message to the back of its queue and
+  // retry after recycle_latency — used when a transition cannot currently
+  // service the message (resource stall) instead of blocking the port.
+  action(yy_recycleProbeQueue, "yy", desc="recycle probe queue") {
+    probeNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+  }
+
+  action(zz_recycleMandatoryQueue, "\z", desc="recycle mandatory queue") {
+    mandatoryQueue_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+  }
+  // END ACTIONS
+
+ // BEGIN TRANSITIONS
+
+  // transitions from base
+  // Cluster-0 data-load miss from I: allocate L1D0+L2, invalidate the other
+  // cluster's copies (exclusive-per-cluster L1s), issue RdBlk, wait in I_E0S.
+  transition(I, C0_Load_L1miss, I_E0S) {L1D0TagArrayRead, L2TagArrayRead} {
+    // track misses, if implemented
+    // since in I state, L2 miss as well
+    l2m_profileMiss;
+    l10m_profileMiss;
+    // Fix: dropped the spurious l1im_profileMiss — this is a data load, not
+    // an ifetch, so the L1I demand-miss counter must not be bumped (the
+    // parallel C1_Load_L1miss transition correctly has no such action).
+    a0_allocateL1D;
+    a2_allocateL2;
+    i1_invCluster;
+    ii_invIcache;
+    n_issueRdBlk;
+    p_popMandatoryQueue;
+  }
+
+  // Cluster-1 data-load miss from I: mirror of the C0 case — allocate
+  // L1D1+L2, invalidate cluster 0 and the I-cache, issue RdBlk.
+  transition(I, C1_Load_L1miss, I_E1S) {L1D1TagArrayRead, L2TagArrayRead} {
+    // track misses, if implemented
+    // since in I state, L2 miss as well
+    l2m_profileMiss;
+    l11m_profileMiss;
+    a1_allocateL1D;
+    a2_allocateL2;
+    i0_invCluster;
+    ii_invIcache;
+    n_issueRdBlk;
+    p_popMandatoryQueue;
+  }
+
+  // Cluster-0 ifetch miss from I: allocate L1I+L2, invalidate both D-cache
+  // copies, issue a shared read (RdBlkS), wait in S0.
+  transition(I, Ifetch0_L1miss, S0) {L1ITagArrayRead, L2TagArrayRead} {
+    // track misses, if implemented
+    // L2 miss as well
+    // Fix: dropped the spurious l10m_profileMiss — this is an instruction
+    // fetch, so the L1D0 demand-miss counter must not be bumped.
+    l2m_profileMiss;
+    l1im_profileMiss;
+    ai_allocateL1I;
+    a2_allocateL2;
+    ib_invBothClusters;
+    nS_issueRdBlkS;
+    p_popMandatoryQueue;
+  }
+
+  // Cluster-1 ifetch miss from I: mirror of the Ifetch0 case.
+  transition(I, Ifetch1_L1miss, S1) {L1ITagArrayRead, L2TagArrayRead} {
+    // Fix: dropped the spurious l11m_profileMiss — this is an instruction
+    // fetch, so the L1D1 demand-miss counter must not be bumped.
+    // track misses, if implemented
+    // L2 miss as well
+    l2m_profileMiss;
+    l1im_profileMiss;
+    ai_allocateL1I;
+    a2_allocateL2;
+    ib_invBothClusters;
+    nS_issueRdBlkS;
+    p_popMandatoryQueue;
+  }
+
+  // Cluster-0 store miss from I: allocate L1D0+L2, invalidate the other
+  // cluster and the I-cache, issue read-for-ownership (RdBlkM).
+  transition(I, C0_Store_L1miss, I_M0) {L1D0TagArrayRead,L2TagArrayRead} {
+    l2m_profileMiss;
+    l10m_profileMiss;
+    a0_allocateL1D;
+    a2_allocateL2;
+    i1_invCluster;
+    ii_invIcache;
+    nM_issueRdBlkM;
+    p_popMandatoryQueue;
+  }
+
+  // Cluster-1 store miss from I: mirror of the C0 store-miss case.
+  // Fix: the resource list read L1D0TagArrayRead, but this transition
+  // allocates/operates on cluster 1's L1D1 (a1_allocateL1D) — use
+  // L1D1TagArrayRead, matching the S->S_M1 store transition below.
+  transition(I, C1_Store_L1miss, I_M1) {L1D1TagArrayRead, L2TagArrayRead} {
+    l2m_profileMiss;
+    l11m_profileMiss;
+    a1_allocateL1D;
+    a2_allocateL2;
+    i0_invCluster;
+    ii_invIcache;
+    nM_issueRdBlkM;
+    p_popMandatoryQueue;
+  }
+
+  // L1 misses that hit in L2 while Shared: allocate the missing L1 entry
+  // and schedule a delayed L2->L1 fill (f*/fi actions); no NB traffic.
+  transition(S, C0_Load_L1miss, S_F0) {L1D0TagArrayRead, L2TagArrayRead, L2DataArrayRead} {
+    l10m_profileMiss;
+    a0_allocateL1D;
+    f0_L2ToL1;
+    p_popMandatoryQueue;
+  }
+
+  transition(S, C1_Load_L1miss, S_F1) {L1D1TagArrayRead, L2TagArrayRead, L2DataArrayRead} {
+    l11m_profileMiss;
+    a1_allocateL1D;
+    f1_L2ToL1;
+    p_popMandatoryQueue;
+  }
+
+  transition(S, Ifetch0_L1miss, Si_F0) {L1ITagArrayRead,L2TagArrayRead, L2DataArrayRead} {
+    l1im_profileMiss;
+    ai_allocateL1I;
+    fi_L2ToL1;
+    p_popMandatoryQueue;
+  }
+
+  transition(S, Ifetch1_L1miss, Si_F1) {L1ITagArrayRead, L2TagArrayRead, L2DataArrayRead} {
+    l1im_profileMiss;
+    ai_allocateL1I;
+    fi_L2ToL1;
+    p_popMandatoryQueue;
+  }
+
+ transition({S}, {C0_Store_L1hit, C0_Store_L1miss}, S_M0) {L1D0TagArrayRead, L2TagArrayRead}{
+ l2m_profileMiss;
+ l10m_profileMiss;
+ a0_allocateL1D;
+ i1_invCluster;
+ ii_invIcache;
+ nM_issueRdBlkM;
+ p_popMandatoryQueue;
+ }
+
+ transition({S}, {C1_Store_L1hit, C1_Store_L1miss}, S_M1) {L1D1TagArrayRead,L2TagArrayRead} {
+ l2m_profileMiss;
+ l11m_profileMiss;
+ a1_allocateL1D;
+ i0_invCluster;
+ ii_invIcache;
+ nM_issueRdBlkM;
+ p_popMandatoryQueue;
+ }
+ transition(Es, C0_Load_L1miss, Es_F0) {L1D0TagArrayRead, L2TagArrayRead, L2DataArrayRead} { // can this be folded with S_F?
+ l10m_profileMiss;
+ a0_allocateL1D;
+ f0_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition(Es, C1_Load_L1miss, Es_F1) {L1D1TagArrayRead, L2TagArrayRead, L2DataArrayRead} { // can this be folded with S_F?
+ l11m_profileMiss;
+ a1_allocateL1D;
+ f1_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition(Es, Ifetch0_L1miss, S0) {L1ITagArrayRead, L2TagArrayRead} {
+ l1im_profileMiss;
+ i2_invL2;
+ ai_allocateL1I;
+ a2_allocateL2;
+ ib_invBothClusters;
+ nS_issueRdBlkS;
+ p_popMandatoryQueue;
+ }
+
+ transition(Es, Ifetch1_L1miss, S1) {L1ITagArrayRead, L2TagArrayRead} {
+ l1im_profileMiss;
+ i2_invL2;
+ ai_allocateL1I;
+ a2_allocateL2;
+ ib_invBothClusters;
+ nS_issueRdBlkS;
+ p_popMandatoryQueue;
+ }
+
+ // THES SHOULD NOT BE INSTANTANEOUS BUT OH WELL FOR NOW
+ transition(Es, {C0_Store_L1hit, C0_Store_L1miss}, M0) {L1D0TagArrayWrite,L1D0TagArrayRead, L2TagArrayRead, L1D0DataArrayWrite, L2TagArrayWrite, L2DataArrayWrite} {
+ a0_allocateL1D;
+ i1_invCluster;
+ s0_storeDone; // instantaneous L1/L2 dirty - no writethrough delay
+ p_popMandatoryQueue;
+ }
+
+  // Es -> M1: cluster-1 store; dirty L1/L2 instantaneously (no writethrough
+  // delay yet, see note above). Fix: include L2TagArrayRead, matching the
+  // symmetric C0 transition — the L2 tag must be read before being rewritten.
+  transition(Es, {C1_Store_L1hit, C1_Store_L1miss}, M1) {L1D1TagArrayRead, L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayRead, L2TagArrayWrite, L2DataArrayWrite} {
+    a1_allocateL1D;
+    i0_invCluster;
+    s1_storeDone;
+    p_popMandatoryQueue;
+  }
+
+ transition(E0, C0_Load_L1miss, E0_F) {L1D0TagArrayRead, L2TagArrayRead, L2DataArrayRead} {
+ l10m_profileMiss;
+ a0_allocateL1D;
+ f0_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition(E0, C1_Load_L1miss, E0_Es) {L1D0TagArrayRead, L2TagArrayRead, L2DataArrayRead} {
+ l11m_profileMiss;
+ a1_allocateL1D;
+ f1_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition(E0, Ifetch0_L1miss, S0) {L2TagArrayRead, L1ITagArrayRead} {
+ l2m_profileMiss; // permissions miss, still issue RdBlkS
+ l1im_profileMiss;
+ i2_invL2;
+ ai_allocateL1I;
+ a2_allocateL2;
+ i0_invCluster;
+ nS_issueRdBlkS;
+ p_popMandatoryQueue;
+ }
+
+ transition(E0, Ifetch1_L1miss, S1) {L2TagArrayRead, L1ITagArrayRead } {
+ l2m_profileMiss; // permissions miss, still issue RdBlkS
+ l1im_profileMiss;
+ i2_invL2;
+ ai_allocateL1I;
+ a2_allocateL2;
+ i0_invCluster;
+ nS_issueRdBlkS;
+ p_popMandatoryQueue;
+ }
+
+ transition(E0, {C0_Store_L1hit, C0_Store_L1miss}, M0) {L1D0TagArrayRead, L1D0DataArrayWrite, L1D0TagArrayWrite, L2TagArrayRead, L2DataArrayWrite, L2TagArrayWrite} {
+ a0_allocateL1D;
+ s0_storeDone;
+ p_popMandatoryQueue;
+ }
+
+ transition(E0, C1_Store_L1miss, M1) {L1D0TagArrayRead, L1D0TagArrayWrite, L2TagArrayRead, L2TagArrayWrite, L2DataArrayWrite} {
+ a1_allocateL1D;
+ l11m_profileMiss;
+ i0_invCluster;
+ s1_storeDone;
+ p_popMandatoryQueue;
+ }
+
+ transition(E1, C1_Load_L1miss, E1_F) {L1D1TagArrayRead, L2TagArrayRead, L2DataArrayRead} {
+ a1_allocateL1D;
+ l11m_profileMiss;
+ f1_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition(E1, C0_Load_L1miss, E1_Es) {L1D0TagArrayRead, L2TagArrayRead, L2DataArrayRead} {
+ a0_allocateL1D;
+ l10m_profileMiss;
+ f0_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition(E1, Ifetch1_L1miss, S1) {L2TagArrayRead, L1ITagArrayRead} {
+ l2m_profileMiss; // permissions miss, still issue RdBlkS
+ l1im_profileMiss;
+ i2_invL2;
+ ai_allocateL1I;
+ a2_allocateL2;
+ i1_invCluster;
+ nS_issueRdBlkS;
+ p_popMandatoryQueue;
+ }
+
+ transition(E1, Ifetch0_L1miss, S0) {L2TagArrayRead,L1ITagArrayRead} {
+ l2m_profileMiss; // permissions miss, still issue RdBlkS
+ l1im_profileMiss;
+ i2_invL2;
+ ai_allocateL1I;
+ a2_allocateL2;
+ i1_invCluster;
+ nS_issueRdBlkS;
+ p_popMandatoryQueue;
+ }
+
+ transition(E1, {C1_Store_L1hit, C1_Store_L1miss}, M1) {L1D1TagArrayRead, L1D1TagArrayWrite, L2TagArrayRead, L2DataArrayWrite, L2TagArrayWrite} {
+ a1_allocateL1D;
+ s1_storeDone;
+ p_popMandatoryQueue;
+ }
+
+ transition(E1, C0_Store_L1miss, M0) {L1D0TagArrayRead, L1D0TagArrayWrite, L2TagArrayRead, L2TagArrayWrite, L2DataArrayWrite} {
+ l10m_profileMiss;
+ a0_allocateL1D;
+ i1_invCluster;
+ s0_storeDone;
+ p_popMandatoryQueue;
+ }
+
+ transition({O}, {C0_Store_L1hit, C0_Store_L1miss}, O_M0) {L1D0TagArrayRead, L2TagArrayRead} {
+ l2m_profileMiss; // permissions miss, still issue CtoD
+ l10m_profileMiss;
+ a0_allocateL1D;
+ i1_invCluster;
+ ii_invIcache;
+ nM_issueRdBlkM;
+ p_popMandatoryQueue;
+ }
+
+ transition({O}, {C1_Store_L1hit, C1_Store_L1miss}, O_M1) {L1D1TagArrayRead, L2TagArrayRead} {
+ l2m_profileMiss; // permissions miss, still issue RdBlkS
+ l11m_profileMiss;
+ a1_allocateL1D;
+ i0_invCluster;
+ ii_invIcache;
+ nM_issueRdBlkM;
+ p_popMandatoryQueue;
+ }
+
+ transition(O, C0_Load_L1miss, O_F0) {L2TagArrayRead, L2DataArrayRead, L1D0TagArrayRead} {
+ l10m_profileMiss;
+ a0_allocateL1D;
+ f0_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition(O, C1_Load_L1miss, O_F1) {L2TagArrayRead, L2DataArrayRead, L1D1TagArrayRead} {
+ l11m_profileMiss;
+ a1_allocateL1D;
+ f1_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition(Ms, C0_Load_L1miss, Ms_F0) {L2TagArrayRead, L2DataArrayRead, L1D0TagArrayRead} {
+ l10m_profileMiss;
+ a0_allocateL1D;
+ f0_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition(Ms, C1_Load_L1miss, Ms_F1) {L2TagArrayRead, L2DataArrayRead, L1D1TagArrayRead} {
+ l11m_profileMiss;
+ a1_allocateL1D;
+ f1_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition({Ms, M0, M1, O}, Ifetch0_L1miss, MO_S0) {L1ITagArrayRead, L2TagArrayRead} {
+ l2m_profileMiss; // permissions miss
+ l1im_profileMiss;
+ ai_allocateL1I;
+ t_allocateTBE;
+ ib_invBothClusters;
+ vd_victim;
+// i2_invL2;
+ p_popMandatoryQueue;
+ }
+
+  // {Ms,M0,M1,O} -> MO_S1: instruction fetch from cluster 1 while dirty;
+  // victimize and refetch shared. Fix: profile the I-cache miss
+  // (l1im_profileMiss) rather than a D0 miss — this is an ifetch, matching
+  // the Ifetch0_L1miss counterpart above. Also add the missing comma in the
+  // resource list.
+  transition({Ms, M0, M1, O}, Ifetch1_L1miss, MO_S1) {L1ITagArrayRead, L2TagArrayRead} {
+    l2m_profileMiss; // permissions miss
+    l1im_profileMiss;
+    ai_allocateL1I;
+    t_allocateTBE;
+    ib_invBothClusters;
+    vd_victim;
+//    i2_invL2;
+    p_popMandatoryQueue;
+  }
+
+ transition(Ms, {C0_Store_L1hit, C0_Store_L1miss}, M0) {L1D0TagArrayRead, L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayRead, L2DataArrayWrite, L2TagArrayWrite} {
+ a0_allocateL1D;
+ i1_invCluster;
+ s0_storeDone;
+ p_popMandatoryQueue;
+ }
+
+ transition(Ms, {C1_Store_L1hit, C1_Store_L1miss}, M1) {L1D1TagArrayRead, L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayRead, L2DataArrayWrite, L2TagArrayWrite} {
+ a1_allocateL1D;
+ i0_invCluster;
+ s1_storeDone;
+ p_popMandatoryQueue;
+ }
+
+ transition(M0, C0_Load_L1miss, M0_F) {L1D0TagArrayRead, L2TagArrayRead, L2DataArrayRead} {
+ l10m_profileMiss;
+ a0_allocateL1D;
+ f0_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition(M0, C1_Load_L1miss, M0_Ms) {L2TagArrayRead, L2DataArrayRead,L1D1TagArrayRead} {
+ l11m_profileMiss;
+ a1_allocateL1D;
+ f1_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition(M0, {C0_Store_L1hit, C0_Store_L1miss}) {L1D0TagArrayRead, L1D0DataArrayWrite, L2DataArrayWrite, L2TagArrayRead} {
+ a0_allocateL1D;
+ s0_storeDone;
+ p_popMandatoryQueue;
+ }
+
+ transition(M0, {C1_Store_L1hit, C1_Store_L1miss}, M1) {L1D0TagArrayRead, L1D0TagArrayWrite, L1D0DataArrayWrite, L2DataArrayWrite, L2TagArrayRead, L2TagArrayWrite} {
+ a1_allocateL1D;
+ i0_invCluster;
+ s1_storeDone;
+ p_popMandatoryQueue;
+ }
+
+ transition(M1, C0_Load_L1miss, M1_Ms) {L2TagArrayRead, L2DataArrayRead, L1D0TagArrayRead} {
+ l10m_profileMiss;
+ a0_allocateL1D;
+ f0_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+  // M1: cluster-1 load miss fills D1 from the dirty L2 copy.
+  // Fix: add the missing comma between resource-list entries for
+  // consistency with every other transition's list.
+  transition(M1, C1_Load_L1miss, M1_F) {L1D1TagArrayRead, L2TagArrayRead, L2DataArrayRead} {
+    l11m_profileMiss;
+    a1_allocateL1D;
+    f1_L2ToL1;
+    p_popMandatoryQueue;
+  }
+
+ transition(M1, {C0_Store_L1hit, C0_Store_L1miss}, M0) {L1D0TagArrayRead, L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayRead, L2DataArrayWrite, L2TagArrayWrite} {
+ a0_allocateL1D;
+ i1_invCluster;
+ s0_storeDone;
+ p_popMandatoryQueue;
+ }
+
+ transition(M1, {C1_Store_L1hit, C1_Store_L1miss}) {L1D1TagArrayRead, L1D1DataArrayWrite, L2TagArrayRead, L2DataArrayWrite} {
+ a1_allocateL1D;
+ s1_storeDone;
+ p_popMandatoryQueue;
+ }
+
+ // end transitions from base
+
+ // Begin simple hit transitions
+ transition({S, Es, E0, O, Ms, M0, O_F1, S_F1, Si_F0, Si_F1, Es_F1, E0_Es,
+ Ms_F1, M0_Ms}, C0_Load_L1hit) {L1D0TagArrayRead, L1D0DataArrayRead} {
+ // track hits, if implemented
+ l0_loadDone;
+ p_popMandatoryQueue;
+ }
+
+ transition({S, Es, E1, O, Ms, M1, O_F0, S_F0, Si_F0, Si_F1, Es_F0, E1_Es,
+ Ms_F0, M1_Ms}, C1_Load_L1hit) {L1D1TagArrayRead, L1D1DataArrayRead} {
+ // track hits, if implemented
+ l1_loadDone;
+ p_popMandatoryQueue;
+ }
+
+ transition({S, S_C, S_F0, S_F1, S_F}, Ifetch0_L1hit) {L1ITagArrayRead, L1IDataArrayRead} {
+ // track hits, if implemented
+ il0_loadDone;
+ p_popMandatoryQueue;
+ }
+
+  // I-cache hit for cluster 1: pure read of the I-cache data array.
+  // Fix: charge L1IDataArrayRead (was ...Write) — an ifetch hit does not
+  // modify the data array; matches the Ifetch0_L1hit transition above.
+  transition({S, S_C, S_F0, S_F1, S_F}, Ifetch1_L1hit) {L1ITagArrayRead, L1IDataArrayRead} {
+    // track hits, if implemented
+    il1_loadDone;
+    p_popMandatoryQueue;
+  }
+
+ // end simple hit transitions
+
+ // Transitions from transient states
+
+ // recycles
+ transition({I_M0, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E0S, I_ES, IF_E0S, IF_ES,
+ IF0_ES, IF1_ES, S_F0, S_F, O_F0, O_F, S_M0, O_M0, Es_F0, Es_F, E0_F,
+ E1_Es, Ms_F0, Ms_F, M0_F, M1_Ms}, C0_Load_L1hit) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({IF_E1S, F_S0, F_S1, ES_I, MO_I, MO_S0, MO_S1, Si_F0, Si_F1, S_M1,
+ O_M1, S0, S1, I_C, S0_C, S1_C, S_C}, C0_Load_L1miss) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({I_M1, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E1S, I_ES, IF_E1S, IF_ES,
+ IF0_ES, IF1_ES, S_F1, S_F, O_F1, O_F, S_M1, O_M1, Es_F1, Es_F, E1_F,
+ E0_Es, Ms_F1, Ms_F, M0_Ms, M1_F}, C1_Load_L1hit) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({IF_E0S, F_S0, F_S1, ES_I, MO_I, MO_S0, MO_S1, Si_F0, Si_F1, S_M0,
+ O_M0, S0, S1, I_C, S0_C, S1_C, S_C}, C1_Load_L1miss) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({F_S0, F_S1, MO_S0, MO_S1, Si_F0, Si_F1, S0, S1, S0_C, S1_C}, {Ifetch0_L1hit, Ifetch1_L1hit}) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({I_M0, I_M1, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E0S, I_E1S, I_ES,
+ IF_E0S, IF_E1S, IF_ES, IF0_ES, IF1_ES, ES_I, MO_I, S_F0, S_F1, S_F,
+ O_F0, O_F1, O_F, S_M0, S_M1, O_M0, O_M1, Es_F0, Es_F1, Es_F, E0_F,
+ E1_F, E0_Es, E1_Es, Ms_F0, Ms_F1, Ms_F, M0_F, M0_Ms, M1_F, M1_Ms, I_C,
+ S_C}, {Ifetch0_L1miss, Ifetch1_L1miss}) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({I_E1S, IF_E1S, F_S0, F_S1, ES_I, MO_I, MO_S0, MO_S1, S_F1, O_F1,
+ Si_F0, Si_F1, S_M1, O_M1, S0, S1, Es_F1, E1_F, E0_Es, Ms_F1, M0_Ms,
+ M1_F, I_C, S0_C, S1_C, S_C}, {C0_Store_L1miss}) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+  // Recycle C1 store misses that arrive while in a transient state the
+  // protocol cannot service yet. Fix: add the missing comma after MO_S1
+  // (present in the symmetric C0_Store_L1miss recycle list above).
+  transition({I_E0S, IF_E0S, F_S0, F_S1, ES_I, MO_I, MO_S0, MO_S1, S_F0, O_F0,
+              Si_F0, Si_F1, S_M0, O_M0, S0, S1, Es_F0, E0_F, E1_Es, Ms_F0, M0_F,
+              M1_Ms, I_C, S0_C, S1_C, S_C}, {C1_Store_L1miss}) {} {
+    zz_recycleMandatoryQueue;
+  }
+
+ transition({I_M0, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E0S, I_ES, IF_E0S, IF_ES,
+ IF0_ES, IF1_ES, S_F0, S_F1, S_F, O_F0, O_F1, O_F, Si_F0, Si_F1, S_M0, O_M0, Es_F0, Es_F1, Es_F, E0_F, E0_Es, E1_Es, Ms_F0, Ms_F1, Ms_F, M0_F, M0_Ms, M1_Ms}, {C0_Store_L1hit}) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({I_M1, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E1S, I_ES, IF_E1S, IF_ES,
+ IF0_ES, IF1_ES, S_F0, S_F1, S_F, O_F0, O_F1, O_F, Si_F0, Si_F1, S_M1,
+ O_M1, Es_F0, Es_F1, Es_F, E1_F, E0_Es, E1_Es, Ms_F0, Ms_F1, Ms_F,
+ M0_Ms, M1_F, M1_Ms}, {C1_Store_L1hit}) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({I_M0, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E0S, I_ES, IF_E0S, IF_ES,
+ IF0_ES, IF1_ES, S_F0, S_F, O_F0, O_F, S_M0, O_M0, Es_F0, Es_F, E0_F,
+ E1_Es, Ms_F0, Ms_F, M0_F, M1_Ms}, L1D0_Repl) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({I_M1, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E1S, I_ES, IF_E1S, IF_ES,
+ IF0_ES, IF1_ES, S_F1, S_F, O_F1, O_F, S_M1, O_M1, Es_F1, Es_F, E1_F,
+ E0_Es, Ms_F1, Ms_F, M0_Ms, M1_F}, L1D1_Repl) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({F_S0, F_S1, MO_S0, MO_S1, Si_F0, Si_F1, S0, S1, S0_C, S1_C}, L1I_Repl) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({S_C, S0_C, S1_C, S0, S1, Si_F0, Si_F1, I_M0, I_M1, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E0S, I_E1S, I_ES, S_F0, S_F1, S_F, O_F0, O_F1, O_F, S_M0, O_M0, S_M1, O_M1, Es_F0, Es_F1, Es_F, E0_F, E1_F, E0_Es, E1_Es, Ms_F0, Ms_F1, Ms_F, M0_F, M0_Ms, M1_F, M1_Ms, MO_S0, MO_S1, IF_E0S, IF_E1S, IF_ES, IF0_ES, IF1_ES, F_S0, F_S1}, L2_Repl) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({IF_E0S, IF_E1S, IF_ES, IF0_ES, IF1_ES, F_S0, F_S1}, {NB_AckS,
+ PrbInvData, PrbInvDataDemand, PrbInv, PrbShrData, PrbShrDataDemand}) {} {
+ zz_recycleMandatoryQueue; // these should be resolved soon, but I didn't want to add more states, though technically they could be solved now, and probes really could be solved but i don't think it's really necessary.
+ }
+
+ transition({IF_E0S, IF_E1S, IF_ES, IF0_ES, IF1_ES}, NB_AckE) {} {
+ zz_recycleMandatoryQueue; // these should be resolved soon, but I didn't want to add more states, though technically they could be solved now, and probes really could be solved but i don't think it's really necessary.
+ }
+
+ transition({E0_Es, E1_F, Es_F1}, C0_Load_L1miss, Es_F) {L2DataArrayRead} {
+ l10m_profileMiss;
+ a0_allocateL1D;
+ f0_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition(S_F1, C0_Load_L1miss, S_F) {L2DataArrayRead} {
+ l10m_profileMiss;
+ a0_allocateL1D;
+ f0_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition(O_F1, C0_Load_L1miss, O_F) {L2DataArrayRead} {
+ l10m_profileMiss;
+ a0_allocateL1D;
+ f0_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition({Ms_F1, M0_Ms, M1_F}, C0_Load_L1miss, Ms_F) {L2DataArrayRead} {
+ l10m_profileMiss;
+ a0_allocateL1D;
+ f0_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition(I_M0, C1_Load_L1miss, I_M0Ms){
+ l11m_profileMiss;
+ l2m_profileMiss;
+ a1_allocateL1D;
+ p_popMandatoryQueue;
+ }
+
+ transition(I_M1, C0_Load_L1miss, I_M1Ms){
+ l10m_profileMiss;
+ l2m_profileMiss;
+ a0_allocateL1D;
+ p_popMandatoryQueue;
+ }
+
+ transition(I_M0, C1_Store_L1miss, I_M0M1) {
+ l11m_profileMiss;
+ l2m_profileMiss;
+ a1_allocateL1D;
+ p_popMandatoryQueue;
+ }
+
+  // I_M1 -> I_M1M0: cluster-0 store arrives while cluster 1's store is
+  // outstanding. Fix: also profile the D0 miss (l10m_profileMiss) for
+  // consistency with the symmetric I_M0 / C1_Store_L1miss transition.
+  transition(I_M1, C0_Store_L1miss, I_M1M0) {L1D0TagArrayRead, L1D0TagArrayWrite, L2TagArrayRead, L2TagArrayWrite} {
+    l10m_profileMiss;
+    l2m_profileMiss;
+    a0_allocateL1D;
+    p_popMandatoryQueue;
+  }
+
+ transition(I_E0S, C1_Load_L1miss, I_ES) {} {
+ l2m_profileMiss;
+ l11m_profileMiss;
+ a1_allocateL1D;
+ p_popMandatoryQueue;
+ }
+
+  // I_E1S -> I_ES: cluster-0 load joins the outstanding cluster-1 load.
+  // Fix: l2m_profileMiss was listed twice, double-counting the L2 miss in
+  // the stats; the symmetric I_E0S / C1_Load_L1miss transition counts once.
+  transition(I_E1S, C0_Load_L1miss, I_ES) {} {
+    l2m_profileMiss;
+    l10m_profileMiss;
+    a0_allocateL1D;
+    p_popMandatoryQueue;
+  }
+
+ transition({E1_Es, E0_F, Es_F0}, C1_Load_L1miss, Es_F) {L2DataArrayRead} {
+ l11m_profileMiss;
+ a1_allocateL1D;
+ f1_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition(S_F0, C1_Load_L1miss, S_F) { L2DataArrayRead} {
+ l11m_profileMiss;
+ a1_allocateL1D;
+ f1_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition(O_F0, C1_Load_L1miss, O_F) {L2DataArrayRead} {
+ l11m_profileMiss;
+ a1_allocateL1D;
+ f1_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition({Ms_F0, M1_Ms, M0_F}, C1_Load_L1miss, Ms_F) {L2DataArrayRead} {
+ l11m_profileMiss;
+ a1_allocateL1D;
+ f1_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition({S, Es, E0, O, Ms, M0, O_F1, S_F1, Si_F0, Si_F1, Es_F1, E0_Es, Ms_F1, M0_Ms}, L1D0_Repl) {L1D0TagArrayRead} {
+ i0_invCluster;
+ }
+
+ transition({S, Es, E1, O, Ms, M1, O_F0, S_F0, Si_F0, Si_F1, Es_F0, E1_Es, Ms_F0, M1_Ms}, L1D1_Repl) {L1D1TagArrayRead} {
+ i1_invCluster;
+ }
+
+ transition({S, S_C, S_F0, S_F1}, L1I_Repl) {L1ITagArrayRead} {
+ ii_invIcache;
+ }
+
+ transition({S, E0, E1, Es}, L2_Repl, ES_I) {L2TagArrayRead,L1D0TagArrayRead, L1D1TagArrayRead, L1ITagArrayRead} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ t_allocateTBE;
+ vc_victim;
+ ib_invBothClusters;
+ i2_invL2;
+ ii_invIcache;
+ }
+
+ transition({Ms, M0, M1, O}, L2_Repl, MO_I) {L2TagArrayRead, L2TagArrayWrite, L1D0TagArrayRead, L1D1TagArrayRead} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ t_allocateTBE;
+ vd_victim;
+ i2_invL2;
+ ib_invBothClusters; // nothing will happen for D0 on M1, vice versa
+ }
+
+  // S0 -> S: shared data arrives for the cluster-0 ifetch; write the
+  // I-cache, complete the fetch, unblock the directory.
+  // NOTE(review): the resource list charges the D0 arrays although the
+  // action writes the I-cache (wi_writeIcache) — confirm this is intended.
+  transition(S0, NB_AckS, S) {L1D0DataArrayWrite, L1D0TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} {
+    wi_writeIcache;
+    xi0_loadDone;
+    uu_sendUnblock;
+    sdv_sendDoneValid;
+    pr_popResponseQueue;
+  }
+
+  // S1 -> S: same as above for the cluster-1 ifetch.
+  transition(S1, NB_AckS, S) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} {
+    wi_writeIcache;
+    xi1_loadDone;
+    sdv_sendDoneValid;
+    uu_sendUnblock;
+    pr_popResponseQueue;
+  }
+
+ transition(S0_C, NB_AckS, S_C) { L1IDataArrayWrite,L2DataArrayWrite} {
+ // does not need send done since the rdblks was "sinked"
+ wi_writeIcache;
+ xi0_loadDone;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+  // S1_C -> S_C: stale-ack path; still fill the I-cache for the fetch.
+  // Does not need send-done since the RdBlkS was "sinked" (see S0_C above).
+  // Fix: charge L1IDataArrayWrite (was L1D1DataArrayWrite) — the action
+  // writes the I-cache, matching the S0_C counterpart.
+  transition(S1_C, NB_AckS, S_C) {L1IDataArrayWrite, L2DataArrayWrite} {
+    wi_writeIcache;
+    xi1_loadDone;
+    uu_sendUnblock;
+    pr_popResponseQueue;
+  }
+
+ transition(I_M0, NB_AckM, M0) { L1D0DataArrayWrite, L1D0TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} {
+ w0_writeDcache;
+ xs0_storeDone;
+ sdv_sendDoneValid;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition(I_M1, NB_AckM, M1) {L1D1DataArrayWrite, L1D1TagArrayWrite,L2DataArrayWrite, L2TagArrayWrite} {
+ w1_writeDcache;
+ xs1_storeDone;
+ sdv_sendDoneValid;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ // THESE MO->M1 should not be instantaneous but oh well for now.
+ transition(I_M0M1, NB_AckM, M1) {L1D1DataArrayWrite, L1D1TagArrayWrite,L2DataArrayWrite, L2TagArrayWrite} {
+ w0_writeDcache;
+ xs0_storeDone;
+ sdv_sendDoneValid;
+ uu_sendUnblock;
+ i0_invCluster;
+ s1_storeDone;
+ pr_popResponseQueue;
+ }
+
+ transition(I_M1M0, NB_AckM, M0) {L1D0DataArrayWrite, L1D0TagArrayWrite,L2DataArrayWrite, L2TagArrayWrite} {
+ w1_writeDcache;
+ xs1_storeDone;
+ sdv_sendDoneValid;
+ uu_sendUnblock;
+ i1_invCluster;
+ s0_storeDone;
+ pr_popResponseQueue;
+ }
+
+ // Above shoudl be more like this, which has some latency to xfer to L1
+ transition(I_M0Ms, NB_AckM, M0_Ms) {L1D0DataArrayWrite,L2DataArrayWrite} {
+ w0_writeDcache;
+ xs0_storeDone;
+ sdv_sendDoneValid;
+ uu_sendUnblock;
+ f1_L2ToL1;
+ pr_popResponseQueue;
+ }
+
+ transition(I_M1Ms, NB_AckM, M1_Ms) {L1D1DataArrayWrite,L2DataArrayWrite} {
+ w1_writeDcache;
+ xs1_storeDone;
+ sdv_sendDoneValid;
+ uu_sendUnblock;
+ f0_L2ToL1;
+ pr_popResponseQueue;
+ }
+
+ transition(I_E0S, NB_AckE, E0) {L1D0DataArrayWrite, L2DataArrayWrite, L2TagArrayWrite} {
+ w0_writeDcache;
+ xl0_loadDone;
+ sdv_sendDoneValid;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition(I_E1S, NB_AckE, E1) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} {
+ w1_writeDcache;
+ xl1_loadDone;
+ sdv_sendDoneValid;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition(I_ES, NB_AckE, Es) {L1D1DataArrayWrite, L1D1TagArrayWrite, L1D0DataArrayWrite, L1D0TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite } {
+ w0_writeDcache;
+ xl0_loadDone;
+ w1_writeDcache;
+ xl1_loadDone;
+ sdv_sendDoneValid;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition(I_E0S, NB_AckS, S) {L1D0DataArrayWrite, L1D0TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} {
+ w0_writeDcache;
+ xl0_loadDone;
+ sdv_sendDoneValid;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition(I_E1S, NB_AckS, S) {L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayWrite, L2DataArrayWrite} {
+ w1_writeDcache;
+ xl1_loadDone;
+ sdv_sendDoneValid;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition(I_ES, NB_AckS, S) {L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayWrite, L2DataArrayWrite} {
+ w0_writeDcache;
+ xl0_loadDone;
+ w1_writeDcache;
+ xl1_loadDone;
+ sdv_sendDoneValid;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition(S_F0, L2_to_L1D0, S) {L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ c0_copyL2ToL1;
+ l0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(S_F1, L2_to_L1D1, S) {L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ c1_copyL2ToL1;
+ l1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(Si_F0, L2_to_L1I, S) {L1ITagArrayWrite, L1IDataArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ ci_copyL2ToL1;
+ il0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(Si_F1, L2_to_L1I, S) {L1ITagArrayWrite, L1IDataArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ ci_copyL2ToL1;
+ il1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(S_F, L2_to_L1D0, S_F1) { L1D0DataArrayWrite, L2DataArrayRead} {
+ c0_copyL2ToL1;
+ l0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(S_F, L2_to_L1D1, S_F0) { L1D1DataArrayWrite, L2DataArrayRead} {
+ c1_copyL2ToL1;
+ l1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(O_F0, L2_to_L1D0, O) { L1D0DataArrayWrite, L1D0TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ c0_copyL2ToL1;
+ l0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(O_F1, L2_to_L1D1, O) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ c1_copyL2ToL1;
+ l1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(O_F, L2_to_L1D0, O_F1) { L1D0DataArrayWrite, L2DataArrayRead} {
+ c0_copyL2ToL1;
+ l0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(O_F, L2_to_L1D1, O_F0) { L1D1DataArrayWrite, L2DataArrayRead} {
+ c1_copyL2ToL1;
+ l1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(M1_F, L2_to_L1D1, M1) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ c1_copyL2ToL1;
+ l1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(M0_F, L2_to_L1D0, M0) {L1D0DataArrayWrite, L1D0TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ c0_copyL2ToL1;
+ l0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(Ms_F0, L2_to_L1D0, Ms) {L1D0DataArrayWrite, L1D0TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ c0_copyL2ToL1;
+ l0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(Ms_F1, L2_to_L1D1, Ms) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ c1_copyL2ToL1;
+ l1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(Ms_F, L2_to_L1D0, Ms_F1) {L1D0DataArrayWrite, L2DataArrayRead} {
+ c0_copyL2ToL1;
+ l0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+  // Ms_F -> Ms_F0: the cluster-1 half of the double fill completes.
+  // Fix: charge L1D1DataArrayWrite (was L1IDataArrayWrite) — this fills the
+  // cluster-1 D-cache, matching the symmetric L2_to_L1D0 transition above.
+  transition(Ms_F, L2_to_L1D1, Ms_F0) {L1D1DataArrayWrite, L2DataArrayRead} {
+    c1_copyL2ToL1;
+    l1_loadDone;
+    pt_popTriggerQueue;
+  }
+
+ transition(M1_Ms, L2_to_L1D0, Ms) {L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ c0_copyL2ToL1;
+ l0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(M0_Ms, L2_to_L1D1, Ms) {L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ c1_copyL2ToL1;
+ l1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(Es_F0, L2_to_L1D0, Es) {L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ c0_copyL2ToL1;
+ l0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(Es_F1, L2_to_L1D1, Es) {L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ c1_copyL2ToL1;
+ l1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(Es_F, L2_to_L1D0, Es_F1) {L2TagArrayRead, L2DataArrayRead} {
+ c0_copyL2ToL1;
+ l0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(Es_F, L2_to_L1D1, Es_F0) {L2TagArrayRead, L2DataArrayRead} {
+ c1_copyL2ToL1;
+ l1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(E0_F, L2_to_L1D0, E0) {L2TagArrayRead, L2DataArrayRead} {
+ c0_copyL2ToL1;
+ l0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(E1_F, L2_to_L1D1, E1) {L2TagArrayRead, L2DataArrayRead} {
+ c1_copyL2ToL1;
+ l1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(E1_Es, L2_to_L1D0, Es) {L2TagArrayRead, L2DataArrayRead} {
+ c0_copyL2ToL1;
+ l0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(E0_Es, L2_to_L1D1, Es) {L2TagArrayRead, L2DataArrayRead} {
+ c1_copyL2ToL1;
+ l1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(IF_E0S, L2_to_L1D0, I_E0S) {} {
+ pt_popTriggerQueue;
+ }
+
+ transition(IF_E1S, L2_to_L1D1, I_E1S) {} {
+ pt_popTriggerQueue;
+ }
+
+ transition(IF_ES, L2_to_L1D0, IF1_ES) {} {
+ pt_popTriggerQueue;
+ }
+
+ transition(IF_ES, L2_to_L1D1, IF0_ES) {} {
+ pt_popTriggerQueue;
+ }
+
+ transition(IF0_ES, L2_to_L1D0, I_ES) {} {
+ pt_popTriggerQueue;
+ }
+
+ transition(IF1_ES, L2_to_L1D1, I_ES) {} {
+ pt_popTriggerQueue;
+ }
+
+ transition(F_S0, L2_to_L1I, S0) {} {
+ pt_popTriggerQueue;
+ }
+
+ transition(F_S1, L2_to_L1I, S1) {} {
+ pt_popTriggerQueue;
+ }
+
+ transition({S_M0, O_M0}, NB_AckM, M0) {L1D0TagArrayWrite, L1D0DataArrayWrite, L2DataArrayWrite, L2TagArrayWrite} {
+ xs0_storeDone;
+ sdv_sendDoneValid;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition({S_M1, O_M1}, NB_AckM, M1) {L1D1TagArrayWrite, L1D1DataArrayWrite, L2DataArrayWrite, L2TagArrayWrite} {
+ xs1_storeDone;
+ sdv_sendDoneValid;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition(MO_I, NB_AckWB, I) {L2TagArrayWrite} {
+ wb_data;
+ ra_sendReplAck;
+ sdi_sendDoneInvalid;
+ d_deallocateTBE;
+ pr_popResponseQueue;
+ }
+
+ transition(ES_I, NB_AckWB, I) {L2TagArrayWrite} {
+ wb_data;
+ ra_sendReplAck;
+ sdi_sendDoneInvalid;
+ d_deallocateTBE;
+ pr_popResponseQueue;
+ }
+
+ transition(MO_S0, NB_AckWB, S0) {L2TagArrayWrite} {
+ wb_data;
+ i2_invL2;
+ a2_allocateL2;
+ sdv_sendDoneValid;
+ nS_issueRdBlkS;
+ d_deallocateTBE; // FOO
+ pr_popResponseQueue;
+ }
+
+ transition(MO_S1, NB_AckWB, S1) {L2TagArrayWrite} {
+ wb_data;
+ i2_invL2;
+ a2_allocateL2;
+ sdv_sendDoneValid;
+ nS_issueRdBlkS;
+ d_deallocateTBE; // FOO
+ pr_popResponseQueue;
+ }
+
+ // Writeback cancel "ack"
+ transition(I_C, NB_AckWB, I) {L2TagArrayWrite} {
+ ss_sendStaleNotification;
+ sdi_sendDoneInvalid;
+ d_deallocateTBE;
+ pr_popResponseQueue;
+ }
+
+ transition(S0_C, NB_AckWB, S0) {L2TagArrayWrite} {
+ ss_sendStaleNotification;
+ sdv_sendDoneValid;
+ pr_popResponseQueue;
+ }
+
+ transition(S1_C, NB_AckWB, S1) {L2TagArrayWrite} {
+ ss_sendStaleNotification;
+ sdv_sendDoneValid;
+ pr_popResponseQueue;
+ }
+
+ transition(S_C, NB_AckWB, S) {L2TagArrayWrite} {
+ ss_sendStaleNotification;
+ sdv_sendDoneValid;
+ pr_popResponseQueue;
+ }
+
+ // Begin Probe Transitions
+
+ transition({Ms, M0, M1, O}, {PrbInvData, PrbInvDataDemand}, I) {L2TagArrayRead, L2TagArrayWrite, L2DataArrayRead} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pd_sendProbeResponseData;
+ i2_invL2;
+ ib_invBothClusters;
+ pp_popProbeQueue;
+ }
+
+ transition({Es, E0, E1, S, I}, {PrbInvData, PrbInvDataDemand}, I) {L2TagArrayRead, L2TagArrayWrite} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pi_sendProbeResponseInv;
+ i2_invL2;
+ ib_invBothClusters;
+ ii_invIcache; // only relevant for S
+ pp_popProbeQueue;
+ }
+
+ transition(S_C, {PrbInvData, PrbInvDataDemand}, I_C) {L2TagArrayWrite} {
+ t_allocateTBE;
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pi_sendProbeResponseInv;
+ i2_invL2;
+ ib_invBothClusters;
+ ii_invIcache;
+ pp_popProbeQueue;
+ }
+
+ transition(I_C, {PrbInvData, PrbInvDataDemand}, I_C) {} {
+ pi_sendProbeResponseInv;
+ ib_invBothClusters;
+ pp_popProbeQueue;
+ }
+
+ transition({Ms, M0, M1, O, Es, E0, E1, S, I}, PrbInv, I) {L2TagArrayRead, L2TagArrayWrite} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pi_sendProbeResponseInv;
+ i2_invL2; // nothing will happen in I
+ ib_invBothClusters;
+ ii_invIcache;
+ pp_popProbeQueue;
+ }
+
+ transition(S_C, PrbInv, I_C) {L2TagArrayWrite} {
+ t_allocateTBE;
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pi_sendProbeResponseInv;
+ i2_invL2;
+ ib_invBothClusters;
+ ii_invIcache;
+ pp_popProbeQueue;
+ }
+
+ transition(I_C, PrbInv, I_C) {} {
+ pi_sendProbeResponseInv;
+ ib_invBothClusters;
+ ii_invIcache;
+ pp_popProbeQueue;
+ }
+
+ transition({Ms, M0, M1, O}, {PrbShrData, PrbShrDataDemand}, O) {L2TagArrayRead, L2TagArrayWrite, L2DataArrayRead} {
+ pd_sendProbeResponseData;
+ pp_popProbeQueue;
+ }
+
+ transition({Es, E0, E1, S}, {PrbShrData, PrbShrDataDemand}, S) {L2TagArrayRead, L2TagArrayWrite} {
+ ph_sendProbeResponseHit;
+ pp_popProbeQueue;
+ }
+
+ transition(S_C, {PrbShrData, PrbShrDataDemand}) {} {
+ ph_sendProbeResponseHit;
+ pp_popProbeQueue;
+ }
+
+ transition({I, I_C}, {PrbShrData, PrbShrDataDemand}) {L2TagArrayRead} {
+ pb_sendProbeResponseBackprobe;
+ pp_popProbeQueue;
+ }
+
+ transition({I_M0, I_E0S}, {PrbInv, PrbInvData, PrbInvDataDemand}) {} {
+ pi_sendProbeResponseInv;
+ ib_invBothClusters; // must invalidate current data (only relevant for I_M0)
+ a0_allocateL1D; // but make sure there is room for incoming data when it arrives
+ pp_popProbeQueue;
+ }
+
+ transition({I_M1, I_E1S}, {PrbInv, PrbInvData, PrbInvDataDemand}) {} {
+ pi_sendProbeResponseInv;
+ ib_invBothClusters; // must invalidate current data (only relevant for I_M1)
+ a1_allocateL1D; // but make sure there is room for incoming data when it arrives
+ pp_popProbeQueue;
+ }
+
+ transition({I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_ES}, {PrbInv, PrbInvData, PrbInvDataDemand, PrbShrData, PrbShrDataDemand}) {} {
+ pi_sendProbeResponseInv;
+ ib_invBothClusters;
+ a0_allocateL1D;
+ a1_allocateL1D;
+ pp_popProbeQueue;
+ }
+
+ transition({I_M0, I_E0S, I_M1, I_E1S}, {PrbShrData, PrbShrDataDemand}) {} {
+ pb_sendProbeResponseBackprobe;
+ pp_popProbeQueue;
+ }
+
+ transition(ES_I, {PrbInvData, PrbInvDataDemand}, I_C) {} {
+ pi_sendProbeResponseInv;
+ ib_invBothClusters;
+ ii_invIcache;
+ pp_popProbeQueue;
+ }
+
+ transition(MO_I, {PrbInvData, PrbInvDataDemand}, I_C) {} {
+ pdt_sendProbeResponseDataFromTBE;
+ ib_invBothClusters;
+ ii_invIcache;
+ pp_popProbeQueue;
+ }
+
+ transition(MO_I, PrbInv, I_C) {} {
+ pi_sendProbeResponseInv;
+ ib_invBothClusters;
+ ii_invIcache;
+ pp_popProbeQueue;
+ }
+
+ transition(ES_I, PrbInv, I_C) {} {
+ pi_sendProbeResponseInv;
+ ib_invBothClusters;
+ ii_invIcache;
+ pp_popProbeQueue;
+ }
+
+ transition(ES_I, {PrbShrData, PrbShrDataDemand}, ES_I) {} {
+ ph_sendProbeResponseHit;
+ s_setSharedFlip;
+ pp_popProbeQueue;
+ }
+
+ transition(MO_I, {PrbShrData, PrbShrDataDemand}, MO_I) {} {
+ pdt_sendProbeResponseDataFromTBE;
+ s_setSharedFlip;
+ pp_popProbeQueue;
+ }
+
+ transition(MO_S0, {PrbInvData, PrbInvDataDemand}, S0_C) {L2TagArrayWrite} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pdt_sendProbeResponseDataFromTBE;
+ i2_invL2;
+ a2_allocateL2;
+ nS_issueRdBlkS;
+ d_deallocateTBE;
+ pp_popProbeQueue;
+ }
+
+ transition(MO_S1, {PrbInvData, PrbInvDataDemand}, S1_C) {} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pdt_sendProbeResponseDataFromTBE;
+ i2_invL2;
+ a2_allocateL2;
+ nS_issueRdBlkS;
+ d_deallocateTBE;
+ pp_popProbeQueue;
+ }
+
+ transition(MO_S0, PrbInv, S0_C) {L2TagArrayWrite} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pi_sendProbeResponseInv;
+ i2_invL2;
+ a2_allocateL2;
+ nS_issueRdBlkS;
+ d_deallocateTBE;
+ pp_popProbeQueue;
+ }
+
+ transition(MO_S1, PrbInv, S1_C) {L2TagArrayWrite} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pi_sendProbeResponseInv;
+ i2_invL2;
+ a2_allocateL2;
+ nS_issueRdBlkS;
+ d_deallocateTBE;
+ pp_popProbeQueue;
+ }
+
+ transition({MO_S0, MO_S1}, {PrbShrData, PrbShrDataDemand}) {} {
+ pdt_sendProbeResponseDataFromTBE;
+ s_setSharedFlip;
+ pp_popProbeQueue;
+ }
+
+  // ---- Invalidating probes that race with an in-flight demand fill ----
+  // Everything local is invalidated, room is re-allocated for the fill, and
+  // the read is reissued; the IF_* / F_* states wait for the fresh data.
+  transition({S_F0, Es_F0, E0_F, E1_Es}, {PrbInvData, PrbInvDataDemand, PrbInv}, IF_E0S) {} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pi_sendProbeResponseInv;
+    // invalidate everything you've got
+    ib_invBothClusters;
+    ii_invIcache;
+    i2_invL2;
+    // but make sure you have room for what you need from the fill
+    a0_allocateL1D;
+    a2_allocateL2;
+    n_issueRdBlk;
+    pp_popProbeQueue;
+  }
+
+  // Cluster-1 mirror of the transition above.
+  transition({S_F1, Es_F1, E1_F, E0_Es}, {PrbInvData, PrbInvDataDemand, PrbInv}, IF_E1S) {} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pi_sendProbeResponseInv;
+    // invalidate everything you've got
+    ib_invBothClusters;
+    ii_invIcache;
+    i2_invL2;
+    // but make sure you have room for what you need from the fill
+    a1_allocateL1D;
+    a2_allocateL2;
+    n_issueRdBlk;
+    pp_popProbeQueue;
+  }
+
+  // Both clusters are filling: re-allocate both L1Ds.
+  transition({S_F, Es_F}, {PrbInvData, PrbInvDataDemand, PrbInv}, IF_ES) {} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pi_sendProbeResponseInv;
+    // invalidate everything you've got
+    ib_invBothClusters;
+    ii_invIcache;
+    i2_invL2;
+    // but make sure you have room for what you need from the fill
+    a0_allocateL1D;
+    a1_allocateL1D;
+    a2_allocateL2;
+    n_issueRdBlk;
+    pp_popProbeQueue;
+  }
+
+  // Instruction-fetch fills: reissue as RdBlkS and re-allocate the L1I.
+  transition(Si_F0, {PrbInvData, PrbInvDataDemand, PrbInv}, F_S0) {} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pi_sendProbeResponseInv;
+    ib_invBothClusters;
+    ii_invIcache;
+    i2_invL2;
+    ai_allocateL1I;
+    a2_allocateL2;
+    nS_issueRdBlkS;
+    pp_popProbeQueue;
+  }
+
+  transition(Si_F1, {PrbInvData, PrbInvDataDemand, PrbInv}, F_S1) {} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pi_sendProbeResponseInv;
+    ib_invBothClusters;
+    ii_invIcache;
+    i2_invL2;
+    ai_allocateL1I;
+    a2_allocateL2;
+    nS_issueRdBlkS;
+    pp_popProbeQueue;
+  }
+
+  // ---- Sharing probes during a fill: clean hit response, demote E->S ----
+  transition({Es_F0, E0_F, E1_Es}, {PrbShrData, PrbShrDataDemand}, S_F0) {} {
+    ph_sendProbeResponseHit;
+    pp_popProbeQueue;
+  }
+
+  transition({Es_F1, E1_F, E0_Es}, {PrbShrData, PrbShrDataDemand}, S_F1) {} {
+    ph_sendProbeResponseHit;
+    pp_popProbeQueue;
+  }
+
+  transition(Es_F, {PrbShrData, PrbShrDataDemand}, S_F) {} {
+    ph_sendProbeResponseHit;
+    pp_popProbeQueue;
+  }
+
+  // Already shared: nothing to demote, just acknowledge the probe.
+  transition({S_F0, S_F1, S_F, Si_F0, Si_F1}, {PrbShrData, PrbShrDataDemand}) {} {
+    ph_sendProbeResponseHit;
+    pp_popProbeQueue;
+  }
+
+  // ---- Probes racing an in-flight upgrade-to-M (S_M* / O_M*) ----
+  // The upgrade request is already outstanding, so only local copies are
+  // invalidated; I_M0 / I_M1 waits for the modified data to arrive.
+  transition(S_M0, {PrbInvData, PrbInvDataDemand}, I_M0) {} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pim_sendProbeResponseInvMs;   // inv ack, flagged as pending-M requester
+    ib_invBothClusters;
+    ii_invIcache;
+    i2_invL2;
+    a0_allocateL1D;
+    a2_allocateL2;
+    pp_popProbeQueue;
+  }
+
+  // Owner variant: must source the dirty data with the response.
+  transition(O_M0, {PrbInvData, PrbInvDataDemand}, I_M0) {L2DataArrayRead} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pdm_sendProbeResponseDataMs;
+    ib_invBothClusters;
+    ii_invIcache;
+    i2_invL2;
+    a0_allocateL1D;
+    a2_allocateL2;
+    pp_popProbeQueue;
+  }
+
+  transition({S_M0, O_M0}, {PrbInv}, I_M0) {} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pim_sendProbeResponseInvMs;
+    ib_invBothClusters;
+    ii_invIcache;
+    i2_invL2;
+    a0_allocateL1D;
+    a2_allocateL2;
+    pp_popProbeQueue;
+  }
+
+  // Cluster-1 mirrors of the three transitions above.
+  transition(S_M1, {PrbInvData, PrbInvDataDemand}, I_M1) {} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pim_sendProbeResponseInvMs;
+    ib_invBothClusters;
+    ii_invIcache;
+    i2_invL2;
+    a1_allocateL1D;
+    a2_allocateL2;
+    pp_popProbeQueue;
+  }
+
+  // NOTE(review): O_M0 above charges L2DataArrayRead but this mirror does
+  // not — confirm whether the omission is intentional.
+  transition(O_M1, {PrbInvData, PrbInvDataDemand}, I_M1) {} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pdm_sendProbeResponseDataMs;
+    ib_invBothClusters;
+    ii_invIcache;
+    i2_invL2;
+    a1_allocateL1D;
+    a2_allocateL2;
+    pp_popProbeQueue;
+  }
+
+  transition({S_M1, O_M1}, {PrbInv}, I_M1) {} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pim_sendProbeResponseInvMs;
+    ib_invBothClusters;
+    ii_invIcache;
+    i2_invL2;
+    a1_allocateL1D;
+    a2_allocateL2;
+    pp_popProbeQueue;
+  }
+
+  // ---- Probes while an instruction-fetch shared fill is pending ----
+  // Invalidate local copies and stay in the same state; the outstanding
+  // RdBlkS fill will deliver fresh data.
+  transition({S0, S0_C}, {PrbInvData, PrbInvDataDemand, PrbInv}) {} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pi_sendProbeResponseInv;
+    ib_invBothClusters;
+    ii_invIcache;
+    i2_invL2;
+    ai_allocateL1I;
+    a2_allocateL2;
+    pp_popProbeQueue;
+  }
+
+  transition({S1, S1_C}, {PrbInvData, PrbInvDataDemand, PrbInv}) {} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pi_sendProbeResponseInv;
+    ib_invBothClusters;
+    ii_invIcache;
+    i2_invL2;
+    ai_allocateL1I;
+    a2_allocateL2;
+    pp_popProbeQueue;
+  }
+
+  // Sharing probes during an upgrade: S holders hit clean ...
+  transition({S_M0, S_M1}, {PrbShrData, PrbShrDataDemand}) {} {
+    ph_sendProbeResponseHit;
+    pp_popProbeQueue;
+  }
+
+  // ... owners source the dirty data ...
+  transition({O_M0, O_M1}, {PrbShrData, PrbShrDataDemand}) {L2DataArrayRead} {
+    pd_sendProbeResponseData;
+    pp_popProbeQueue;
+  }
+
+  // ... and pending ifetch fills answer with a backprobe response.
+  transition({S0, S1, S0_C, S1_C}, {PrbShrData, PrbShrDataDemand}) {} {
+    pb_sendProbeResponseBackprobe;
+    pp_popProbeQueue;
+  }
+
+  // ---- Invalidating probes racing a fill while M/O data is held ----
+  // Dirty data is returned with the response, local copies are dropped, and
+  // the demand read is reissued.
+  transition({Ms_F0, M0_F, M1_Ms, O_F0}, {PrbInvData, PrbInvDataDemand}, IF_E0S) {L2DataArrayRead} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pd_sendProbeResponseData;
+    ib_invBothClusters;
+    i2_invL2;
+    a0_allocateL1D;
+    a2_allocateL2;
+    n_issueRdBlk;
+    pp_popProbeQueue;
+  }
+
+  transition({Ms_F1, M1_F, M0_Ms, O_F1}, {PrbInvData, PrbInvDataDemand}, IF_E1S) {L2DataArrayRead} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pd_sendProbeResponseData;
+    ib_invBothClusters;
+    i2_invL2;
+    a1_allocateL1D;
+    a2_allocateL2;
+    n_issueRdBlk;
+    pp_popProbeQueue;
+  }
+
+  transition({Ms_F, O_F}, {PrbInvData, PrbInvDataDemand}, IF_ES) {L2DataArrayRead} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pd_sendProbeResponseData;
+    ib_invBothClusters;
+    i2_invL2;
+    a0_allocateL1D;
+    a1_allocateL1D;
+    a2_allocateL2;
+    n_issueRdBlk;
+    pp_popProbeQueue;
+  }
+
+  // Dataless PrbInv variants: same flow, inv-only response, no L2 data read.
+  transition({Ms_F0, M0_F, M1_Ms, O_F0}, PrbInv, IF_E0S) {} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pi_sendProbeResponseInv;
+    ib_invBothClusters;
+    i2_invL2;
+    a0_allocateL1D;
+    a2_allocateL2;
+    n_issueRdBlk;
+    pp_popProbeQueue;
+  }
+
+  transition({Ms_F1, M1_F, M0_Ms, O_F1}, PrbInv, IF_E1S) {} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pi_sendProbeResponseInv;
+    ib_invBothClusters;
+    i2_invL2;
+    a1_allocateL1D;
+    a2_allocateL2;
+    n_issueRdBlk;
+    pp_popProbeQueue;
+  }
+
+  transition({Ms_F, O_F}, PrbInv, IF_ES) {} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pi_sendProbeResponseInv;
+    ib_invBothClusters;
+    i2_invL2;
+    a0_allocateL1D;
+    a1_allocateL1D;
+    a2_allocateL2;
+    n_issueRdBlk;
+    pp_popProbeQueue;
+  }
+
+  // Sharing probe while M data is filling for cluster 0: source the dirty
+  // data and demote to owned (O_F0).
+  transition({Ms_F0, M0_F, M1_Ms}, {PrbShrData, PrbShrDataDemand}, O_F0) {L2DataArrayRead} {
+    pd_sendProbeResponseData;
+    pp_popProbeQueue;
+  }
+
+  // Cluster-1 mirror of the {Ms_F0, M0_F, M1_Ms} sharing-probe transition.
+  // NOTE(review): this transition's action list was empty, which would leave
+  // the probe unanswered and the probe queue unpopped (stalling every later
+  // probe) if it ever fired.  Made it match the F0 and Ms_F variants: source
+  // the dirty data and pop the queue, charging an L2 data-array read.
+  transition({Ms_F1, M1_F, M0_Ms}, {PrbShrData, PrbShrDataDemand}, O_F1) {L2DataArrayRead} {
+    pd_sendProbeResponseData;
+    pp_popProbeQueue;
+  }
+
+  // Both-cluster fill variant: demote M -> O, sourcing the data.
+  transition({Ms_F}, {PrbShrData, PrbShrDataDemand}, O_F) {L2DataArrayRead} {
+    pd_sendProbeResponseData;
+    pp_popProbeQueue;
+  }
+
+  // Already owned: keep state, just source data for the sharer.
+  transition({O_F0, O_F1, O_F}, {PrbShrData, PrbShrDataDemand}) {L2DataArrayRead} {
+    pd_sendProbeResponseData;
+    pp_popProbeQueue;
+  }
+
+ // END TRANSITIONS
+}
+
+
diff --git a/src/mem/protocol/MOESI_AMD_Base-Region-dir.sm b/src/mem/protocol/MOESI_AMD_Base-Region-dir.sm
new file mode 100644
index 000000000..52d87fb8b
--- /dev/null
+++ b/src/mem/protocol/MOESI_AMD_Base-Region-dir.sm
@@ -0,0 +1,2038 @@
+/*
+ * Copyright (c) 2010-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Lisa Hsu
+ */
+
+// Region-protocol home directory: AMD_Base-like directory with an attached
+// shared L3, talking to the cores on vnets 0/2/4 and to the region
+// buffer/directory on vnets 4/5/7.
+machine(MachineType:Directory, "AMD_Base-like protocol")
+: DirectoryMemory * directory;
+  CacheMemory * L3CacheMemory;
+  Cycles response_latency := 5;              // latency for responses to cores
+  Cycles response_latency_regionDir := 1;    // latency toward the region dir
+  Cycles l3_hit_latency := 30;
+  bool useL3OnWT := "False";                 // write-throughs also fill L3?
+  Cycles to_memory_controller_latency := 1;
+
+  // From the Cores
+  MessageBuffer * requestFromCores, network="From", virtual_network="0", vnet_type="request";
+  MessageBuffer * responseFromCores, network="From", virtual_network="2", vnet_type="response";
+  MessageBuffer * unblockFromCores, network="From", virtual_network="4", vnet_type="unblock";
+
+  // To the Cores
+  MessageBuffer * probeToCore, network="To", virtual_network="0", vnet_type="request";
+  MessageBuffer * responseToCore, network="To", virtual_network="2", vnet_type="response";
+
+  // From region buffer
+  MessageBuffer * reqFromRegBuf, network="From", virtual_network="7", vnet_type="request";
+
+  // To Region directory
+  MessageBuffer * reqToRegDir, network="To", virtual_network="5", vnet_type="request";
+  MessageBuffer * reqFromRegDir, network="From", virtual_network="5", vnet_type="request";
+  MessageBuffer * unblockToRegDir, network="To", virtual_network="4", vnet_type="unblock";
+
+  // Internal self-trigger queues and the memory-controller response path.
+  MessageBuffer * triggerQueue;
+  MessageBuffer * L3triggerQueue;
+  MessageBuffer * responseFromMemory;
+{
+ // STATES
+ state_declaration(State, desc="Directory states", default="Directory_State_U") {
+ U, AccessPermission:Backing_Store, desc="unblocked";
+ BR, AccessPermission:Backing_Store, desc="got CPU read request, blocked while sent to L3";
+ BW, AccessPermission:Backing_Store, desc="got CPU write request, blocked while sent to L3";
+ BL, AccessPermission:Busy, desc="got L3 WB request";
+ // BL is Busy because it's possible for the data only to be in the network
+ // in the WB, L3 has sent it and gone on with its business in possibly I
+ // state.
+ BI, AccessPermission:Backing_Store, desc="Blocked waiting for inv ack from core";
+ BS_M, AccessPermission:Backing_Store, desc="blocked waiting for memory";
+ BM_M, AccessPermission:Backing_Store, desc="blocked waiting for memory";
+ B_M, AccessPermission:Backing_Store, desc="blocked waiting for memory";
+ BP, AccessPermission:Backing_Store, desc="blocked waiting for probes, no need for memory";
+ BS_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory";
+ BM_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory";
+ B_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory";
+ BS_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory";
+ BM_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory";
+ B_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory";
+ B, AccessPermission:Backing_Store, desc="sent response, Blocked til ack";
+
+ // These are needed for when a private requests was issued before an inv was received
+ // for writebacks
+ BS_Pm_BL, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory";
+ BM_Pm_BL, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory";
+ B_Pm_BL, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory";
+ BP_BL, AccessPermission:Backing_Store, desc="blocked waiting for probes, no need for memory";
+ // for reads
+ BS_Pm_B, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory";
+ BM_Pm_B, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory";
+ B_Pm_B, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory";
+ BP_B, AccessPermission:Backing_Store, desc="blocked waiting for probes, no need for memory";
+ }
+
+ // Events
+  // Events
+  // The *P variants are private accesses that bypassed the region directory
+  // (triggered off requestFromCores with in_msg.Private set).
+  enumeration(Event, desc="Directory events") {
+    // CPU requests
+    RdBlkS, desc="...";
+    RdBlkM, desc="...";
+    RdBlk, desc="...";
+    WriteThrough, desc="WriteThrough Message";
+    Atomic, desc="Atomic Message";
+
+    // Private (region-dir-bypassing) flavors of the requests above.
+    RdBlkSP, desc="...";
+    RdBlkMP, desc="...";
+    RdBlkP, desc="...";
+    VicDirtyP, desc="...";
+    VicCleanP, desc="...";
+    WriteThroughP, desc="WriteThrough Message";
+    AtomicP, desc="Atomic Message";
+
+    // writebacks
+    VicDirty, desc="...";
+    VicClean, desc="...";
+    CPUData, desc="WB data from CPU";
+    StaleWB, desc="WB response for a no longer valid request";
+
+    // probe responses
+    CPUPrbResp, desc="Probe Response Msg";
+    LastCPUPrbResp, desc="Last Probe Response Msg";
+
+    ProbeAcksComplete, desc="Probe Acks Complete";
+
+    L3Hit, desc="Hit in L3 return data to core";
+
+    // Memory Controller
+    MemData, desc="Fetched data from memory arrives";
+    WBAck, desc="Writeback Ack from memory arrives";
+
+    CoreUnblock, desc="Core received data, unblock";
+    UnblockWriteThrough, desc="unblock, self triggered";
+
+    StaleVicDirty, desc="Core invalidated before VicDirty processed";
+    StaleVicDirtyP, desc="Core invalidated before VicDirty processed";
+
+    // For region protocol
+    CPUReq, desc="Generic CPU request";
+    Inv, desc="Region dir needs a block invalidated";
+    Downgrade, desc="Region dir needs a block downgraded";
+
+    // For private accesses (bypassed reg-dir)
+    CPUReadP, desc="Initial req from core, sent to L3";
+    CPUWriteP, desc="Initial req from core, sent to L3";
+  }
+
+  // Resource/stat categories reported from transitions to recordStats.
+  // NOTE(review): the tag-array descriptions previously said "data array"
+  // (copy-paste from the two entries above); corrected to "tag array".
+  enumeration(RequestType, desc="To communicate stats from transitions to recordStats") {
+    L3DataArrayRead, desc="Read the data array";
+    L3DataArrayWrite, desc="Write the data array";
+    L3TagArrayRead, desc="Read the tag array";
+    L3TagArrayWrite, desc="Write the tag array";
+  }
+
+ // TYPES
+
+ // DirectoryEntry
+  // DirectoryEntry: per-block permanent directory state.
+  structure(Entry, desc="...", interface="AbstractEntry") {
+    State DirectoryState, desc="Directory state";
+    DataBlock DataBlk, desc="data for the block";
+    NetDest VicDirtyIgnore, desc="VicDirty coming from whom to ignore";
+  }
+
+  // L3 cache line payload.
+  structure(CacheEntry, desc="...", interface="AbstractCacheEntry") {
+    DataBlock DataBlk, desc="data for the block";
+    MachineID LastSender, desc="Mach which this block came from";
+  }
+
+  // Transaction Buffer Entry: per in-flight request bookkeeping.
+  structure(TBE, desc="...") {
+    State TBEState, desc="Transient state";
+    DataBlock DataBlk, desc="data for the block";
+    DataBlock DataBlkAux, desc="Auxiliary data for the block";
+    bool Dirty, desc="Is the data dirty?";
+    int NumPendingAcks, desc="num acks expected";
+    MachineID OriginalRequestor, desc="Original Requestor";
+    MachineID WTRequestor, desc="WT Requestor";
+    bool Cached, desc="data hit in Cache";
+    bool MemData, desc="Got MemData?",default="false";
+    bool wtData, desc="Got write through data?",default="false";
+    bool atomicData, desc="Got Atomic op?",default="false";
+    Cycles InitialRequestTime, desc="...";
+    Cycles ForwardRequestTime, desc="...";
+    Cycles ProbeRequestStartTime, desc="...";
+    bool DemandRequest, desc="for profiling";
+    MachineID LastSender, desc="Mach which this block came from";
+    bool L3Hit, default="false", desc="Was this an L3 hit?";
+    bool TriggeredAcksComplete, default="false", desc="True if already triggered acks complete";
+    WriteMask writeMask, desc="outstanding write through mask";
+  }
+
+  // Backed by the C++ TBETable implementation.
+  structure(TBETable, external="yes") {
+    TBE lookup(Addr);
+    void allocate(Addr);
+    void deallocate(Addr);
+    bool isPresent(Addr);
+  }
+
+  TBETable TBEs, template="<Directory_TBE>", constructor="m_number_of_TBEs";
+
+  // Prototypes provided by the generated controller / AbstractController.
+  Tick clockEdge();
+  Tick cyclesToTicks(Cycles c);
+
+  void set_tbe(TBE a);
+  void unset_tbe();
+  void wakeUpAllBuffers();
+  void wakeUpBuffers(Addr a);
+  Cycles curCycle();
+
+  // Look up the directory entry for addr, allocating one on first touch so
+  // callers always get a valid pointer.
+  Entry getDirectoryEntry(Addr addr), return_by_pointer="yes" {
+    Entry dir_entry := static_cast(Entry, "pointer", directory.lookup(addr));
+
+    if (is_valid(dir_entry)) {
+      //DPRINTF(RubySlicc, "Getting entry %s: %s\n", addr, dir_entry.DataBlk);
+      return dir_entry;
+    }
+
+    dir_entry := static_cast(Entry, "pointer",
+                             directory.allocate(addr, new Entry));
+    return dir_entry;
+  }
+
+  // Prefer in-flight data (TBE with MemData) over the backing directory copy.
+  DataBlock getDataBlock(Addr addr), return_by_ref="yes" {
+    TBE tbe := TBEs.lookup(addr);
+    if (is_valid(tbe) && tbe.MemData) {
+      DPRINTF(RubySlicc, "Returning DataBlk from TBE %s:%s\n", addr, tbe);
+      return tbe.DataBlk;
+    }
+    DPRINTF(RubySlicc, "Returning DataBlk from Dir %s:%s\n", addr, getDirectoryEntry(addr));
+    return getDirectoryEntry(addr).DataBlk;
+  }
+
+  // State lives solely in the directory entry; the TBE/cache entry arguments
+  // are ignored (see getAccessPermission for the rationale).
+  State getState(TBE tbe, CacheEntry entry, Addr addr) {
+    return getDirectoryEntry(addr).DirectoryState;
+  }
+
+  State getStateFromAddr(Addr addr) {
+    return getDirectoryEntry(addr).DirectoryState;
+  }
+
+  void setState(TBE tbe, CacheEntry entry, Addr addr, State state) {
+    getDirectoryEntry(addr).DirectoryState := state;
+  }
+
+  AccessPermission getAccessPermission(Addr addr) {
+    // For this Directory, all permissions are just tracked in Directory, since
+    // it's not possible to have something in TBE but not Dir, just keep track
+    // of state all in one place.
+    if(directory.isPresent(addr)) {
+      return Directory_State_to_permission(getDirectoryEntry(addr).DirectoryState);
+    }
+
+    return AccessPermission:NotPresent;
+  }
+
+  // Functional (debug/backdoor) read: serve from the TBE if one is in
+  // flight, otherwise fall through to functional memory.
+  void functionalRead(Addr addr, Packet *pkt) {
+    TBE tbe := TBEs.lookup(addr);
+    if(is_valid(tbe)) {
+      testAndRead(addr, tbe.DataBlk, pkt);
+    } else {
+      functionalMemoryRead(pkt);
+    }
+  }
+
+  // Functional write: update both the in-flight TBE copy (if any) and
+  // memory; returns the number of locations touched.
+  int functionalWrite(Addr addr, Packet *pkt) {
+    int num_functional_writes := 0;
+
+    TBE tbe := TBEs.lookup(addr);
+    if(is_valid(tbe)) {
+      num_functional_writes := num_functional_writes +
+        testAndWrite(addr, tbe.DataBlk, pkt);
+    }
+
+    num_functional_writes := num_functional_writes + functionalMemoryWrite(pkt);
+    return num_functional_writes;
+  }
+
+  void setAccessPermission(CacheEntry entry, Addr addr, State state) {
+    getDirectoryEntry(addr).changePermission(Directory_State_to_permission(state));
+  }
+
+  // Map transition-declared RequestTypes onto L3 stat counters.
+  void recordRequestType(RequestType request_type, Addr addr) {
+    if (request_type == RequestType:L3DataArrayRead) {
+      L3CacheMemory.recordRequestType(CacheRequestType:DataArrayRead, addr);
+    } else if (request_type == RequestType:L3DataArrayWrite) {
+      L3CacheMemory.recordRequestType(CacheRequestType:DataArrayWrite, addr);
+    } else if (request_type == RequestType:L3TagArrayRead) {
+      L3CacheMemory.recordRequestType(CacheRequestType:TagArrayRead, addr);
+    } else if (request_type == RequestType:L3TagArrayWrite) {
+      L3CacheMemory.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+    }
+  }
+
+  // Gate transitions on L3 data/tag array bandwidth availability.
+  bool checkResourceAvailable(RequestType request_type, Addr addr) {
+    if (request_type == RequestType:L3DataArrayRead) {
+      return L3CacheMemory.checkResourceAvailable(CacheResourceType:DataArray, addr);
+    } else if (request_type == RequestType:L3DataArrayWrite) {
+      return L3CacheMemory.checkResourceAvailable(CacheResourceType:DataArray, addr);
+    } else if (request_type == RequestType:L3TagArrayRead) {
+      return L3CacheMemory.checkResourceAvailable(CacheResourceType:TagArray, addr);
+    } else if (request_type == RequestType:L3TagArrayWrite) {
+      return L3CacheMemory.checkResourceAvailable(CacheResourceType:TagArray, addr);
+    } else {
+      error("Invalid RequestType type in checkResourceAvailable");
+      return true;
+    }
+  }
+
+  // ** OUT_PORTS **
+  out_port(probeNetwork_out, NBProbeRequestMsg, probeToCore);   // probes to cores
+  out_port(responseNetwork_out, ResponseMsg, responseToCore);   // data/acks to cores
+
+  out_port(requestNetworkReg_out, CPURequestMsg, reqToRegDir);  // forwarded to region dir
+  out_port(regAckNetwork_out, UnblockMsg, unblockToRegDir);     // unblocks to region dir
+
+  // Self-scheduled trigger queues (probe-acks-complete, L3 hit).
+  out_port(triggerQueue_out, TriggerMsg, triggerQueue);
+  out_port(L3TriggerQueue_out, TriggerMsg, L3triggerQueue);
+
+ // ** IN_PORTS **
+
+ // Trigger Queue
+  // Trigger Queue
+  // Highest-rank port: self-triggered events (all probe acks collected, or a
+  // self-unblock after a write-through) preempt external traffic.
+  in_port(triggerQueue_in, TriggerMsg, triggerQueue, rank=7) {
+    if (triggerQueue_in.isReady(clockEdge())) {
+      peek(triggerQueue_in, TriggerMsg) {
+        TBE tbe := TBEs.lookup(in_msg.addr);
+        CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr));
+        if (in_msg.Type == TriggerType:AcksComplete) {
+          trigger(Event:ProbeAcksComplete, in_msg.addr, entry, tbe);
+        } else if (in_msg.Type == TriggerType:UnblockWriteThrough) {
+          trigger(Event:UnblockWriteThrough, in_msg.addr, entry, tbe);
+        } else {
+          error("Unknown trigger msg");
+        }
+      }
+    }
+  }
+
+  // Delayed L3-hit notifications (enqueued with l3_hit_latency).
+  in_port(L3TriggerQueue_in, TriggerMsg, L3triggerQueue, rank=6) {
+    if (L3TriggerQueue_in.isReady(clockEdge())) {
+      peek(L3TriggerQueue_in, TriggerMsg) {
+        TBE tbe := TBEs.lookup(in_msg.addr);
+        CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr));
+        if (in_msg.Type == TriggerType:L3Hit) {
+          trigger(Event:L3Hit, in_msg.addr, entry, tbe);
+        } else {
+          error("Unknown trigger msg");
+        }
+      }
+    }
+  }
+
+ // Unblock Network
+  // Unblock Network
+  // A core signals it has consumed a response; releases the blocked line.
+  in_port(unblockNetwork_in, UnblockMsg, unblockFromCores, rank=5) {
+    if (unblockNetwork_in.isReady(clockEdge())) {
+      peek(unblockNetwork_in, UnblockMsg) {
+        TBE tbe := TBEs.lookup(in_msg.addr);
+        CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr));
+        trigger(Event:CoreUnblock, in_msg.addr, entry, tbe);
+      }
+    }
+  }
+
+  // Core response network
+  // Probe responses, writeback data, and stale-WB notifications.  The last
+  // outstanding probe ack (NumPendingAcks == 1, not yet triggered) is
+  // promoted to LastCPUPrbResp so the transition can fire AcksComplete.
+  in_port(responseNetwork_in, ResponseMsg, responseFromCores, rank=4) {
+    if (responseNetwork_in.isReady(clockEdge())) {
+      peek(responseNetwork_in, ResponseMsg) {
+        DPRINTF(RubySlicc, "core responses %s\n", in_msg);
+        TBE tbe := TBEs.lookup(in_msg.addr);
+        CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr));
+        if (in_msg.Type == CoherenceResponseType:CPUPrbResp) {
+          if (is_valid(tbe) && tbe.NumPendingAcks == 1
+              && tbe.TriggeredAcksComplete == false) {
+            trigger(Event:LastCPUPrbResp, in_msg.addr, entry, tbe);
+          } else {
+            trigger(Event:CPUPrbResp, in_msg.addr, entry, tbe);
+          }
+        } else if (in_msg.Type == CoherenceResponseType:CPUData) {
+          trigger(Event:CPUData, in_msg.addr, entry, tbe);
+        } else if (in_msg.Type == CoherenceResponseType:StaleNotif) {
+          trigger(Event:StaleWB, in_msg.addr, entry, tbe);
+        } else {
+          error("Unexpected response type");
+        }
+      }
+    }
+  }
+
+ // off-chip memory request/response is done
+  // off-chip memory request/response is done
+  in_port(memQueue_in, MemoryMsg, responseFromMemory, rank=3) {
+    if (memQueue_in.isReady(clockEdge())) {
+      peek(memQueue_in, MemoryMsg) {
+        TBE tbe := TBEs.lookup(in_msg.addr);
+        CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr));
+        if (in_msg.Type == MemoryRequestType:MEMORY_READ) {
+          trigger(Event:MemData, in_msg.addr, entry, tbe);
+          DPRINTF(RubySlicc, "%s\n", in_msg);
+        } else if (in_msg.Type == MemoryRequestType:MEMORY_WB) {
+          trigger(Event:WBAck, in_msg.addr, entry, tbe); // ignore WBAcks, don't care about them.
+        } else {
+          DPRINTF(RubySlicc, "%s\n", in_msg.Type);
+          error("Invalid message");
+        }
+      }
+    }
+  }
+
+  // Region-buffer commands: forced invalidations/downgrades for region moves.
+  in_port(regBuf_in, CPURequestMsg, reqFromRegBuf, rank=2) {
+    if (regBuf_in.isReady(clockEdge())) {
+      peek(regBuf_in, CPURequestMsg) {
+        TBE tbe := TBEs.lookup(in_msg.addr);
+        CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr));
+        if (in_msg.Type == CoherenceRequestType:ForceInv) {
+          trigger(Event:Inv, in_msg.addr, entry, tbe);
+        } else if (in_msg.Type == CoherenceRequestType:ForceDowngrade) {
+          trigger(Event:Downgrade, in_msg.addr, entry, tbe);
+        } else {
+          error("Bad request from region buffer");
+        }
+      }
+    }
+  }
+
+  // Requests forwarded by the region directory (the non-private path).
+  // Victims from a requestor listed in VicDirtyIgnore were already
+  // invalidated by a probe and are treated as stale.
+  in_port(regDir_in, CPURequestMsg, reqFromRegDir, rank=1) {
+    if (regDir_in.isReady(clockEdge())) {
+      peek(regDir_in, CPURequestMsg) {
+        TBE tbe := TBEs.lookup(in_msg.addr);
+        CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr));
+        if (in_msg.Type == CoherenceRequestType:RdBlk) {
+          trigger(Event:RdBlk, in_msg.addr, entry, tbe);
+        } else if (in_msg.Type == CoherenceRequestType:RdBlkS) {
+          trigger(Event:RdBlkS, in_msg.addr, entry, tbe);
+        } else if (in_msg.Type == CoherenceRequestType:RdBlkM) {
+          trigger(Event:RdBlkM, in_msg.addr, entry, tbe);
+        } else if (in_msg.Type == CoherenceRequestType:Atomic) {
+          trigger(Event:Atomic, in_msg.addr, entry, tbe);
+        } else if (in_msg.Type == CoherenceRequestType:WriteThrough) {
+          trigger(Event:WriteThrough, in_msg.addr, entry, tbe);
+        } else if (in_msg.Type == CoherenceRequestType:VicDirty) {
+          if (getDirectoryEntry(in_msg.addr).VicDirtyIgnore.isElement(in_msg.Requestor)) {
+            DPRINTF(RubySlicc, "Dropping VicDirty for address %s\n", in_msg.addr);
+            trigger(Event:StaleVicDirty, in_msg.addr, entry, tbe);
+          } else {
+            trigger(Event:VicDirty, in_msg.addr, entry, tbe);
+          }
+        } else if (in_msg.Type == CoherenceRequestType:VicClean) {
+          if (getDirectoryEntry(in_msg.addr).VicDirtyIgnore.isElement(in_msg.Requestor)) {
+            DPRINTF(RubySlicc, "Dropping VicClean for address %s\n", in_msg.addr);
+            trigger(Event:StaleVicDirty, in_msg.addr, entry, tbe);
+          } else {
+            trigger(Event:VicClean, in_msg.addr, entry, tbe);
+          }
+        } else {
+          error("Bad message type fwded from Region Dir");
+        }
+      }
+    }
+  }
+
+  // Direct core requests (lowest rank).  Private requests bypass the region
+  // directory and map to the *P events; everything else becomes a generic
+  // CPUReq that is forwarded to the region directory.
+  in_port(requestNetwork_in, CPURequestMsg, requestFromCores, rank=0) {
+    if (requestNetwork_in.isReady(clockEdge())) {
+      peek(requestNetwork_in, CPURequestMsg) {
+        TBE tbe := TBEs.lookup(in_msg.addr);
+        CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr));
+        if (in_msg.Private) {
+          // Bypass the region dir
+          if (in_msg.Type == CoherenceRequestType:RdBlk) {
+            trigger(Event:RdBlkP, in_msg.addr, entry, tbe);
+          } else if (in_msg.Type == CoherenceRequestType:RdBlkS) {
+            trigger(Event:RdBlkSP, in_msg.addr, entry, tbe);
+          } else if (in_msg.Type == CoherenceRequestType:RdBlkM) {
+            trigger(Event:RdBlkMP, in_msg.addr, entry, tbe);
+          } else if (in_msg.Type == CoherenceRequestType:Atomic) {
+            trigger(Event:AtomicP, in_msg.addr, entry, tbe);
+          } else if (in_msg.Type == CoherenceRequestType:WriteThrough) {
+            trigger(Event:WriteThroughP, in_msg.addr, entry, tbe);
+          } else if (in_msg.Type == CoherenceRequestType:VicDirty) {
+            if (getDirectoryEntry(in_msg.addr).VicDirtyIgnore.isElement(in_msg.Requestor)) {
+              DPRINTF(RubySlicc, "Dropping VicDirtyP for address %s\n", in_msg.addr);
+              trigger(Event:StaleVicDirtyP, in_msg.addr, entry, tbe);
+            } else {
+              DPRINTF(RubySlicc, "Got VicDirty from %s on %s\n", in_msg.Requestor, in_msg.addr);
+              trigger(Event:VicDirtyP, in_msg.addr, entry, tbe);
+            }
+          } else if (in_msg.Type == CoherenceRequestType:VicClean) {
+            if (getDirectoryEntry(in_msg.addr).VicDirtyIgnore.isElement(in_msg.Requestor)) {
+              DPRINTF(RubySlicc, "Dropping VicCleanP for address %s\n", in_msg.addr);
+              trigger(Event:StaleVicDirtyP, in_msg.addr, entry, tbe);
+            } else {
+              DPRINTF(RubySlicc, "Got VicClean from %s on %s\n", in_msg.Requestor, in_msg.addr);
+              trigger(Event:VicCleanP, in_msg.addr, entry, tbe);
+            }
+          } else {
+            error("Bad message type for private access");
+          }
+        } else {
+          trigger(Event:CPUReq, in_msg.addr, entry, tbe);
+        }
+      }
+    }
+  }
+
+ // Actions
+ action(s_sendResponseS, "s", desc="send Shared response") {
+ enqueue(responseNetwork_out, ResponseMsg, response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:NBSysResp;
+ if (tbe.L3Hit) {
+ out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0));
+ } else {
+ out_msg.Sender := machineID;
+ }
+ out_msg.Destination.add(tbe.OriginalRequestor);
+ out_msg.DataBlk := tbe.DataBlk;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ out_msg.Dirty := false;
+ out_msg.State := CoherenceState:Shared;
+ out_msg.InitialRequestTime := tbe.InitialRequestTime;
+ out_msg.ForwardRequestTime := tbe.ForwardRequestTime;
+ out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime;
+ out_msg.OriginalResponder := tbe.LastSender;
+ out_msg.DemandRequest := tbe.DemandRequest;
+ out_msg.L3Hit := tbe.L3Hit;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+
+ action(es_sendResponseES, "es", desc="send Exclusive or Shared response") {
+ enqueue(responseNetwork_out, ResponseMsg, response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:NBSysResp;
+ if (tbe.L3Hit) {
+ out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0));
+ } else {
+ out_msg.Sender := machineID;
+ }
+ out_msg.Destination.add(tbe.OriginalRequestor);
+ out_msg.DataBlk := tbe.DataBlk;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ out_msg.Dirty := tbe.Dirty;
+ if (tbe.Cached) {
+ out_msg.State := CoherenceState:Shared;
+ } else {
+ out_msg.State := CoherenceState:Exclusive;
+ }
+ out_msg.InitialRequestTime := tbe.InitialRequestTime;
+ out_msg.ForwardRequestTime := tbe.ForwardRequestTime;
+ out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime;
+ out_msg.OriginalResponder := tbe.LastSender;
+ out_msg.DemandRequest := tbe.DemandRequest;
+ out_msg.L3Hit := tbe.L3Hit;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+
+  // Send a Modified-state response.  Write-throughs get no data response at
+  // all — just a self-trigger to unblock; atomics get the data response AND
+  // the self-unblock (no CoreUnblock will arrive for either).
+  action(m_sendResponseM, "m", desc="send Modified response") {
+    if (tbe.wtData) {
+      enqueue(triggerQueue_out, TriggerMsg, 1) {
+        out_msg.addr := address;
+        out_msg.Type := TriggerType:UnblockWriteThrough;
+      }
+    } else {
+      enqueue(responseNetwork_out, ResponseMsg, response_latency) {
+        out_msg.addr := address;
+        out_msg.Type := CoherenceResponseType:NBSysResp;
+        if (tbe.L3Hit) {
+          out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0));
+        } else {
+          out_msg.Sender := machineID;
+        }
+        out_msg.Destination.add(tbe.OriginalRequestor);
+        out_msg.DataBlk := tbe.DataBlk;
+        out_msg.MessageSize := MessageSizeType:Response_Data;
+        out_msg.Dirty := tbe.Dirty;
+        out_msg.State := CoherenceState:Modified;
+        out_msg.CtoD := false;
+        out_msg.InitialRequestTime := tbe.InitialRequestTime;
+        out_msg.ForwardRequestTime := tbe.ForwardRequestTime;
+        out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime;
+        out_msg.OriginalResponder := tbe.LastSender;
+        out_msg.DemandRequest := tbe.DemandRequest;
+        out_msg.L3Hit := tbe.L3Hit;
+        if (tbe.atomicData) {
+          out_msg.WTRequestor := tbe.WTRequestor;
+        }
+        DPRINTF(RubySlicc, "%s\n", out_msg);
+      }
+      if (tbe.atomicData) {
+        enqueue(triggerQueue_out, TriggerMsg, 1) {
+          out_msg.addr := address;
+          out_msg.Type := TriggerType:UnblockWriteThrough;
+        }
+      }
+    }
+  }
+
+  // Bypass variants: reply directly to the requestor peeked from the core
+  // request network (private path) rather than the TBE's OriginalRequestor;
+  // timing fields come from the in-flight message, not the TBE.
+  action(sb_sendResponseSBypass, "sb", desc="send Shared response") {
+    peek(requestNetwork_in, CPURequestMsg) {
+      enqueue(responseNetwork_out, ResponseMsg, response_latency) {
+        out_msg.addr := address;
+        out_msg.Type := CoherenceResponseType:NBSysResp;
+        if (tbe.L3Hit) {
+          out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0));
+        } else {
+          out_msg.Sender := machineID;
+        }
+        out_msg.Destination.add(in_msg.Requestor);
+        out_msg.DataBlk := tbe.DataBlk;
+        out_msg.MessageSize := MessageSizeType:Response_Data;
+        out_msg.Dirty := false;
+        out_msg.State := CoherenceState:Shared;
+        out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+        out_msg.ForwardRequestTime := curCycle();
+        out_msg.ProbeRequestStartTime := in_msg.ProbeRequestStartTime;
+        out_msg.OriginalResponder := tbe.LastSender;
+        out_msg.DemandRequest := false;
+        out_msg.L3Hit := tbe.L3Hit;
+        DPRINTF(RubySlicc, "%s\n", out_msg);
+      }
+    }
+  }
+
+  // Exclusive/Shared bypass response; the requestor may also force Shared
+  // via in_msg.ForceShared.
+  action(esb_sendResponseESBypass, "esb", desc="send Exclusive or Shared response") {
+    peek(requestNetwork_in, CPURequestMsg) {
+      enqueue(responseNetwork_out, ResponseMsg, response_latency) {
+        out_msg.addr := address;
+        out_msg.Type := CoherenceResponseType:NBSysResp;
+        if (tbe.L3Hit) {
+          out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0));
+        } else {
+          out_msg.Sender := machineID;
+        }
+        out_msg.Destination.add(in_msg.Requestor);
+        out_msg.DataBlk := tbe.DataBlk;
+        out_msg.MessageSize := MessageSizeType:Response_Data;
+        out_msg.Dirty := tbe.Dirty;
+        if (tbe.Cached || in_msg.ForceShared) {
+          out_msg.State := CoherenceState:Shared;
+        } else {
+          out_msg.State := CoherenceState:Exclusive;
+        }
+        out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+        out_msg.ForwardRequestTime := curCycle();
+        out_msg.ProbeRequestStartTime := in_msg.ProbeRequestStartTime;
+        out_msg.OriginalResponder := tbe.LastSender;
+        out_msg.DemandRequest := false;
+        out_msg.L3Hit := tbe.L3Hit;
+        DPRINTF(RubySlicc, "%s\n", out_msg);
+      }
+    }
+  }
+
+  // Bypass response for a write-through (WBAck only) or an atomic (full data
+  // response from the directory copy).  Either way, self-trigger an
+  // UnblockWriteThrough since no CoreUnblock will follow.
+  action(mbwt_sendResponseWriteThroughBypass, "mbwt", desc="send write through response") {
+    peek(requestNetwork_in, CPURequestMsg) {
+      if (in_msg.Type == CoherenceRequestType:WriteThrough) {
+        enqueue(responseNetwork_out, ResponseMsg, response_latency) {
+          out_msg.addr := address;
+          out_msg.Type := CoherenceResponseType:NBSysWBAck;
+          out_msg.Destination.add(in_msg.Requestor);
+          out_msg.WTRequestor := in_msg.WTRequestor;
+          out_msg.Sender := machineID;
+          out_msg.MessageSize := MessageSizeType:Writeback_Control;
+          out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+          out_msg.ForwardRequestTime := curCycle();
+          out_msg.ProbeRequestStartTime := in_msg.ProbeRequestStartTime;
+          out_msg.DemandRequest := false;
+        }
+      } else {
+        assert(in_msg.Type == CoherenceRequestType:Atomic);
+        enqueue(responseNetwork_out, ResponseMsg, response_latency) {
+          out_msg.addr := address;
+          out_msg.Type := CoherenceResponseType:NBSysResp;
+          if (tbe.L3Hit) {
+            out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0));
+          } else {
+            out_msg.Sender := machineID;
+          }
+          out_msg.Destination.add(in_msg.Requestor);
+          // Atomic result comes from the (already updated) directory copy.
+          out_msg.DataBlk := getDirectoryEntry(address).DataBlk;
+          out_msg.MessageSize := MessageSizeType:Response_Data;
+          out_msg.Dirty := in_msg.Dirty;
+          out_msg.State := CoherenceState:Modified;
+          out_msg.CtoD := false;
+          out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+          out_msg.ForwardRequestTime := curCycle();
+          out_msg.ProbeRequestStartTime := in_msg.ProbeRequestStartTime;
+          out_msg.OriginalResponder := tbe.LastSender;
+          out_msg.DemandRequest := false;
+          out_msg.L3Hit := tbe.L3Hit;
+          out_msg.WTRequestor := in_msg.WTRequestor;
+          DPRINTF(RubySlicc, "%s\n", out_msg);
+        }
+      }
+      enqueue(triggerQueue_out, TriggerMsg, 1) {
+        out_msg.addr := address;
+        out_msg.Type := TriggerType:UnblockWriteThrough;
+      }
+    }
+  }
+
+  // Reply to a private request with an NBSysResp granting Modified state,
+  // using the TBE's data and dirty status. Sender is the L3 cache on an
+  // L3 hit, otherwise this directory.
+  action(mb_sendResponseMBypass, "mb", desc="send Modified response") {
+    peek(requestNetwork_in, CPURequestMsg) {
+      enqueue(responseNetwork_out, ResponseMsg, response_latency) {
+        out_msg.addr := address;
+        out_msg.Type := CoherenceResponseType:NBSysResp;
+        if (tbe.L3Hit) {
+          out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0));
+        } else {
+          out_msg.Sender := machineID;
+        }
+        out_msg.Destination.add(in_msg.Requestor);
+        out_msg.DataBlk := tbe.DataBlk;
+        out_msg.MessageSize := MessageSizeType:Response_Data;
+        out_msg.Dirty := tbe.Dirty;
+        out_msg.State := CoherenceState:Modified;
+        // Data response, not a change-to-dirty permission upgrade.
+        out_msg.CtoD := false;
+        out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+        out_msg.ForwardRequestTime := curCycle();
+        out_msg.ProbeRequestStartTime := in_msg.ProbeRequestStartTime;
+        out_msg.OriginalResponder := tbe.LastSender;
+        out_msg.DemandRequest := false;
+        out_msg.L3Hit := tbe.L3Hit;
+        DPRINTF(RubySlicc, "%s\n", out_msg);
+      }
+    }
+  }
+
+  // Grant a change-to-dirty (CtoD) permission upgrade: a dataless
+  // Response_Control NBSysResp in Modified state. Requestor and timing
+  // fields come from the TBE (the triggering message is no longer at the
+  // head of any queue).
+  action(c_sendResponseCtoD, "c", desc="send CtoD Ack") {
+    enqueue(responseNetwork_out, ResponseMsg, response_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceResponseType:NBSysResp;
+      out_msg.Sender := machineID;
+      out_msg.Destination.add(tbe.OriginalRequestor);
+      out_msg.MessageSize := MessageSizeType:Response_Control;
+      out_msg.Dirty := false;
+      out_msg.State := CoherenceState:Modified;
+      out_msg.CtoD := true;
+      out_msg.InitialRequestTime := tbe.InitialRequestTime;
+      out_msg.ForwardRequestTime := curCycle();
+      out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime;
+      out_msg.DemandRequest := tbe.DemandRequest;
+      DPRINTF(RubySlicc, "%s\n", out_msg);
+    }
+  }
+
+  // Same CtoD grant, but for a request still sitting at the head of
+  // requestNetwork_in; requestor and timing are peeked from the message.
+  action(cp_sendResponseCtoDP, "cp", desc="send CtoD Ack") {
+    peek(requestNetwork_in, CPURequestMsg) {
+      enqueue(responseNetwork_out, ResponseMsg, response_latency) {
+        out_msg.addr := address;
+        out_msg.Type := CoherenceResponseType:NBSysResp;
+        out_msg.Sender := machineID;
+        out_msg.Destination.add(in_msg.Requestor);
+        out_msg.MessageSize := MessageSizeType:Response_Control;
+        out_msg.Dirty := false;
+        out_msg.State := CoherenceState:Modified;
+        out_msg.CtoD := true;
+        out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+        out_msg.ForwardRequestTime := curCycle();
+        out_msg.ProbeRequestStartTime := in_msg.ProbeRequestStartTime;
+        out_msg.DemandRequest := false;
+        DPRINTF(RubySlicc, "%s\n", out_msg);
+      }
+    }
+  }
+
+  // Ack a writeback request arriving through the region directory queue
+  // (regDir_in). ForwardRequestTime is propagated from the incoming
+  // message rather than stamped here.
+  action(w_sendResponseWBAck, "w", desc="send WB Ack") {
+    peek(regDir_in, CPURequestMsg) {
+      enqueue(responseNetwork_out, ResponseMsg, response_latency) {
+        out_msg.addr := address;
+        out_msg.Type := CoherenceResponseType:NBSysWBAck;
+        out_msg.Destination.add(in_msg.Requestor);
+        out_msg.WTRequestor := in_msg.WTRequestor;
+        out_msg.Sender := machineID;
+        out_msg.MessageSize := MessageSizeType:Writeback_Control;
+        out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+        out_msg.ForwardRequestTime := in_msg.ForwardRequestTime;
+        out_msg.ProbeRequestStartTime := in_msg.ProbeRequestStartTime;
+        out_msg.DemandRequest := false;
+      }
+    }
+  }
+
+  // Ack a writeback request arriving directly on requestNetwork_in
+  // (private/bypass path); ForwardRequestTime is stamped with the current
+  // cycle.
+  action(wp_sendResponseWBAckP, "wp", desc="send WB Ack") {
+    peek(requestNetwork_in, CPURequestMsg) {
+      enqueue(responseNetwork_out, ResponseMsg, response_latency) {
+        out_msg.addr := address;
+        out_msg.Type := CoherenceResponseType:NBSysWBAck;
+        out_msg.Destination.add(in_msg.Requestor);
+        out_msg.WTRequestor := in_msg.WTRequestor;
+        out_msg.Sender := machineID;
+        out_msg.MessageSize := MessageSizeType:Writeback_Control;
+        out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+        out_msg.ForwardRequestTime := curCycle();
+        out_msg.ProbeRequestStartTime := in_msg.ProbeRequestStartTime;
+        out_msg.DemandRequest := false;
+      }
+    }
+  }
+
+  // Ack a cancelled writeback whose notification arrived as a response
+  // message; the ack goes back to that message's Sender.
+  action(wc_sendResponseWBAck, "wc", desc="send WB Ack for cancel") {
+    peek(responseNetwork_in, ResponseMsg) {
+      enqueue(responseNetwork_out, ResponseMsg, response_latency) {
+        out_msg.addr := address;
+        out_msg.Type := CoherenceResponseType:NBSysWBAck;
+        out_msg.Destination.add(in_msg.Sender);
+        out_msg.Sender := machineID;
+        out_msg.MessageSize := MessageSizeType:Writeback_Control;
+      }
+    }
+  }
+
+  // Tell the region directory this request has been ordered here, unless
+  // the incoming message explicitly says no ack is needed.
+  action(ra_ackRegionDir, "ra", desc="Ack region dir") {
+    peek(regDir_in, CPURequestMsg) {
+      if (in_msg.NoAckNeeded == false) {
+        enqueue(responseNetwork_out, ResponseMsg, response_latency_regionDir) {
+          out_msg.addr := address;
+          out_msg.Type := CoherenceResponseType:DirReadyAck;
+          out_msg.Destination.add(map_Address_to_RegionDir(address));
+          out_msg.Sender := machineID;
+          out_msg.MessageSize := MessageSizeType:Writeback_Control;
+        }
+      }
+    }
+  }
+
+  // Fetch the block: on an L3 tag hit, copy the L3 entry into the TBE,
+  // deallocate the L3 line, and schedule a local L3Hit trigger after the
+  // L3 hit latency; otherwise issue a DRAM read. Region-dir request path.
+  action(l_queueMemRdReq, "lr", desc="Read data from memory") {
+    peek(regDir_in, CPURequestMsg) {
+      if (L3CacheMemory.isTagPresent(address)) {
+        enqueue(L3TriggerQueue_out, TriggerMsg, l3_hit_latency) {
+          out_msg.addr := address;
+          out_msg.Type := TriggerType:L3Hit;
+          DPRINTF(RubySlicc, "%s\n", out_msg);
+        }
+        CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address));
+        tbe.DataBlk := entry.DataBlk;
+        tbe.LastSender := entry.LastSender;
+        tbe.L3Hit := true;
+        tbe.MemData := true;
+        DPRINTF(RubySlicc, "L3 data is %s\n", entry.DataBlk);
+        L3CacheMemory.deallocate(address);
+      } else {
+        queueMemoryRead(machineID, address, to_memory_controller_latency);
+      }
+    }
+  }
+
+  // Same L3-or-memory fetch, but for requests arriving directly on
+  // requestNetwork_in (private path). Body is intentionally identical to
+  // l_queueMemRdReq except for the peeked queue.
+  action(lrp_queueMemRdReqP, "lrp", desc="Read data from memory") {
+    peek(requestNetwork_in, CPURequestMsg) {
+      if (L3CacheMemory.isTagPresent(address)) {
+        enqueue(L3TriggerQueue_out, TriggerMsg, l3_hit_latency) {
+          out_msg.addr := address;
+          out_msg.Type := TriggerType:L3Hit;
+          DPRINTF(RubySlicc, "%s\n", out_msg);
+        }
+        CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address));
+        tbe.DataBlk := entry.DataBlk;
+        tbe.LastSender := entry.LastSender;
+        tbe.L3Hit := true;
+        tbe.MemData := true;
+        DPRINTF(RubySlicc, "L3 data is %s\n", entry.DataBlk);
+        L3CacheMemory.deallocate(address);
+      } else {
+        queueMemoryRead(machineID, address, to_memory_controller_latency);
+      }
+    }
+  }
+
+  // Send invalidating probes (with data return) to the sharer set supplied
+  // by the region buffer, and add the sharer count to the pending-ack
+  // total.
+  action(dcr_probeInvCoreData, "dcr", desc="probe inv cores, return data") {
+    peek(regBuf_in, CPURequestMsg) {
+      enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) {
+        out_msg.addr := address;
+        out_msg.Type := ProbeRequestType:PrbInv;
+        out_msg.ReturnData := true;
+        out_msg.MessageSize := MessageSizeType:Control;
+        out_msg.Destination := in_msg.Sharers;
+        tbe.NumPendingAcks := tbe.NumPendingAcks + in_msg.Sharers.count();
+        DPRINTF(RubySlicc, "%s\n", out_msg);
+        APPEND_TRANSITION_COMMENT(" dcr: Acks remaining: ");
+        APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
+        tbe.ProbeRequestStartTime := curCycle();
+      }
+    }
+  }
+
+  // Send downgrade probes (with data return) to the sharer set supplied by
+  // the region buffer, and add the sharer count to the pending-ack total.
+  // Mirrors dcr_probeInvCoreData but issues PrbDowngrade instead of PrbInv.
+  action(ddr_probeDownCoreData, "ddr", desc="probe inv cores, return data") {
+    peek(regBuf_in, CPURequestMsg) {
+      enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) {
+        out_msg.addr := address;
+        out_msg.Type := ProbeRequestType:PrbDowngrade;
+        out_msg.ReturnData := true;
+        out_msg.MessageSize := MessageSizeType:Control;
+        out_msg.Destination := in_msg.Sharers;
+        tbe.NumPendingAcks := tbe.NumPendingAcks + in_msg.Sharers.count();
+        DPRINTF(RubySlicc, "%s\n", out_msg);
+        // Fixed copy-paste in the transition-trace tag: this is ddr, not dcr.
+        APPEND_TRANSITION_COMMENT(" ddr: Acks remaining: ");
+        APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
+        tbe.ProbeRequestStartTime := curCycle();
+      }
+    }
+  }
+
+  // Broadcast downgrade probes (with data return) to all CorePairs, TCPs,
+  // and SQCs, excluding the requestor, and accumulate the expected ack
+  // count (CorePair count minus the requestor's own pair).
+  action(sc_probeShrCoreData, "sc", desc="probe shared cores, return data") {
+    peek(requestNetwork_in, CPURequestMsg) { // not the right network?
+      enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) {
+        out_msg.addr := address;
+        out_msg.Type := ProbeRequestType:PrbDowngrade;
+        out_msg.ReturnData := true;
+        out_msg.MessageSize := MessageSizeType:Control;
+        out_msg.Destination.broadcast(MachineType:CorePair);  // won't be realistic for multisocket
+        tbe.NumPendingAcks := tbe.NumPendingAcks +machineCount(MachineType:CorePair) - 1;
+        out_msg.Destination.broadcast(MachineType:TCP);
+        tbe.NumPendingAcks := tbe.NumPendingAcks + machineCount(MachineType:TCP);
+        out_msg.Destination.broadcast(MachineType:SQC);
+        tbe.NumPendingAcks := tbe.NumPendingAcks + machineCount(MachineType:SQC);
+        out_msg.Destination.remove(in_msg.Requestor);
+        DPRINTF(RubySlicc, "%s\n", (out_msg));
+        APPEND_TRANSITION_COMMENT(" sc: Acks remaining: ");
+        APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
+        tbe.ProbeRequestStartTime := curCycle();
+      }
+    }
+  }
+
+  // Broadcast invalidating probes (no data return) to all CorePairs, TCPs,
+  // and SQCs, excluding the requestor; same ack accounting as
+  // sc_probeShrCoreData.
+  action(ic_probeInvCore, "ic", desc="probe invalidate core, no return data needed") {
+    peek(requestNetwork_in, CPURequestMsg) { // not the right network?
+      enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) {
+        out_msg.addr := address;
+        out_msg.Type := ProbeRequestType:PrbInv;
+        out_msg.ReturnData := false;
+        out_msg.MessageSize := MessageSizeType:Control;
+        out_msg.Destination.broadcast(MachineType:CorePair);  // won't be realistic for multisocket
+        tbe.NumPendingAcks := tbe.NumPendingAcks +machineCount(MachineType:CorePair) - 1;
+        out_msg.Destination.broadcast(MachineType:TCP);
+        tbe.NumPendingAcks := tbe.NumPendingAcks + machineCount(MachineType:TCP);
+        out_msg.Destination.broadcast(MachineType:SQC);
+        tbe.NumPendingAcks := tbe.NumPendingAcks + machineCount(MachineType:SQC);
+        out_msg.Destination.remove(in_msg.Requestor);
+        APPEND_TRANSITION_COMMENT(" ic: Acks remaining: ");
+        APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
+        DPRINTF(RubySlicc, "%s\n", out_msg);
+        tbe.ProbeRequestStartTime := curCycle();
+      }
+    }
+  }
+
+  // Copy the data block from the response at the head of
+  // responseNetwork_in into the directory entry (functional backing store).
+  action(d_writeDataToMemory, "d", desc="Write data to memory") {
+    peek(responseNetwork_in, ResponseMsg) {
+      getDirectoryEntry(address).DataBlk := in_msg.DataBlk;
+      DPRINTF(RubySlicc, "Writing Data: %s to address %s\n", in_msg.DataBlk,
+              in_msg.addr);
+    }
+  }
+
+  // Allocate and initialize a TBE for a request from the region directory
+  // queue. WriteThrough/Atomic requests record their write mask and
+  // WTRequestor; WriteThrough data is merged into the TBE copy of the
+  // directory data immediately.
+  action(t_allocateTBE, "t", desc="allocate TBE Entry") {
+    check_allocate(TBEs);
+    peek(regDir_in, CPURequestMsg) {
+      TBEs.allocate(address);
+      set_tbe(TBEs.lookup(address));
+      if (in_msg.Type == CoherenceRequestType:WriteThrough) {
+        tbe.writeMask.clear();
+        tbe.writeMask.orMask(in_msg.writeMask);
+        tbe.wtData := true;
+        tbe.WTRequestor := in_msg.WTRequestor;
+        tbe.LastSender := in_msg.Requestor;
+      }
+      if (in_msg.Type == CoherenceRequestType:Atomic) {
+        tbe.writeMask.clear();
+        tbe.writeMask.orMask(in_msg.writeMask);
+        tbe.atomicData := true;
+        tbe.WTRequestor := in_msg.WTRequestor;
+        tbe.LastSender := in_msg.Requestor;
+      }
+      tbe.DataBlk := getDirectoryEntry(address).DataBlk; // Data only for WBs
+      tbe.Dirty := false;
+      if (in_msg.Type == CoherenceRequestType:WriteThrough) {
+        tbe.DataBlk.copyPartial(in_msg.DataBlk,tbe.writeMask);
+        tbe.Dirty := false;
+      }
+      tbe.OriginalRequestor := in_msg.Requestor;
+      tbe.NumPendingAcks := 0;
+      tbe.Cached := in_msg.ForceShared;
+      tbe.InitialRequestTime := in_msg.InitialRequestTime;
+      tbe.ForwardRequestTime := curCycle();
+      tbe.ProbeRequestStartTime := in_msg.ProbeRequestStartTime;
+      tbe.DemandRequest := in_msg.DemandRequest;
+    }
+  }
+
+  // Same TBE setup for requests arriving directly on requestNetwork_in;
+  // the only behavioral difference is that DemandRequest is forced false
+  // instead of copied from the message.
+  action(tp_allocateTBEP, "tp", desc="allocate TBE Entry") {
+    check_allocate(TBEs);
+    peek(requestNetwork_in, CPURequestMsg) {
+      TBEs.allocate(address);
+      set_tbe(TBEs.lookup(address));
+      if (in_msg.Type == CoherenceRequestType:WriteThrough) {
+        tbe.writeMask.clear();
+        tbe.writeMask.orMask(in_msg.writeMask);
+        tbe.wtData := true;
+        tbe.WTRequestor := in_msg.WTRequestor;
+        tbe.LastSender := in_msg.Requestor;
+      }
+      if (in_msg.Type == CoherenceRequestType:Atomic) {
+        tbe.writeMask.clear();
+        tbe.writeMask.orMask(in_msg.writeMask);
+        tbe.atomicData := true;
+        tbe.WTRequestor := in_msg.WTRequestor;
+        tbe.LastSender := in_msg.Requestor;
+      }
+      tbe.DataBlk := getDirectoryEntry(address).DataBlk; // Data only for WBs
+      tbe.Dirty := false;
+      if (in_msg.Type == CoherenceRequestType:WriteThrough) {
+        tbe.DataBlk.copyPartial(in_msg.DataBlk,tbe.writeMask);
+        tbe.Dirty := false;
+      }
+      tbe.OriginalRequestor := in_msg.Requestor;
+      tbe.NumPendingAcks := 0;
+      tbe.Cached := in_msg.ForceShared;
+      tbe.InitialRequestTime := in_msg.InitialRequestTime;
+      tbe.ForwardRequestTime := curCycle();
+      tbe.ProbeRequestStartTime := in_msg.ProbeRequestStartTime;
+      tbe.DemandRequest := false;
+    }
+  }
+
+  // Load the expected ack count from the region-dir request into the TBE.
+  action(sa_setAcks, "sa", desc="setAcks") {
+    peek(regDir_in, CPURequestMsg) {
+      tbe.NumPendingAcks := in_msg.Acks;
+      APPEND_TRANSITION_COMMENT(" waiting for acks ");
+      APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
+    }
+  }
+
+  // Minimal TBE allocation for region invalidations: no request fields to
+  // capture, just a zeroed pending-ack count.
+  action(tr_allocateTBE, "tr", desc="allocate TBE Entry for Region inv") {
+    check_allocate(TBEs);
+    TBEs.allocate(address);
+    set_tbe(TBEs.lookup(address));
+    tbe.NumPendingAcks := 0;
+  }
+
+  // Release the TBE for this address.
+  action(dt_deallocateTBE, "dt", desc="deallocate TBE Entry") {
+    TBEs.deallocate(address);
+    unset_tbe();
+  }
+
+  // Apply a private WriteThrough/Atomic directly to the directory data,
+  // staging the merged/atomically-updated block in tbe.DataBlkAux.
+  action(wdp_writeBackDataPrivate, "wdp", desc="Write back data if needed") {
+    peek(requestNetwork_in, CPURequestMsg) {
+      if (in_msg.Type == CoherenceRequestType:WriteThrough) {
+        tbe.DataBlkAux := getDirectoryEntry(address).DataBlk;
+        tbe.DataBlkAux.copyPartial(in_msg.DataBlk,in_msg.writeMask);
+        getDirectoryEntry(address).DataBlk := tbe.DataBlkAux;
+      } else{
+        assert(in_msg.Type == CoherenceRequestType:Atomic);
+        tbe.DataBlkAux.atomicPartial(getDirectoryEntry(address).DataBlk,in_msg.writeMask);
+        getDirectoryEntry(address).DataBlk := tbe.DataBlkAux;
+      }
+    }
+  }
+
+  // Commit TBE data to the directory: merge masked bytes for a
+  // write-through, apply the atomic for atomics, or write the whole block
+  // if the TBE holds dirty probe/writeback data.
+  action(wd_writeBackData, "wd", desc="Write back data if needed") {
+    if (tbe.wtData) {
+      DataBlock tmp := getDirectoryEntry(address).DataBlk;
+      tmp.copyPartial(tbe.DataBlk,tbe.writeMask);
+      tbe.DataBlk := tmp;
+      getDirectoryEntry(address).DataBlk := tbe.DataBlk;
+    } else if (tbe.atomicData) {
+      tbe.DataBlk.atomicPartial(getDirectoryEntry(address).DataBlk,tbe.writeMask);
+      getDirectoryEntry(address).DataBlk := tbe.DataBlk;
+    } else if (tbe.Dirty == true) {
+      APPEND_TRANSITION_COMMENT(" Wrote data back ");
+      getDirectoryEntry(address).DataBlk := tbe.DataBlk;
+    }
+  }
+
+  // Write TBE data to the directory only if it is dirty (invalidation
+  // path).
+  action(wdi_writeBackDataInv, "wdi", desc="Write back inv data if needed") {
+    // Kind of opposite from above...?
+    if (tbe.Dirty == true) {
+      getDirectoryEntry(address).DataBlk := tbe.DataBlk;
+      APPEND_TRANSITION_COMMENT("Writing dirty data to dir");
+      DPRINTF(RubySlicc, "Data %s: %s\n", address, tbe.DataBlk);
+    } else {
+      APPEND_TRANSITION_COMMENT("NOT!!! Writing dirty data to dir");
+    }
+  }
+
+  // TBE-less variant of wdi: write the incoming response's data to the
+  // directory only if the response is marked dirty.
+  action(wdt_writeBackDataInvNoTBE, "wdt", desc="Write back inv data if needed no TBE") {
+    // Kind of opposite from above...?
+    peek(responseNetwork_in, ResponseMsg) {
+      if (in_msg.Dirty == true) {
+        getDirectoryEntry(address).DataBlk := in_msg.DataBlk;
+        APPEND_TRANSITION_COMMENT("Writing dirty data to dir");
+        DPRINTF(RubySlicc, "Data %s: %s\n", address, in_msg.DataBlk);
+      } else {
+        APPEND_TRANSITION_COMMENT("NOT!!! Writing dirty data to dir");
+      }
+    }
+  }
+
+  // On a memory response, refresh the TBE from the directory copy (unless
+  // the TBE already holds newer dirty data) and mark memory data present.
+  // NOTE(review): data is taken from the directory entry, not from the
+  // MemoryMsg itself — presumably the functional backing store is
+  // authoritative here; confirm against the protocol's memory model.
+  action(mt_writeMemDataToTBE, "mt", desc="write Mem data to TBE") {
+    peek(memQueue_in, MemoryMsg) {
+      if (tbe.Dirty == false) {
+        tbe.DataBlk := getDirectoryEntry(address).DataBlk;
+      }
+      tbe.MemData := true;
+    }
+  }
+
+  // On an L3 hit trigger, copy the L3 entry's data and last sender into
+  // the TBE; the TBE must not already hold dirty data.
+  action(ml_writeL3DataToTBE, "ml", desc="write L3 data to TBE") {
+    assert(tbe.Dirty == false);
+    CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address));
+    tbe.DataBlk := entry.DataBlk;
+    tbe.LastSender := entry.LastSender;
+    tbe.L3Hit := true;
+    tbe.MemData := true;
+  }
+
+  // Fold a probe response into the TBE. Dirty responses: for a
+  // write-through, merge the TBE's masked bytes over the probe data; if
+  // the TBE is already dirty, a second full copy must match (double-data
+  // check); otherwise adopt the probe data wholesale. A Hit flag marks
+  // the block as cached elsewhere.
+  action(y_writeProbeDataToTBE, "y", desc="write Probe Data to TBE") {
+    peek(responseNetwork_in, ResponseMsg) {
+      if (in_msg.Dirty) {
+        DPRINTF(RubySlicc, "Got dirty data for %s from %s\n", address, in_msg.Sender);
+        DPRINTF(RubySlicc, "Data is %s\n", in_msg.DataBlk);
+        if (tbe.wtData) {
+          DataBlock tmp := in_msg.DataBlk;
+          tmp.copyPartial(tbe.DataBlk,tbe.writeMask);
+          tbe.DataBlk := tmp;
+        } else if (tbe.Dirty) {
+          if(tbe.atomicData == false && tbe.wtData == false) {
+            DPRINTF(RubySlicc, "Got double data for %s from %s\n", address, in_msg.Sender);
+            assert(tbe.DataBlk == in_msg.DataBlk);  // in case of double data
+          }
+        } else {
+          tbe.DataBlk := in_msg.DataBlk;
+          tbe.Dirty := in_msg.Dirty;
+          tbe.LastSender := in_msg.Sender;
+        }
+      }
+      if (in_msg.Hit) {
+        tbe.Cached := true;
+      }
+    }
+  }
+
+  // Record a CPU writeback's data in the TBE. Note the TBE Dirty flag is
+  // reset to false afterwards (the dirty data is about to be written
+  // through to the directory).
+  action(yc_writeCPUDataToTBE, "yc", desc="write CPU Data to TBE") {
+    peek(responseNetwork_in, ResponseMsg) {
+      if (in_msg.Dirty) {
+        DPRINTF(RubySlicc, "Got dirty data for %s from %s\n", address, in_msg.Sender);
+        DPRINTF(RubySlicc, "Data is %s\n", in_msg.DataBlk);
+        if (tbe.Dirty) {
+          DPRINTF(RubySlicc, "Got double data for %s from %s\n", address, in_msg.Sender);
+          assert(tbe.DataBlk == in_msg.DataBlk);  // in case of double data
+        }
+        tbe.DataBlk := in_msg.DataBlk;
+        tbe.Dirty := false;
+        tbe.LastSender := in_msg.Sender;
+      }
+    }
+  }
+
+  // Consume one pending ack; a decrement at zero is flagged in the trace
+  // as a double ack rather than underflowing the counter.
+  action(x_decrementAcks, "x", desc="decrement Acks pending") {
+    if (tbe.NumPendingAcks > 0) {
+      tbe.NumPendingAcks := tbe.NumPendingAcks - 1;
+    } else {
+      APPEND_TRANSITION_COMMENT(" Double ack! ");
+    }
+    assert(tbe.NumPendingAcks >= 0);
+    APPEND_TRANSITION_COMMENT(" Acks remaining: ");
+    APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
+  }
+
+  // When all acks are in and no completion trigger has fired yet, enqueue
+  // an AcksComplete trigger exactly once.
+  action(o_checkForCompletion, "o", desc="check for ack completion") {
+    if (tbe.NumPendingAcks == 0 && tbe.TriggeredAcksComplete == false) {
+      enqueue(triggerQueue_out, TriggerMsg, 1) {
+        out_msg.addr := address;
+        out_msg.Type := TriggerType:AcksComplete;
+      }
+      tbe.TriggeredAcksComplete := true;
+    }
+    APPEND_TRANSITION_COMMENT(" Check: Acks remaining: ");
+    APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
+  }
+
+  // Same completion check but only latches the flag; no trigger message.
+  action(ont_checkForCompletionNoTrigger, "ont", desc="check for ack completion, no trigger") {
+    if (tbe.NumPendingAcks == 0 && tbe.TriggeredAcksComplete == false) {
+      tbe.TriggeredAcksComplete := true;
+    }
+    APPEND_TRANSITION_COMMENT(" Check: Acks remaining: ");
+    APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
+  }
+
+  // Drop the requestor from this block's VicDirty-ignore set (private
+  // request path).
+  action(rvp_removeVicDirtyIgnore, "rvp", desc="Remove ignored core") {
+    peek(requestNetwork_in, CPURequestMsg) {
+      getDirectoryEntry(address).VicDirtyIgnore.remove(in_msg.Requestor);
+    }
+  }
+
+  // Same removal for requests arriving via the region directory queue.
+  action(rv_removeVicDirtyIgnore, "rv", desc="Remove ignored core") {
+    peek(regDir_in, CPURequestMsg) {
+      getDirectoryEntry(address).VicDirtyIgnore.remove(in_msg.Requestor);
+    }
+  }
+
+  // Forward the request at the head of requestNetwork_in to the region
+  // directory responsible for this address.
+  action(r_sendRequestToRegionDir, "r", desc="send request to Region Directory") {
+    peek(requestNetwork_in, CPURequestMsg) {
+      enqueue(requestNetworkReg_out, CPURequestMsg, 1) {
+        out_msg.addr := address;
+        out_msg.Type := in_msg.Type;
+        out_msg.Requestor := in_msg.Requestor;
+        out_msg.Destination.add(map_Address_to_RegionDir(address));
+        out_msg.Shared := in_msg.Shared;
+        out_msg.MessageSize := in_msg.MessageSize;
+        DPRINTF(RubySlicc, "out dest: %s\n", map_Address_to_RegionDir(address));
+      }
+    }
+  }
+
+  // Ack an invalidation back to its requestor so the region dir knows the
+  // inv has been ordered at this directory.
+  action(ai_ackInvalidate, "ai", desc="Ack to let the reg-dir know that the inv is ordered") {
+    peek(regBuf_in, CPURequestMsg) {
+      enqueue(regAckNetwork_out, UnblockMsg, 1) {
+        out_msg.addr := address;
+        out_msg.Destination.add(in_msg.Requestor);
+        out_msg.MessageSize := MessageSizeType:Response_Control;
+        DPRINTF(RubySlicc, "ai out_msg: %s\n", out_msg);
+      }
+    }
+  }
+
+  // Ack an invalidation for a core response, routed to the region buffer:
+  // buffer 0 serves CorePair responders, buffer 1 everything else
+  // (presumably the GPU-side buffer — confirm against system wiring).
+  action(aic_ackInvalidate, "aic", desc="Ack to let the reg-dir know that the inv is ordered") {
+    peek(responseNetwork_in, ResponseMsg) {
+      if (in_msg.NoAckNeeded == false) {
+        enqueue(regAckNetwork_out, UnblockMsg, 1) {
+          out_msg.addr := address;
+          if (machineIDToMachineType(in_msg.Sender) == MachineType:CorePair) {
+            out_msg.Destination.add(createMachineID(MachineType:RegionBuffer, intToID(0)));
+          } else {
+            out_msg.Destination.add(createMachineID(MachineType:RegionBuffer, intToID(1)));
+          }
+          out_msg.MessageSize := MessageSizeType:Response_Control;
+          DPRINTF(RubySlicc, "ai out_msg: %s\n", out_msg);
+          out_msg.wasValid := in_msg.isValid;
+        }
+      }
+    }
+  }
+
+  // Install writeback data into the L3. On a tag hit, overwrite in place;
+  // otherwise evict a victim to memory first, then allocate and fill.
+  action(al_allocateL3Block, "al", desc="allocate the L3 block on WB") {
+    peek(responseNetwork_in, ResponseMsg) {
+      if (L3CacheMemory.isTagPresent(address)) {
+        CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address));
+        APPEND_TRANSITION_COMMENT(" al wrote data to L3 (hit) ");
+        entry.DataBlk := in_msg.DataBlk;
+        entry.LastSender := in_msg.Sender;
+      } else {
+        if (L3CacheMemory.cacheAvail(address) == false) {
+          Addr victim := L3CacheMemory.cacheProbe(address);
+          CacheEntry victim_entry := static_cast(CacheEntry, "pointer",
+                                                 L3CacheMemory.lookup(victim));
+          queueMemoryWrite(machineID, victim, to_memory_controller_latency,
+                           victim_entry.DataBlk);
+          L3CacheMemory.deallocate(victim);
+        }
+        assert(L3CacheMemory.cacheAvail(address));
+        CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.allocate(address, new CacheEntry));
+        APPEND_TRANSITION_COMMENT(" al wrote data to L3 ");
+        entry.DataBlk := in_msg.DataBlk;
+        entry.LastSender := in_msg.Sender;
+      }
+    }
+  }
+
+  // Install TBE data into the L3 after a write-through/atomic, but only
+  // when the useL3OnWT option is enabled. Same hit/evict/fill structure
+  // as al_allocateL3Block.
+  action(alwt_allocateL3BlockOnWT, "alwt", desc="allocate the L3 block on WT") {
+    if ((tbe.wtData || tbe.atomicData) && useL3OnWT) {
+      if (L3CacheMemory.isTagPresent(address)) {
+        CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address));
+        APPEND_TRANSITION_COMMENT(" al wrote data to L3 (hit) ");
+        entry.DataBlk := tbe.DataBlk;
+        entry.LastSender := tbe.LastSender;
+      } else {
+        if (L3CacheMemory.cacheAvail(address) == false) {
+          Addr victim := L3CacheMemory.cacheProbe(address);
+          CacheEntry victim_entry := static_cast(CacheEntry, "pointer",
+                                                 L3CacheMemory.lookup(victim));
+          queueMemoryWrite(machineID, victim, to_memory_controller_latency,
+                           victim_entry.DataBlk);
+          L3CacheMemory.deallocate(victim);
+        }
+        assert(L3CacheMemory.cacheAvail(address));
+        CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.allocate(address, new CacheEntry));
+        APPEND_TRANSITION_COMMENT(" al wrote data to L3 ");
+        entry.DataBlk := tbe.DataBlk;
+        entry.LastSender := tbe.LastSender;
+      }
+    }
+  }
+
+  // On a ForceInv with dirty TBE data, capture the block in the L3 (same
+  // hit/evict/fill structure as al_allocateL3Block).
+  action(ali_allocateL3Block, "ali", desc="allocate the L3 block on ForceInv") {
+    if (tbe.Dirty == true) {
+      if (L3CacheMemory.isTagPresent(address)) {
+        CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address));
+        APPEND_TRANSITION_COMMENT(" al wrote data to L3 (hit) ");
+        entry.DataBlk := tbe.DataBlk;
+        entry.LastSender := tbe.LastSender;
+      } else {
+        if (L3CacheMemory.cacheAvail(address) == false) {
+          Addr victim := L3CacheMemory.cacheProbe(address);
+          CacheEntry victim_entry := static_cast(CacheEntry, "pointer",
+                                                 L3CacheMemory.lookup(victim));
+          queueMemoryWrite(machineID, victim, to_memory_controller_latency,
+                           victim_entry.DataBlk);
+          L3CacheMemory.deallocate(victim);
+        }
+        assert(L3CacheMemory.cacheAvail(address));
+        CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.allocate(address, new CacheEntry));
+        APPEND_TRANSITION_COMMENT(" al wrote data to L3 ");
+        entry.DataBlk := tbe.DataBlk;
+        entry.LastSender := tbe.LastSender;
+      }
+    }
+  }
+
+  // TBE-less ForceInv capture: same as ali_allocateL3Block but sourced
+  // from the dirty response at the head of responseNetwork_in.
+  // NOTE(review): the action name prefix "ali_" does not match its "alt"
+  // shorthand — confusing but renaming would touch transitions elsewhere.
+  action(ali_allocateL3BlockNoTBE, "alt", desc="allocate the L3 block on ForceInv no TBE") {
+    peek(responseNetwork_in, ResponseMsg) {
+      if (in_msg.Dirty) {
+        if (L3CacheMemory.isTagPresent(address)) {
+          CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address));
+          APPEND_TRANSITION_COMMENT(" ali wrote data to L3 (hit) ");
+          entry.DataBlk := in_msg.DataBlk;
+          entry.LastSender := in_msg.Sender;
+        } else {
+          if (L3CacheMemory.cacheAvail(address) == false) {
+            Addr victim := L3CacheMemory.cacheProbe(address);
+            CacheEntry victim_entry := static_cast(CacheEntry, "pointer",
+                                                   L3CacheMemory.lookup(victim));
+            queueMemoryWrite(machineID, victim, to_memory_controller_latency,
+                             victim_entry.DataBlk);
+            L3CacheMemory.deallocate(victim);
+          }
+          assert(L3CacheMemory.cacheAvail(address));
+          CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.allocate(address, new CacheEntry));
+          APPEND_TRANSITION_COMMENT(" ali wrote data to L3 ");
+          entry.DataBlk := in_msg.DataBlk;
+          entry.LastSender := in_msg.Sender;
+        }
+      }
+    }
+  }
+
+  // --- Queue management and utility actions ---
+
+  // Evict this block from the L3.
+  action(dl_deallocateL3, "dl", desc="deallocate the L3 block") {
+    L3CacheMemory.deallocate(address);
+  }
+
+  // Pop the CPU/private request queue.
+  action(p_popRequestQueue, "p", desc="pop request queue") {
+    requestNetwork_in.dequeue(clockEdge());
+  }
+
+  // Pop the region-directory request queue.
+  action(prd_popRegionQueue, "prd", desc="pop request queue") {
+    regDir_in.dequeue(clockEdge());
+  }
+
+  // Pop the region-buffer request queue.
+  action(prb_popRegionBufQueue, "prb", desc="pop request queue") {
+    regBuf_in.dequeue(clockEdge());
+  }
+
+  // Pop the response queue.
+  action(pr_popResponseQueue, "pr", desc="pop response queue") {
+    responseNetwork_in.dequeue(clockEdge());
+  }
+
+  // Pop the memory-controller response queue.
+  action(pm_popMemQueue, "pm", desc="pop mem queue") {
+    memQueue_in.dequeue(clockEdge());
+  }
+
+  // Pop the local trigger queue.
+  action(pt_popTriggerQueue, "pt", desc="pop trigger queue") {
+    triggerQueue_in.dequeue(clockEdge());
+  }
+
+  // Pop the L3-hit trigger queue.
+  action(ptl_popTriggerQueue, "ptl", desc="pop L3 trigger queue") {
+    L3TriggerQueue_in.dequeue(clockEdge());
+  }
+
+  // Pop the unblock queue.
+  action(pu_popUnblockQueue, "pu", desc="pop unblock queue") {
+    unblockNetwork_in.dequeue(clockEdge());
+  }
+
+  // Requeue the head response for retry after recycle_latency.
+  action(yy_recycleResponseQueue, "yy", desc="recycle response queue") {
+    responseNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+  }
+
+  // Park the head region-dir request until this address is woken.
+  action(ww_stallAndWaitRegRequestQueue, "ww", desc="recycle region dir request queue") {
+    stall_and_wait(regDir_in, address);
+  }
+
+  // Park the head private request until this address is woken.
+  action(st_stallAndWaitRequest, "st", desc="Stall and wait on the address") {
+    stall_and_wait(requestNetwork_in, address);
+  }
+
+  // Wake requests parked on this address.
+  action(wa_wakeUpDependents, "wa", desc="Wake up any requests waiting for this address") {
+    wakeUpBuffers(address);
+  }
+
+  // Wake every parked request (used for region-wide events).
+  action(wa_wakeUpAllDependents, "waa", desc="Wake up any requests waiting for this region") {
+    wakeUpAllBuffers();
+  }
+
+  // Explicit no-op stall: leave the message at the queue head.
+  action(z_stall, "z", desc="...") {
+  }
+
+  // TRANSITIONS
+
+  // transitions from U
+
+  // Region inv/downgrade cannot be serviced while a transaction is in
+  // flight for this block: park it until the address is woken.
+  transition({BR, BW, BL, BI, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B}, {Inv, Downgrade}) {
+      ww_stallAndWaitRegRequestQueue;
+  }
+
+  // Region inv from U: allocate a minimal TBE, probe-invalidate the
+  // sharers, and ack the ordering back to the region dir.
+  transition(U, Inv, BI){L3TagArrayRead} {
+    tr_allocateTBE;
+    dcr_probeInvCoreData; // only need to invalidate sharers
+    ai_ackInvalidate;
+    prb_popRegionBufQueue;
+  }
+
+  // Region downgrade from U: same shape, but downgrade probes.
+  transition(U, Downgrade, BI){L3TagArrayRead} {
+    tr_allocateTBE;
+    ddr_probeDownCoreData; // only need to invalidate sharers
+    ai_ackInvalidate;
+    prb_popRegionBufQueue;
+  }
+
+  // The next 2 transistions are needed in the event that an invalidation
+  // is waiting for its ack from the core, but the event makes it through
+  // the region directory before the acks. This wouldn't be needed if
+  // we waited to ack the region dir until the directory got all the acks
+  transition({BR, BW, BI, BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B}, {RdBlkS, RdBlkM, RdBlk, WriteThrough, Atomic}) {
+      ww_stallAndWaitRegRequestQueue;
+  }
+
+  // Private (P-suffixed) requests stall on the request network instead.
+  transition({BR, BW, BI, BL, BS_M, BM_M, B_M, BS_PM, BM_PM, B_PM, B, BS_Pm_BL, BM_Pm_BL, B_Pm_BL, BP_BL, BS_Pm_B, BM_Pm_B, B_Pm_B, BP_B}, {RdBlkSP, RdBlkMP, RdBlkP}) {
+      st_stallAndWaitRequest;
+  }
+
+  transition({BR, BW, BI, BL, BS_M, BM_M, B_M, BS_PM, BM_PM, B_PM, B, BS_Pm_BL, BM_Pm_BL, B_Pm_BL, BP_BL, BS_Pm_B, BM_Pm_B, B_Pm_B, BP_B}, {WriteThroughP,AtomicP}) {
+      st_stallAndWaitRequest;
+  }
+
+  // Region-dir requests from U: allocate TBE, start the L3/memory fetch,
+  // record the expected ack count, check for immediate completion, and
+  // ack the region directory.
+  transition(U, {RdBlkS}, BS_PM) {L3TagArrayRead} {
+    t_allocateTBE;
+    l_queueMemRdReq;
+    sa_setAcks;
+    o_checkForCompletion;
+    ra_ackRegionDir;
+    prd_popRegionQueue;
+  }
+
+  // WriteThrough additionally acks the writeback up front.
+  transition(U, WriteThrough, BM_PM){L3TagArrayRead} {
+    t_allocateTBE;
+    w_sendResponseWBAck;
+    l_queueMemRdReq;
+    sa_setAcks;
+    o_checkForCompletion;
+    ra_ackRegionDir;
+    prd_popRegionQueue;
+  }
+
+  transition(U, {RdBlkM,Atomic}, BM_PM){L3TagArrayRead} {
+    t_allocateTBE;
+    l_queueMemRdReq;
+    sa_setAcks;
+    o_checkForCompletion;
+    ra_ackRegionDir;
+    prd_popRegionQueue;
+  }
+
+  transition(U, RdBlk, B_PM){L3TagArrayRead} {
+    t_allocateTBE;
+    l_queueMemRdReq;
+    sa_setAcks;
+    o_checkForCompletion;
+    ra_ackRegionDir;
+    prd_popRegionQueue;
+  }
+
+  // Private-path (P-suffixed) requests from U: no ack bookkeeping with the
+  // region dir; just allocate the TBE and fetch.
+  transition(U, {RdBlkSP}, BS_M) {L3TagArrayRead} {
+    tp_allocateTBEP;
+    lrp_queueMemRdReqP;
+    p_popRequestQueue;
+  }
+
+  transition(U, WriteThroughP, BM_M) {L3TagArrayRead} {
+    tp_allocateTBEP;
+    wp_sendResponseWBAckP;
+    lrp_queueMemRdReqP;
+    p_popRequestQueue;
+  }
+
+  transition(U, {RdBlkMP,AtomicP}, BM_M) {L3TagArrayRead} {
+    tp_allocateTBEP;
+    lrp_queueMemRdReqP;
+    p_popRequestQueue;
+  }
+
+  transition(U, RdBlkP, B_M) {L3TagArrayRead} {
+    tp_allocateTBEP;
+    lrp_queueMemRdReqP;
+    p_popRequestQueue;
+  }
+
+  // Private victim writebacks from U: ack and wait for the data (BL).
+  transition(U, VicDirtyP, BL) {L3TagArrayRead} {
+    tp_allocateTBEP;
+    wp_sendResponseWBAckP;
+    p_popRequestQueue;
+  }
+
+  transition(U, VicCleanP, BL) {L3TagArrayRead} {
+    tp_allocateTBEP;
+    wp_sendResponseWBAckP;
+    p_popRequestQueue;
+  }
+
+  // Bypass responses: while waiting for probe acks (B*_Pm / BP), a private
+  // read can be satisfied directly from the TBE data. RdBlkSP gets Shared.
+  transition(BM_Pm, RdBlkSP, BM_Pm_B) {L3DataArrayWrite} {
+    sb_sendResponseSBypass;
+    p_popRequestQueue;
+  }
+
+  transition(BS_Pm, RdBlkSP, BS_Pm_B) {L3DataArrayWrite} {
+    sb_sendResponseSBypass;
+    p_popRequestQueue;
+  }
+
+  transition(B_Pm, RdBlkSP, B_Pm_B) {L3DataArrayWrite} {
+    sb_sendResponseSBypass;
+    p_popRequestQueue;
+  }
+
+  transition(BP, RdBlkSP, BP_B) {L3DataArrayWrite} {
+    sb_sendResponseSBypass;
+    p_popRequestQueue;
+  }
+
+  // RdBlkMP gets Modified.
+  transition(BM_Pm, RdBlkMP, BM_Pm_B) {L3DataArrayWrite} {
+    mb_sendResponseMBypass;
+    p_popRequestQueue;
+  }
+
+  transition(BS_Pm, RdBlkMP, BS_Pm_B) {L3DataArrayWrite} {
+    mb_sendResponseMBypass;
+    p_popRequestQueue;
+  }
+
+  transition(B_Pm, RdBlkMP, B_Pm_B) {L3DataArrayWrite} {
+    mb_sendResponseMBypass;
+    p_popRequestQueue;
+  }
+
+  transition(BP, RdBlkMP, BP_B) {L3DataArrayWrite} {
+    mb_sendResponseMBypass;
+    p_popRequestQueue;
+  }
+
+  // Private write-through/atomic: apply the write to the directory data,
+  // then send the WT/atomic response.
+  transition(BM_Pm, {WriteThroughP,AtomicP}, BM_Pm_B) {L3DataArrayWrite} {
+    wdp_writeBackDataPrivate;
+    mbwt_sendResponseWriteThroughBypass;
+    p_popRequestQueue;
+  }
+
+  transition(BS_Pm, {WriteThroughP,AtomicP}, BS_Pm_B) {L3DataArrayWrite} {
+    wdp_writeBackDataPrivate;
+    mbwt_sendResponseWriteThroughBypass;
+    p_popRequestQueue;
+  }
+
+  transition(B_Pm, {WriteThroughP,AtomicP}, B_Pm_B) {L3DataArrayWrite} {
+    wdp_writeBackDataPrivate;
+    mbwt_sendResponseWriteThroughBypass;
+    p_popRequestQueue;
+  }
+
+  transition(BP, {WriteThroughP,AtomicP}, BP_B) {L3DataArrayWrite} {
+    wdp_writeBackDataPrivate;
+    mbwt_sendResponseWriteThroughBypass;
+    p_popRequestQueue;
+  }
+
+  // Plain RdBlkP gets Exclusive-or-Shared depending on caching state.
+  transition(BM_Pm, RdBlkP, BM_Pm_B) {L3DataArrayWrite} {
+    esb_sendResponseESBypass;
+    p_popRequestQueue;
+  }
+
+  transition(BS_Pm, RdBlkP, BS_Pm_B) {L3DataArrayWrite} {
+    esb_sendResponseESBypass;
+    p_popRequestQueue;
+  }
+
+  transition(B_Pm, RdBlkP, B_Pm_B) {L3DataArrayWrite}{
+    esb_sendResponseESBypass;
+    p_popRequestQueue;
+  }
+
+  transition(BP, RdBlkP, BP_B) {L3DataArrayWrite}{
+    esb_sendResponseESBypass;
+    p_popRequestQueue;
+  }
+
+  // Requester finished with a bypassed response: leave the *_B holding state
+  // and return to the underlying pending state.  CoreUnblock arrives on the
+  // unblock network; UnblockWriteThrough is an internally-generated trigger.
+  transition(BM_Pm_B, CoreUnblock, BM_Pm) {
+    wa_wakeUpDependents;
+    pu_popUnblockQueue;
+  }
+
+  transition(BS_Pm_B, CoreUnblock, BS_Pm) {
+    wa_wakeUpDependents;
+    pu_popUnblockQueue;
+  }
+
+  transition(B_Pm_B, CoreUnblock, B_Pm) {
+    wa_wakeUpDependents;
+    pu_popUnblockQueue;
+  }
+
+  transition(BP_B, CoreUnblock, BP) {
+    wa_wakeUpDependents;
+    pu_popUnblockQueue;
+  }
+
+  transition(BM_Pm_B, UnblockWriteThrough, BM_Pm) {
+    wa_wakeUpDependents;
+    pt_popTriggerQueue;
+  }
+
+  transition(BS_Pm_B, UnblockWriteThrough, BS_Pm) {
+    wa_wakeUpDependents;
+    pt_popTriggerQueue;
+  }
+
+  transition(B_Pm_B, UnblockWriteThrough, B_Pm) {
+    wa_wakeUpDependents;
+    pt_popTriggerQueue;
+  }
+
+  transition(BP_B, UnblockWriteThrough, BP) {
+    wa_wakeUpDependents;
+    pt_popTriggerQueue;
+  }
+
+  // Private victim writebacks (VicDirtyP/VicCleanP) while a transaction is
+  // pending: ack the writeback and move to the *_BL state to wait for the
+  // victim's data (CPUData).
+  transition(BM_Pm, VicDirtyP, BM_Pm_BL) {
+    wp_sendResponseWBAckP;
+    p_popRequestQueue;
+  }
+
+  transition(BS_Pm, VicDirtyP, BS_Pm_BL) {
+    wp_sendResponseWBAckP;
+    p_popRequestQueue;
+  }
+
+  transition(B_Pm, VicDirtyP, B_Pm_BL) {
+    wp_sendResponseWBAckP;
+    p_popRequestQueue;
+  }
+
+  transition(BP, VicDirtyP, BP_BL) {
+    wp_sendResponseWBAckP;
+    p_popRequestQueue;
+  }
+
+  transition(BM_Pm, VicCleanP, BM_Pm_BL) {
+    wp_sendResponseWBAckP;
+    p_popRequestQueue;
+  }
+
+  transition(BS_Pm, VicCleanP, BS_Pm_BL) {
+    wp_sendResponseWBAckP;
+    p_popRequestQueue;
+  }
+
+  transition(B_Pm, VicCleanP, B_Pm_BL) {
+    wp_sendResponseWBAckP;
+    p_popRequestQueue;
+  }
+
+  transition(BP, VicCleanP, BP_BL) {
+    wp_sendResponseWBAckP;
+    p_popRequestQueue;
+  }
+
+  // Victim data arrived: capture it in the TBE, commit it to memory, and
+  // resume the interrupted transaction.
+  transition(BM_Pm_BL, CPUData, BM_Pm) {
+    yc_writeCPUDataToTBE;
+    d_writeDataToMemory;
+    wa_wakeUpDependents;
+    pr_popResponseQueue;
+  }
+
+  transition(BS_Pm_BL, CPUData, BS_Pm) {
+    yc_writeCPUDataToTBE;
+    d_writeDataToMemory;
+    wa_wakeUpDependents;
+    pr_popResponseQueue;
+  }
+
+  transition(B_Pm_BL, CPUData, B_Pm) {
+    yc_writeCPUDataToTBE;
+    d_writeDataToMemory;
+    wa_wakeUpDependents;
+    pr_popResponseQueue;
+  }
+
+  transition(BP_BL, CPUData, BP) {
+    yc_writeCPUDataToTBE;
+    d_writeDataToMemory;
+    wa_wakeUpDependents;
+    pr_popResponseQueue;
+  }
+
+  // Victims arriving in the short-lived BR/BW/BL states cannot be handled
+  // yet: stall them on their respective queues.
+  transition({BR, BW, BL}, {VicDirtyP, VicCleanP}) {
+    st_stallAndWaitRequest;
+  }
+
+  transition({BR, BW, BL}, {VicDirty, VicClean}) {
+    ww_stallAndWaitRegRequestQueue;
+  }
+
+  // Victim data for a BL writeback completes the transaction back to U.
+  transition(BL, CPUData, U) {L3TagArrayWrite, L3DataArrayWrite} {
+    dt_deallocateTBE;
+    d_writeDataToMemory;
+    al_allocateL3Block;
+    wa_wakeUpDependents;
+    pr_popResponseQueue;
+  }
+
+  // Stale writeback: no data to commit, just tear down the TBE.
+  transition(BL, StaleWB, U) {L3TagArrayWrite} {
+    dt_deallocateTBE;
+    wa_wakeUpAllDependents;
+    pr_popResponseQueue;
+  }
+
+  // Any other busy state: stall region-dir and private victims until done.
+  transition({BI, B, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, BS_Pm_BL, BM_Pm_BL, B_Pm_BL, BP_BL, BS_Pm_B, BM_Pm_B, B_Pm_B, BP_B}, {VicDirty, VicClean}) {
+    ww_stallAndWaitRegRequestQueue;
+  }
+
+  transition({BI, B, BS_M, BM_M, B_M, BS_PM, BM_PM, B_PM, BS_Pm_BL, BM_Pm_BL, B_Pm_BL, BP_BL, BS_Pm_B, BM_Pm_B, B_Pm_B, BP_B}, {VicDirtyP, VicCleanP}) {
+    st_stallAndWaitRequest;
+  }
+
+  // Memory's writeback ack needs no action beyond consuming it.
+  transition({U, BR, BW, BL, BI, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B, BS_Pm_BL, BM_Pm_BL, B_Pm_BL, BP_BL, BS_Pm_B, BM_Pm_B, B_Pm_B, BP_B}, WBAck) {
+    pm_popMemQueue;
+  }
+
+  // Stale victims (block already re-requested elsewhere): drop the ignore
+  // marker and ack without expecting data.
+  transition({U, BR, BW, BL, BI, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B, BS_Pm_BL, BM_Pm_BL, B_Pm_BL, BP_BL, BS_Pm_B, BM_Pm_B, B_Pm_B, BP_B}, StaleVicDirtyP) {
+    rvp_removeVicDirtyIgnore;
+    wp_sendResponseWBAckP;
+    p_popRequestQueue;
+  }
+
+  transition({U, BR, BW, BL, BI, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B, BS_Pm_BL, BM_Pm_BL, B_Pm_BL, BP_BL, BS_Pm_B, BM_Pm_B, B_Pm_B, BP_B}, StaleVicDirty) {
+    rv_removeVicDirtyIgnore;
+    w_sendResponseWBAck;
+    prd_popRegionQueue;
+  }
+
+  // Fresh victim in the idle state: allocate a TBE, ack the region dir and
+  // the writer, and wait for data in BL.
+  transition(U, VicDirty, BL) {L3TagArrayRead} {
+    t_allocateTBE;
+    ra_ackRegionDir;
+    w_sendResponseWBAck;
+    prd_popRegionQueue;
+  }
+
+  transition(U, VicClean, BL) {L3TagArrayRead} {
+    t_allocateTBE;
+    ra_ackRegionDir;
+    w_sendResponseWBAck;
+    prd_popRegionQueue;
+  }
+
+  // Final unblock returns the directory to idle.
+  transition({B, BR}, CoreUnblock, U) {
+    wa_wakeUpDependents;
+    pu_popUnblockQueue;
+  }
+
+  transition({B, BR}, UnblockWriteThrough, U) {
+    wa_wakeUpDependents;
+    pt_popTriggerQueue;
+  }
+
+  // Fill data returned from memory with no probes outstanding (*_M states):
+  // respond with the appropriate grant (S/M/ES), write back / allocate in L3,
+  // and finish the transaction.
+  transition(BS_M, MemData, B) {L3TagArrayWrite, L3DataArrayWrite} {
+    mt_writeMemDataToTBE;
+    s_sendResponseS;
+    wd_writeBackData;
+    alwt_allocateL3BlockOnWT;
+    dt_deallocateTBE;
+    pm_popMemQueue;
+  }
+
+  transition(BM_M, MemData, B){L3TagArrayWrite, L3DataArrayWrite} {
+    mt_writeMemDataToTBE;
+    m_sendResponseM;
+    wd_writeBackData;
+    alwt_allocateL3BlockOnWT;
+    dt_deallocateTBE;
+    pm_popMemQueue;
+  }
+
+  transition(B_M, MemData, B){L3TagArrayWrite, L3DataArrayWrite} {
+    mt_writeMemDataToTBE;
+    es_sendResponseES;
+    wd_writeBackData;
+    alwt_allocateL3BlockOnWT;
+    dt_deallocateTBE;
+    pm_popMemQueue;
+  }
+
+  // Memory data while probes are still outstanding (*_PM states): stash the
+  // data in the TBE and keep waiting for probe responses (*_Pm).
+  transition(BS_PM, MemData, BS_Pm) {} {
+    mt_writeMemDataToTBE;
+    wa_wakeUpDependents;
+    pm_popMemQueue;
+  }
+
+  transition(BM_PM, MemData, BM_Pm){} {
+    mt_writeMemDataToTBE;
+    wa_wakeUpDependents;
+    pm_popMemQueue;
+  }
+
+  transition(B_PM, MemData, B_Pm){} {
+    mt_writeMemDataToTBE;
+    wa_wakeUpDependents;
+    pm_popMemQueue;
+  }
+
+  // Same two cases, but the data was found in the L3 (trigger-delivered hit).
+  transition(BS_M, L3Hit, B) {L3TagArrayWrite, L3DataArrayWrite} {
+    s_sendResponseS;
+    wd_writeBackData;
+    alwt_allocateL3BlockOnWT;
+    dt_deallocateTBE;
+    ptl_popTriggerQueue;
+  }
+
+  transition(BM_M, L3Hit, B) {L3TagArrayWrite, L3DataArrayWrite} {
+    m_sendResponseM;
+    wd_writeBackData;
+    alwt_allocateL3BlockOnWT;
+    dt_deallocateTBE;
+    ptl_popTriggerQueue;
+  }
+
+  transition(B_M, L3Hit, B) {L3TagArrayWrite, L3DataArrayWrite} {
+    es_sendResponseES;
+    wd_writeBackData;
+    alwt_allocateL3BlockOnWT;
+    dt_deallocateTBE;
+    ptl_popTriggerQueue;
+  }
+
+  transition(BS_PM, L3Hit, BS_Pm) {
+    wa_wakeUpDependents;
+    ptl_popTriggerQueue;
+  }
+
+  transition(BM_PM, L3Hit, BM_Pm) {
+    wa_wakeUpDependents;
+    ptl_popTriggerQueue;
+  }
+
+  transition(B_PM, L3Hit, B_Pm) {
+    wa_wakeUpDependents;
+    ptl_popTriggerQueue;
+  }
+
+  // Intermediate probe responses: record data, count down the expected acks,
+  // and check whether the transaction can complete.
+  transition({BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, BP, BI}, CPUPrbResp) {
+    aic_ackInvalidate;
+    y_writeProbeDataToTBE;
+    x_decrementAcks;
+    ont_checkForCompletionNoTrigger;
+    pr_popResponseQueue;
+  }
+
+  // No probes are expected in these states; a response here is a protocol
+  // error, so deadlock rather than mishandle it.
+  transition({B, B_M, BS_M, BM_M}, {CPUPrbResp, LastCPUPrbResp}) {
+    z_stall;
+  }
+
+  transition({BS_Pm_BL, BM_Pm_BL, B_Pm_BL, BP_BL, BS_Pm_B, BM_Pm_B, B_Pm_B, BP_B}, {CPUPrbResp, LastCPUPrbResp}) {
+    // recycling because PrbResponse and data come on the same network
+    yy_recycleResponseQueue;
+  }
+
+  // Probe response with no TBE (idle): writeback/allocate without TBE help.
+  transition(U, {CPUPrbResp, LastCPUPrbResp}) {L3TagArrayRead, L3DataArrayWrite} {
+    aic_ackInvalidate;
+    wdt_writeBackDataInvNoTBE;
+    ali_allocateL3BlockNoTBE;
+    pr_popResponseQueue;
+  }
+
+  transition(BL, {CPUPrbResp, LastCPUPrbResp}) {} {
+    aic_ackInvalidate;
+    y_writeProbeDataToTBE;
+    wdi_writeBackDataInv;
+    ali_allocateL3Block;
+    pr_popResponseQueue;
+  }
+
+  // Last probe response / all acks complete.  Each *_PM state moves to *_M
+  // (still waiting on memory); each *_Pm state already holds the data and can
+  // respond and finish.  LastCPUPrbResp carries the final ack itself, while
+  // ProbeAcksComplete is the internally-triggered equivalent.
+  transition(BS_PM, LastCPUPrbResp, BS_M) {
+    aic_ackInvalidate;
+    y_writeProbeDataToTBE;
+    x_decrementAcks;
+    ont_checkForCompletionNoTrigger;
+    pr_popResponseQueue;
+  }
+
+  transition(BS_PM, ProbeAcksComplete, BS_M) {} {
+    pt_popTriggerQueue;
+  }
+
+  transition(BM_PM, LastCPUPrbResp, BM_M) {
+    aic_ackInvalidate;
+    y_writeProbeDataToTBE;
+    x_decrementAcks;
+    ont_checkForCompletionNoTrigger;
+    pr_popResponseQueue;
+  }
+
+  transition(BM_PM, ProbeAcksComplete, BM_M) {} {
+    pt_popTriggerQueue;
+  }
+
+  transition(B_PM, LastCPUPrbResp, B_M) {
+    aic_ackInvalidate;
+    y_writeProbeDataToTBE;
+    x_decrementAcks;
+    ont_checkForCompletionNoTrigger;
+    pr_popResponseQueue;
+  }
+
+  transition(B_PM, ProbeAcksComplete, B_M){} {
+    pt_popTriggerQueue;
+  }
+
+  transition(BS_Pm, LastCPUPrbResp, B) {
+    aic_ackInvalidate;
+    y_writeProbeDataToTBE;
+    x_decrementAcks;
+    ont_checkForCompletionNoTrigger;
+    s_sendResponseS;
+    wd_writeBackData;
+    alwt_allocateL3BlockOnWT;
+    ali_allocateL3Block;
+    dt_deallocateTBE;
+    pr_popResponseQueue;
+  }
+
+  transition(BS_Pm, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} {
+    s_sendResponseS;
+    wd_writeBackData;
+    alwt_allocateL3BlockOnWT;
+    ali_allocateL3Block;
+    dt_deallocateTBE;
+    pt_popTriggerQueue;
+  }
+
+  transition(BM_Pm, LastCPUPrbResp, B) {
+    aic_ackInvalidate;
+    y_writeProbeDataToTBE;
+    x_decrementAcks;
+    ont_checkForCompletionNoTrigger;
+    m_sendResponseM;
+    wd_writeBackData;
+    alwt_allocateL3BlockOnWT;
+    ali_allocateL3Block;
+    dt_deallocateTBE;
+    pr_popResponseQueue;
+  }
+
+  transition(BM_Pm, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} {
+    m_sendResponseM;
+    wd_writeBackData;
+    alwt_allocateL3BlockOnWT;
+    ali_allocateL3Block;
+    dt_deallocateTBE;
+    pt_popTriggerQueue;
+  }
+
+  transition(B_Pm, LastCPUPrbResp, B) {
+    aic_ackInvalidate;
+    y_writeProbeDataToTBE;
+    x_decrementAcks;
+    ont_checkForCompletionNoTrigger;
+    es_sendResponseES;
+    wd_writeBackData;
+    alwt_allocateL3BlockOnWT;
+    ali_allocateL3Block;
+    dt_deallocateTBE;
+    pr_popResponseQueue;
+  }
+
+  transition(B_Pm, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} {
+    es_sendResponseES;
+    wd_writeBackData;
+    alwt_allocateL3BlockOnWT;
+    ali_allocateL3Block;
+    dt_deallocateTBE;
+    pt_popTriggerQueue;
+  }
+
+  transition(BP, LastCPUPrbResp, B) {
+    aic_ackInvalidate;
+    y_writeProbeDataToTBE;
+    x_decrementAcks;
+    ont_checkForCompletionNoTrigger;
+    c_sendResponseCtoD;
+    wd_writeBackData;
+    alwt_allocateL3BlockOnWT;
+    dt_deallocateTBE;
+    pr_popResponseQueue;
+  }
+
+  // All probe acks collected in BP: grant ownership (CtoD response), write
+  // the collected data back, allocate in L3 on write-through, and finish.
+  // Fix: the original resource list was {L3TagArrayWrite, L3TagArrayWrite} —
+  // an apparent copy-paste duplicate.  This transition writes data
+  // (wd_writeBackData / alwt_allocateL3BlockOnWT) exactly like its
+  // BS_Pm/BM_Pm/B_Pm ProbeAcksComplete siblings, which all declare
+  // {L3DataArrayWrite, L3TagArrayWrite}, so the data-array access is
+  // accounted for here as well.
+  transition(BP, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} {
+    c_sendResponseCtoD;
+    wd_writeBackData;
+    alwt_allocateL3BlockOnWT;
+    dt_deallocateTBE;
+    pt_popTriggerQueue;
+  }
+
+  // Invalidation (BI) completion: all probes answered, write the invalidated
+  // data back, allocate in L3, and release the TBE.
+  transition(BI, LastCPUPrbResp, B) {
+    aic_ackInvalidate;
+    y_writeProbeDataToTBE;
+    x_decrementAcks;
+    ont_checkForCompletionNoTrigger;
+    wa_wakeUpDependents;
+    wdi_writeBackDataInv;
+    ali_allocateL3Block;
+    dt_deallocateTBE;
+    pr_popResponseQueue;
+  }
+
+  transition(BI, ProbeAcksComplete, U) {L3TagArrayWrite, L3DataArrayWrite}{
+    wa_wakeUpDependents;
+    wdi_writeBackDataInv;
+    ali_allocateL3Block;
+    dt_deallocateTBE;
+    pt_popTriggerQueue;
+  }
+
+}
diff --git a/src/mem/protocol/MOESI_AMD_Base-Region-msg.sm b/src/mem/protocol/MOESI_AMD_Base-Region-msg.sm
new file mode 100644
index 000000000..823933e57
--- /dev/null
+++ b/src/mem/protocol/MOESI_AMD_Base-Region-msg.sm
@@ -0,0 +1,291 @@
+/*
+ * Copyright (c) 2010-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Lisa Hsu
+ */
+
+// Request types exchanged between cores, directory (Dir), region directory
+// (R-Dir) and region buffer (R-buf).  The desc strings are documentation
+// metadata only; fixes here: "invalide" -> "invalidate", "occured" ->
+// "occurred", and UpgradeRequest's desc was a copy-paste duplicate of
+// PrivateRequest's.
+enumeration(CoherenceRequestType, desc="Coherence Request Types") {
+  // CPU Request Types ONLY
+  RdBlk,          desc="Read Blk";
+  RdBlkM,         desc="Read Blk Modified";
+  RdBlkS,         desc="Read Blk Shared";
+  VicClean,       desc="L2 clean eviction";
+  VicDirty,       desc="L2 dirty eviction";
+
+  WrCancel,       desc="want to cancel WB to Memory"; // should this be here?
+
+  WBApproval,     desc="WB Approval";
+
+  // Messages between Dir and R-Dir
+  ForceInv,       desc="Send invalidate to the block";
+  ForceDowngrade, desc="Send downgrade to the block";
+  Unblock,        desc="Used to let the dir know a message has been sunk";
+
+  // Messages between R-Dir and R-Buffer
+  PrivateNotify,  desc="Let region buffer know it has private access";
+  SharedNotify,   desc="Let region buffer know it has shared access";
+  WbNotify,       desc="Let region buffer know it saw its wb request";
+  Downgrade,      desc="Force the region buffer to downgrade to shared";
+  // Response to R-Dir (probably should be on a different network, but
+  // I need it to be ordered with respect to requests)
+  InvAck,         desc="Let the R-Dir know when the inv has occurred";
+
+  PrivateRequest, desc="R-buf wants the region in private";
+  UpgradeRequest, desc="R-buf wants to upgrade the region from shared to private";
+  SharedRequest,  desc="R-buf wants the region in shared (could respond with private)";
+  CleanWbRequest, desc="R-buf wants to deallocate clean region";
+
+  NA,             desc="So we don't get segfaults";
+}
+
+// Probe types sent toward caches; the Prb* region variants operate on whole
+// regions rather than single blocks.
+enumeration(ProbeRequestType, desc="Probe Request Types") {
+  PrbDowngrade,    desc="Probe for Status"; // EtoS, MtoO, StoS
+  PrbInv,          desc="Probe to Invalidate";
+
+  // For regions
+  PrbRepl,         desc="Force the cache to do a replacement";
+  PrbRegDowngrade, desc="Probe for Status"; // EtoS, MtoO, StoS
+}
+
+
+// Response types: NB* come from the northbridge (directory), TD* from the
+// TCC directory, the rest from CPUs/memory, plus the region handshake acks.
+enumeration(CoherenceResponseType, desc="Coherence Response Types") {
+  NBSysResp,       desc="Northbridge response to CPU Rd request";
+  NBSysWBAck,      desc="Northbridge response ok to WB";
+  TDSysResp,       desc="TCCdirectory response to CPU Rd request";
+  TDSysWBAck,      desc="TCCdirectory response ok to WB";
+  TDSysWBNack,     desc="TCCdirectory response ok to drop";
+  CPUPrbResp,      desc="CPU Probe Response";
+  CPUData,         desc="CPU Data";
+  StaleNotif,      desc="Notification of Stale WBAck, No data to writeback";
+  CPUCancelWB,     desc="want to cancel WB to Memory";
+  MemData,         desc="Data from Memory";
+
+  // for regions
+  PrivateAck,      desc="Ack that r-buf received private notify";
+  RegionWbAck,     desc="Writeback Ack that r-buf completed deallocation";
+  DirReadyAck,     desc="Directory (mem ctrl)<->region dir handshake";
+}
+
+// MOESI-style stable state carried in responses to tell the requester what
+// state the returned data should be installed in.
+enumeration(CoherenceState, default="CoherenceState_NA", desc="Coherence State") {
+  Modified,             desc="Modified";
+  Owned,                desc="Owned state";
+  Exclusive,            desc="Exclusive";
+  Shared,               desc="Shared";
+  NA,                   desc="NA";
+}
+
+// Request message from a core/cache toward the directory; also carries the
+// region-protocol bookkeeping (Sharers, Private, OriginalType, ...) when the
+// request is forwarded through a region buffer.
+structure(CPURequestMsg, desc="...", interface="Message") {
+  Addr addr,             desc="Physical address for this request";
+  Addr DemandAddress,       desc="Physical block address for this request";
+  CoherenceRequestType Type,            desc="Type of request";
+  DataBlock DataBlk,          desc="data for the cache line";  // only for WB
+  bool Dirty,                   desc="whether WB data is dirty";  // only for WB
+  MachineID Requestor,            desc="Node who initiated the request";
+  NetDest Destination,             desc="Multicast destination mask";
+  bool Shared,                  desc="For CPU_WrVicBlk, vic is O not M.  For CPU_ClVicBlk, vic is S";
+  MessageSizeType MessageSize,     desc="size category of the message";
+  Cycles InitialRequestTime, default="0", desc="time the initial requests was sent from the L1Cache";
+  Cycles ForwardRequestTime, default="0", desc="time the dir forwarded the request";
+  Cycles ProbeRequestStartTime, default="0", desc="the time the dir started the probe request";
+  bool DemandRequest, default="false", desc="For profiling purposes";
+
+  NetDest Sharers,              desc="Caches that may have a valid copy of the data";
+  bool ForceShared,             desc="R-dir knows it is shared, pass on so it sends an S copy, not E";
+  bool Private, default="false", desc="Requestor already has private permissions, no need for dir check";
+  bool CtoDSinked, default="false", desc="This is true if the CtoD previously sent must have been sunk";
+
+  bool NoAckNeeded, default="false", desc="True if region buffer doesn't need to ack";
+  int Acks, default="0", desc="Acks that the dir (mem ctrl) should expect to receive";
+  CoherenceRequestType OriginalType, default="CoherenceRequestType_NA",  desc="Type of request from core fwded through region buffer";
+
+  bool functionalRead(Packet *pkt) {
+    // Only dirty-victim writebacks (VicDirty) carry a data block that a
+    // functional read may consume.
+    // NOTE(review): VicClean is not checked here even though it is also an
+    // eviction — confirm clean victims never carry data a functional read
+    // would need.
+    if (Type == CoherenceRequestType:VicDirty) {
+      return testAndRead(addr, DataBlk, pkt);
+    }
+
+    return false;
+  }
+
+  bool functionalWrite(Packet *pkt) {
+    // No check on message type required since the protocol should
+    // read data from those messages that contain the block
+    return testAndWrite(addr, DataBlk, pkt);
+  }
+}
+
+// Probe request issued by the northbridge (directory) toward caches.  Probes
+// never carry cache-line data, so functional accesses are no-ops.
+structure(NBProbeRequestMsg, desc="...", interface="Message") {
+  Addr addr,              desc="Physical address for this request";
+  ProbeRequestType Type,             desc="probe signal";
+  bool ReturnData,              desc="Indicates CPU should return data";
+  NetDest Destination,             desc="Node to whom the data is sent";
+  MessageSizeType MessageSize,     desc="size category of the message";
+  bool DemandRequest, default="false", desc="demand request, requesting 3-hop transfer";
+  Addr DemandAddress,        desc="Demand block address for a region request";
+  MachineID Requestor,          desc="Requestor id for 3-hop requests";
+  bool NoAckNeeded, default="false", desc="For short circuting acks";
+
+  bool functionalRead(Packet *pkt) {
+    return false;
+  }
+
+  bool functionalWrite(Packet *pkt) {
+    // No check on message type required since the protocol should
+    // read data from those messages that contain the block
+    return false;
+  }
+
+}
+
+// Probe request issued by the TCC directory (GPU side).  Carries extra
+// validity/eviction flags used by the TCC subtree; no cache-line data, so
+// functional accesses are no-ops.
+structure(TDProbeRequestMsg, desc="...", interface="Message") {
+  Addr addr,              desc="Physical address for this request";
+  ProbeRequestType Type,  desc="TD_PrbNxtState signal";
+  bool ReturnData,        desc="Indicates CPU should return data";
+  bool localCtoD,         desc="Indicates CtoD is within the GPU hierarchy (aka TCC subtree)";
+  NetDest Destination,    desc="Node to whom the data is sent";
+  MessageSizeType MessageSize, desc="size category of the message";
+  MachineID Sender,       desc="Node who sent the data";
+  bool currentOwner, default="false", desc="Is the sender the current owner";
+  bool DoneAck, default="false", desc="Is this a done ack?";
+  bool Dirty, default="false", desc="Was block dirty when evicted";
+  bool wasValid, default="false", desc="Was block valid when evicted";
+  bool valid, default="false", desc="Is block valid";
+  bool validToInvalid, default="false", desc="Was block valid when evicted";
+
+  bool functionalRead(Packet *pkt) {
+    return false;
+  }
+
+  bool functionalWrite(Packet *pkt) {
+    // No check on message type required since the protocol should
+    // read data from those messages that contain the block
+    return false;
+  }
+}
+
+// Response Messages seemed to be easily munged into one type
+structure(ResponseMsg, desc="...", interface="Message") {
+ Addr addr, desc="Physical address for this request";
+ CoherenceResponseType Type, desc="NB Sys Resp or CPU Response to Probe";
+ MachineID Sender, desc="Node who sent the data";
+ NetDest Destination, desc="Node to whom the data is sent";
+ // Begin Used Only By CPU Response
+ DataBlock DataBlk, desc="data for the cache line";
+ bool Hit, desc="probe hit valid line";
+ bool Shared, desc="True if S, or if NB Probe ReturnData==1 && O";
+ bool Dirty, desc="Is the data dirty (different than memory)?";
+ bool Ntsl, desc="indicates probed lin will be invalid after probe";
+ bool UntransferredOwner, desc="pending confirmation of ownership change";
+ // End Used Only By CPU Response
+
+ // Begin NB Response Only
+ CoherenceState State, default=CoherenceState_NA, desc="What returned data from NB should be in";
+ bool CtoD, desc="was the originator a CtoD?";
+ // End NB Response Only
+
+ bool NbReqShared, desc="modification of Shared field from initial request, e.g. hit by shared probe";
+
+ MessageSizeType MessageSize, desc="size category of the message";
+ Cycles InitialRequestTime, default="0", desc="time the initial requests was sent from the L1Cache";
+ Cycles ForwardRequestTime, default="0", desc="time the dir forwarded the request";
+ Cycles ProbeRequestStartTime, default="0", desc="the time the dir started the probe request";
+ bool DemandRequest, default="false", desc="For profiling purposes";
+
+ bool L3Hit, default="false", desc="Did memory or L3 supply the data?";
+ MachineID OriginalResponder, desc="Mach which wrote the data to the L3";
+
+ bool NotCached, default="false", desc="True when the Region buffer has already evicted the line";
+
+ bool NoAckNeeded, default="false", desc="For short circuting acks";
+ bool isValid, default="false", desc="Is acked block valid";
+
+ bool functionalRead(Packet *pkt) {
+ // Only PUTX messages contains the data block
+ if (Type == CoherenceResponseType:CPUData ||
+ Type == CoherenceResponseType:MemData) {
+ return testAndRead(addr, DataBlk, pkt);
+ }
+
+ return false;
+ }
+
+ bool functionalWrite(Packet *pkt) {
+ // No check on message type required since the protocol should
+ // read data from those messages that contain the block
+ return testAndWrite(addr, DataBlk, pkt);
+ }
+}
+
+// Minimal unblock notification sent by a requester back to the directory
+// once it has installed a response.
+structure(UnblockMsg, desc="...", interface="Message") {
+  Addr addr,              desc="Physical address for this request";
+  NetDest Destination,          desc="Destination (always directory)";
+  MessageSizeType MessageSize, desc="size category of the message";
+}
+
+// Internal trigger events a controller enqueues to itself (never sent over
+// the interconnect).
+enumeration(TriggerType, desc="Trigger Type") {
+  L2_to_L1,             desc="L2 to L1 fill";
+  AcksComplete,         desc="NB received all needed Acks";
+
+  // For regions
+  InvNext,              desc="Invalidate the next block";
+  PrivateAck,           desc="Loopback ack for machines with no Region Buffer";
+  AllOutstanding,       desc="All outstanding requests have finished";
+  L3Hit,                desc="L3 hit in dir";
+
+  // For region directory once the directory is blocked
+  InvRegion,            desc="Invalidate region";
+  DowngradeRegion,      desc="downgrade region";
+}
+
+// Identifies which L1 within a core pair a trigger targets.
+enumeration(CacheId, desc="Which Cache in the Core") {
+  L1I,          desc="L1 I-cache";
+  L1D0,         desc="L1 D-cache cluster 0";
+  L1D1,         desc="L1 D-cache cluster 1";
+  NA,           desc="Default";
+}
+
+// Message carried on the local trigger queue; carries no cache-line data,
+// so functional accesses are no-ops.
+structure(TriggerMsg, desc="...", interface="Message") {
+  Addr addr,              desc="Address";
+  TriggerType Type,             desc="Type of trigger";
+  CacheId Dest, default="CacheId_NA", desc="Cache to invalidate";
+
+  bool functionalRead(Packet *pkt) {
+    return false;
+  }
+
+  bool functionalWrite(Packet *pkt) {
+    // No check on message type required since the protocol should
+    // read data from those messages that contain the block
+    return false;
+  }
+
+}
diff --git a/src/mem/protocol/MOESI_AMD_Base-RegionBuffer.sm b/src/mem/protocol/MOESI_AMD_Base-RegionBuffer.sm
new file mode 100644
index 000000000..89f7d6fcb
--- /dev/null
+++ b/src/mem/protocol/MOESI_AMD_Base-RegionBuffer.sm
@@ -0,0 +1,1368 @@
+/*
+ * Copyright (c) 2010-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Jason Power
+ */
+
+machine(MachineType:RegionBuffer, "Region Buffer for AMD_Base-like protocol")
+: CacheMemory *cacheMemory; // stores only region addresses. Must set block size same as below
+ bool isOnCPU;
+ int blocksPerRegion := 64; // 4k regions
+ Cycles toDirLatency := 5; // Latency to fwd requests to directory
+ Cycles toRegionDirLatency := 5; // Latency for requests and acks to directory
+ Cycles nextEvictLatency := 1; // latency added between each block while evicting region
+ bool noTCCdir := "False";
+ int TCC_select_num_bits := 1;
+
+ // From the Cores
+ MessageBuffer * requestFromCore, network="From", virtual_network="0", vnet_type="request";
+ MessageBuffer * responseFromCore, network="From", virtual_network="2", vnet_type="response";
+
+ // Requests to the cores or directory
+ MessageBuffer * requestToNetwork, network="To", virtual_network="0", vnet_type="request";
+
+ // From Region-Dir
+ MessageBuffer * notifyFromRegionDir, network="From", virtual_network="7", vnet_type="request";
+ MessageBuffer * probeFromRegionDir, network="From", virtual_network="8", vnet_type="request";
+
+ // From the directory
+ MessageBuffer * unblockFromDir, network="From", virtual_network="4", vnet_type="unblock";
+
+ // To the region-Dir
+ MessageBuffer * responseToRegDir, network="To", virtual_network="2", vnet_type="response";
+
+ MessageBuffer * triggerQueue;
+{
+
+ // States
+  // Region states.  Stable: NP (not present), P (private), S (shared).
+  // The compound names encode "from_to" transients, with _O variants waiting
+  // for outstanding per-block requests to drain and _W variants waiting for
+  // a writeback ack.
+  state_declaration(State, desc="Region states", default="RegionBuffer_State_NP") {
+    NP, AccessPermission:Invalid,       desc="Not present in region directory";
+    P,  AccessPermission:Invalid,       desc="Region is private to the cache";
+    S,  AccessPermission:Invalid,       desc="Region is possibly shared with others";
+
+    NP_PS, AccessPermission:Invalid,    desc="Intermediate state waiting for notify from r-dir";
+    S_P, AccessPermission:Invalid,      desc="Intermediate state while upgrading region";
+
+    P_NP, AccessPermission:Invalid,     desc="Intermediate state while evicting all lines in region";
+    P_S, AccessPermission:Invalid,      desc="Intermediate state while downgrading all lines in region";
+
+    S_NP_PS, AccessPermission:Invalid,  desc="Got an inv in S_P, waiting for all inv acks, then going to since the write is already out there NP_PS";
+    P_NP_NP, AccessPermission:Invalid,  desc="Evicting region on repl, then got an inv. Need to re-evict";
+
+    P_NP_O, AccessPermission:Invalid,   desc="Waiting for all outstanding requests";
+    P_S_O, AccessPermission:Invalid,    desc="Waiting for all outstanding requests";
+    S_O, AccessPermission:Invalid,      desc="Waiting for all outstanding requests";
+    S_NP_PS_O, AccessPermission:Invalid, desc="Waiting for all outstanding requests";
+
+    SS_P, AccessPermission:Invalid, desc="Waiting for CPU write that we know is there";
+
+    P_NP_W, AccessPermission:Invalid,   desc="Waiting for writeback ack";
+
+    NP_W, AccessPermission:Invalid,   desc="Got a done ack before request, waiting for that victim";
+  }
+
+  // Events driving the region buffer: core accesses, notifications from the
+  // region directory, acks from cores, and internal triggers for the
+  // block-by-block region eviction loop.
+  enumeration(Event, desc="Region directory events") {
+    CPURead,            desc="Access from CPU core";
+    CPUWrite,           desc="Access from CPU core";
+    CPUWriteback,           desc="Writeback request from CPU core";
+
+    ReplRegion,         desc="Start a replace on a region";
+
+    PrivateNotify,      desc="Update entry to private state";
+    SharedNotify,       desc="Update entry to shared state";
+    WbNotify,           desc="Writeback notification received";
+    InvRegion,          desc="Start invalidating a region";
+    DowngradeRegion,    desc="Start invalidating a region";
+
+    InvAck,             desc="Ack from core";
+
+    DoneAck,            desc="Ack from core that request has finished";
+    AllOutstanding,     desc="All outstanding requests have now finished";
+
+    Evict,              desc="Loopback to evict each block";
+    LastAck_PrbResp,  desc="Done eviciting all the blocks, got the last ack from core, now respond to region dir";
+    LastAck_CleanWb,  desc="Done eviciting all the blocks, got the last ack from core, now start clean writeback (note the dir has already been updated)";
+
+    StallAccess,        desc="Wait for the done ack on the address before proceeding";
+    StallDoneAck,       desc="Wait for the access on the address before proceeding";
+
+    StaleRequest,       desc="Got a stale victim from the cache, fwd it without incrementing outstanding";
+  }
+
+  // Resource-usage categories reported to recordStats for this controller.
+  enumeration(RequestType, desc="To communicate stats from transitions to recordStats") {
+    TagArrayRead,    desc="Read the data array";
+    TagArrayWrite,   desc="Write the data array";
+  }
+
+  // External C++ bit-vector helper used for per-block valid/outstanding maps.
+  structure(BoolVec, external="yes") {
+    bool at(int);
+    void resize(int);
+    void clear();
+    int size();
+  }
+
+  // Per-region cache entry: one bit per block of state (valid, ever-used,
+  // outstanding) plus region-level permission bookkeeping.  DataBlk exists
+  // only to satisfy the AbstractCacheEntry interface.
+  structure(Entry, desc="Region entry", interface="AbstractCacheEntry") {
+    Addr addr,        desc="Base address of this region";
+    State RegionState,  desc="Region state";
+    DataBlock DataBlk,  desc="Data for the block (always empty in region buffer)";
+    BoolVec ValidBlocks, desc="A vector to keep track of valid blocks";
+    int NumValidBlocks, desc="Number of trues in ValidBlocks to avoid iterating";
+    BoolVec UsedBlocks, desc="A vector to keep track of blocks ever valid";
+    bool dirty,           desc="Dirty as best known by the region buffer";
+    // This is needed so we don't ack an invalidate until all requests are ordered
+    int NumOutstandingReqs,    desc="Total outstanding private/shared requests";
+    BoolVec OutstandingReqs,   desc="Blocks that have outstanding private/shared requests";
+    bool MustDowngrade,     desc="Set when we got a downgrade before the shd or pvt permissions";
+    Cycles ProbeRequestTime, default="Cycles(0)", desc="Time region dir started the probe";
+    Cycles InitialRequestTime, default="Cycles(0)", desc="Time message was sent to region dir";
+    bool MsgSentToDir,      desc="True if the current request required a message to the dir";
+    bool clearOnDone, default="false", desc="clear valid bit when request completes";
+    Addr clearOnDoneAddr, desc="clear valid bit when request completes";
+  }
+
+  // Transient request state for an in-flight region operation; the New*
+  // fields buffer a second probe that arrives while one is being serviced.
+  structure(TBE, desc="...") {
+    State TBEState,                     desc="Transient state";
+    //int NumValidBlocks,                 desc="Number of blocks valid so we don't have to count a BoolVec";
+    BoolVec ValidBlocks,                desc="A vector to keep track of valid blocks";
+    bool AllAcksReceived,               desc="Got all necessary acks from dir";
+    bool DoneEvicting,                  desc="Done iterating through blocks checking for valids";
+    BoolVec AcksReceived,               desc="Received acks for theses blocks\n";
+    bool SendAck,                       desc="If true, send an ack to the r-dir at end of inv";
+    ProbeRequestType MsgType,           desc="Type of message to send while 'evicting' ";
+    int NumOutstandingReqs,    desc="Total outstanding private/shared requests";
+    BoolVec OutstandingReqs,   desc="Blocks that have outstanding private/shared requests";
+    MachineID Requestor,        desc="Requestor for three hop transactions";
+    bool DemandRequest, default="false", desc="Associated with a demand request";
+    Addr DemandAddress,        desc="Address for the demand request";
+    bool DoneAckReceived, default="false", desc="True if the done ack arrived before the message";
+    Addr DoneAckAddr,     desc="Address of the done ack received early";
+    int OutstandingThreshold, desc="Number of outstanding requests to trigger AllOutstanding on";
+
+    ProbeRequestType NewMsgType, desc="Type of message to send while 'evicting' ";
+    MachineID NewRequestor,        desc="Requestor for three hop transactions";
+    bool NewDemandRequest, default="false", desc="Associated with a demand request";
+    Addr NewDemandAddress,        desc="Address for the demand request";
+    bool dirty, desc="dirty";
+    bool AllOutstandingTriggered, default="false", desc="bit for only one all outstanding";
+    int OutstandingAcks, default="0", desc="number of acks to wait for";
+  }
+
+  // External C++ TBE table keyed by (region) address.
+  structure(TBETable, external="yes") {
+    TBE lookup(Addr);
+    void allocate(Addr);
+    void deallocate(Addr);
+    bool isPresent(Addr);
+  }
+
+ // Stores only region addresses
+ TBETable TBEs, template="<RegionBuffer_TBE>", constructor="m_number_of_TBEs";
+ int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()";
+
+ Tick clockEdge();
+ Tick cyclesToTicks(Cycles c);
+
+ void set_cache_entry(AbstractCacheEntry b);
+ void unset_cache_entry();
+ void set_tbe(TBE b);
+ void unset_tbe();
+ void wakeUpAllBuffers();
+ void wakeUpBuffers(Addr a);
+ Cycles curCycle();
+
+ int blockBits, default="RubySystem::getBlockSizeBits()";
+ int blockBytes, default="RubySystem::getBlockSizeBytes()";
+ int regionBits, default="log2(m_blocksPerRegion)";
+
+ // Functions
+
+ // Index of addr's block within its region (always 0 when a region holds
+ // a single block).
+ int getRegionOffset(Addr addr) {
+ if (blocksPerRegion < 2) {
+ return 0;
+ }
+ Addr offset := bitSelect(addr, blockBits, regionBits+blockBits-1);
+ int ret := addressToInt(offset);
+ assert(ret < blocksPerRegion);
+ return ret;
+ }
+
+ // Mask off the offset-within-region bits to get the region's base address.
+ Addr getRegionBase(Addr addr) {
+ return maskLowOrderBits(addr, blockBits+regionBits);
+ }
+
+ // Address of the block immediately following addr (stride of one block).
+ Addr getNextBlock(Addr addr) {
+ Addr a := addr;
+ return makeNextStrideAddress(a, 1);
+ }
+
+ // The cache-side controller this buffer fronts: the core pair on the CPU
+ // side; on the GPU side either the address-interleaved TCC banks (when no
+ // TCC directory exists) or the single TCC directory.
+ MachineID getPeer(MachineID mach, Addr address) {
+ if (isOnCPU) {
+ return createMachineID(MachineType:CorePair, intToID(0));
+ } else if (noTCCdir) {
+ return mapAddressToRange(address,MachineType:TCC,
+ TCC_select_low_bit, TCC_select_num_bits);
+ } else {
+ return createMachineID(MachineType:TCCdir, intToID(0));
+ }
+ }
+
+ // True if the block at addr has a private/shared request in flight.
+ // The TBE's copy of the vector takes precedence over the cache entry's.
+ bool isOutstanding(TBE tbe, Entry cache_entry, Addr addr) {
+ if (is_valid(tbe) && tbe.OutstandingReqs.size() > 0) {
+ DPRINTF(RubySlicc, " outstanding tbe reqs %s %s %d %d\n",
+ tbe.OutstandingReqs, addr, getRegionOffset(addr),
+ tbe.OutstandingReqs.at(getRegionOffset(addr)));
+ return tbe.OutstandingReqs.at(getRegionOffset(addr));
+ } else if (is_valid(cache_entry)) {
+ DPRINTF(RubySlicc, " outstanding cache reqs %s %s %d %d\n",
+ cache_entry.OutstandingReqs, addr, getRegionOffset(addr),
+ cache_entry.OutstandingReqs.at(getRegionOffset(addr)));
+ return cache_entry.OutstandingReqs.at(getRegionOffset(addr));
+ } else {
+ return false;
+ }
+ }
+
+ // This buffer sits on the GPU side exactly when it is not on the CPU side.
+ bool isOnGPU() {
+ if (isOnCPU == false) {
+ return true;
+ }
+ return false;
+ }
+
+ // Requests classified as reads. NOTE(review): VicClean is included —
+ // presumably because a clean victim carries no new data; confirm against
+ // the callers of this helper.
+ bool isRead(CoherenceRequestType type) {
+ return (type == CoherenceRequestType:RdBlk || type == CoherenceRequestType:RdBlkS ||
+ type == CoherenceRequestType:VicClean);
+ }
+
+ // True when the region is already tracked, or when a slot could be
+ // allocated for it without first evicting another region.
+ bool presentOrAvail(Addr addr) {
+ Addr base := getRegionBase(addr);
+ if (cacheMemory.isTagPresent(base)) {
+ return true;
+ }
+ return cacheMemory.cacheAvail(base);
+ }
+
+ // Returns a region entry!
+ Entry getCacheEntry(Addr addr), return_by_pointer="yes" {
+ return static_cast(Entry, "pointer", cacheMemory.lookup(getRegionBase(addr)));
+ }
+
+ // TBEs are keyed by region base address, so any block address works here.
+ TBE getTBE(Addr addr), return_by_pointer="yes" {
+ return TBEs.lookup(getRegionBase(addr));
+ }
+
+ // NOTE(review): getCacheEntry already region-aligns its argument, so the
+ // extra getRegionBase here is redundant (harmless: the mask is idempotent).
+ DataBlock getDataBlock(Addr addr), return_by_ref="yes" {
+ return getCacheEntry(getRegionBase(addr)).DataBlk;
+ }
+
+ // Transient (TBE) state wins over the stable cache-entry state; a region
+ // tracked by neither is Not Present.
+ State getState(TBE tbe, Entry cache_entry, Addr addr) {
+ if (is_valid(tbe)) {
+ return tbe.TBEState;
+ } else if (is_valid(cache_entry)) {
+ return cache_entry.RegionState;
+ }
+ return State:NP;
+ }
+
+ // Mirror the new state into both the TBE and the cache entry when present.
+ void setState(TBE tbe, Entry cache_entry, Addr addr, State state) {
+ if (is_valid(tbe)) {
+ tbe.TBEState := state;
+ }
+ if (is_valid(cache_entry)) {
+ cache_entry.RegionState := state;
+ }
+ }
+
+ // Permission derives from whichever state is live, TBE first.
+ AccessPermission getAccessPermission(Addr addr) {
+ TBE tbe := getTBE(addr);
+ if(is_valid(tbe)) {
+ return RegionBuffer_State_to_permission(tbe.TBEState);
+ }
+ Entry cache_entry := getCacheEntry(addr);
+ if(is_valid(cache_entry)) {
+ return RegionBuffer_State_to_permission(cache_entry.RegionState);
+ }
+ return AccessPermission:NotPresent;
+ }
+
+ // The region buffer holds no data blocks, so functional accesses are
+ // forwarded straight to backing memory.
+ void functionalRead(Addr addr, Packet *pkt) {
+ functionalMemoryRead(pkt);
+ }
+
+ // Returns the number of locations written (0 or 1).
+ int functionalWrite(Addr addr, Packet *pkt) {
+ if (functionalMemoryWrite(pkt)) {
+ return 1;
+ } else {
+ return 0;
+ }
+ }
+
+ void setAccessPermission(Entry cache_entry, Addr addr, State state) {
+ if (is_valid(cache_entry)) {
+ cache_entry.changePermission(RegionBuffer_State_to_permission(state));
+ }
+ }
+
+ // Only tag-array accesses are profiled; this buffer has no data array.
+ void recordRequestType(RequestType stat, Addr addr) {
+ if (stat == RequestType:TagArrayRead) {
+ cacheMemory.recordRequestType(CacheRequestType:TagArrayRead, addr);
+ } else if (stat == RequestType:TagArrayWrite) {
+ cacheMemory.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+ }
+ }
+
+ // Both request flavors contend for the same tag array, so they share one
+ // availability check.
+ bool checkResourceAvailable(RequestType request_type, Addr addr) {
+ if (request_type == RequestType:TagArrayRead ||
+ request_type == RequestType:TagArrayWrite) {
+ return cacheMemory.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ }
+ error("Invalid RequestType type in checkResourceAvailable");
+ return true;
+ }
+
+ out_port(triggerQueue_out, TriggerMsg, triggerQueue);
+
+ // Overloaded outgoing request network for both probes to cores and requests
+ // to the directory.
+ // Fix Me: These forwarded requests need to be on a separate virtual channel
+ // to avoid deadlock!
+ out_port(requestNetwork_out, CPURequestMsg, requestToNetwork);
+ out_port(probeNetwork_out, NBProbeRequestMsg, requestToNetwork);
+
+ out_port(responseNetwork_out, ResponseMsg, responseToRegDir);
+
+ // Internally-scheduled events: ack completion, "all outstanding requests
+ // drained", and the evict-next-block steps of a region walk.
+ in_port(triggerQueue_in, TriggerMsg, triggerQueue, rank=4) {
+ if (triggerQueue_in.isReady(clockEdge())) {
+ peek(triggerQueue_in, TriggerMsg) {
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ TBE tbe := getTBE(in_msg.addr);
+ DPRINTF(RubySlicc, "trigger msg: %s (%s)\n", in_msg, getRegionBase(in_msg.addr));
+ assert(is_valid(tbe));
+ if (in_msg.Type == TriggerType:AcksComplete) {
+ // SendAck distinguishes an invalidation (ack the region dir)
+ // from a clean-writeback completion.
+ if (tbe.SendAck) {
+ trigger(Event:LastAck_PrbResp, in_msg.addr, cache_entry, tbe);
+ } else {
+ trigger(Event:LastAck_CleanWb, in_msg.addr, cache_entry, tbe);
+ }
+ } else if (in_msg.Type == TriggerType:AllOutstanding) {
+ trigger(Event:AllOutstanding, in_msg.addr, cache_entry, tbe);
+ } else {
+ assert(in_msg.Type == TriggerType:InvNext);
+ trigger(Event:Evict, in_msg.addr, cache_entry, tbe);
+ }
+ }
+ }
+ }
+
+ // Unblock/ack channel from the directory: done-acks for completed requests
+ // and invalidation acks collected during a region walk.
+ in_port(unblockNetwork_in, UnblockMsg, unblockFromDir, rank=3) {
+ if (unblockNetwork_in.isReady(clockEdge())) {
+ peek(unblockNetwork_in, UnblockMsg) {
+ TBE tbe := getTBE(in_msg.addr);
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ if (in_msg.DoneAck) {
+ // A done-ack with no matching outstanding request arrived
+ // early; stall it until the request shows up.
+ if (isOutstanding(tbe, cache_entry, in_msg.addr)) {
+ trigger(Event:DoneAck, in_msg.addr, cache_entry, tbe);
+ } else {
+ trigger(Event:StallDoneAck, in_msg.addr, cache_entry, tbe);
+ }
+ } else {
+ assert(is_valid(tbe));
+ trigger(Event:InvAck, in_msg.addr, cache_entry, tbe);
+ }
+ }
+ }
+ }
+
+ // Region-granularity probes from the region directory; these always arrive
+ // addressed to the region base (asserted below).
+ in_port(probeNetwork_in, NBProbeRequestMsg, probeFromRegionDir, rank=2) {
+ if (probeNetwork_in.isReady(clockEdge())) {
+ peek(probeNetwork_in, NBProbeRequestMsg) {
+ TBE tbe := getTBE(in_msg.addr);
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ assert(getRegionBase(in_msg.addr) == in_msg.addr);
+ if (in_msg.Type == ProbeRequestType:PrbInv) {
+ trigger(Event:InvRegion, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.Type == ProbeRequestType:PrbDowngrade) {
+ trigger(Event:DowngradeRegion, in_msg.addr, cache_entry, tbe);
+ } else {
+ error("Unknown probe message\n");
+ }
+ }
+ }
+ }
+
+ // Permission notifications from the region directory: private, shared, or
+ // writeback-complete.
+ in_port(notifyNetwork_in, CPURequestMsg, notifyFromRegionDir, rank=1) {
+ if (notifyNetwork_in.isReady(clockEdge())) {
+ peek(notifyNetwork_in, CPURequestMsg) {
+ TBE tbe := getTBE(in_msg.addr);
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ //Fix Me...add back in: assert(is_valid(cache_entry));
+ if (in_msg.Type == CoherenceRequestType:WbNotify) {
+ trigger(Event:WbNotify, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:SharedNotify) {
+ trigger(Event:SharedNotify, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:PrivateNotify) {
+ trigger(Event:PrivateNotify, in_msg.addr, cache_entry, tbe);
+ } else {
+ error("Unknown notify message\n");
+ }
+ }
+ }
+ }
+
+ // In from cores
+ // NOTE: We get the cache / TBE entry based on the region address,
+ // but pass the block address to the actions
+ in_port(requestNetwork_in, CPURequestMsg, requestFromCore, rank=0) {
+ if (requestNetwork_in.isReady(clockEdge())) {
+ peek(requestNetwork_in, CPURequestMsg) {
+ TBE tbe := getTBE(in_msg.addr);
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ // A done-ack already arrived for this block: victims become
+ // stale requests, everything else stalls.
+ if (is_valid(tbe) && tbe.DoneAckReceived && tbe.DoneAckAddr == in_msg.addr) {
+ DPRINTF(RubySlicc, "Stale/Stall request %s\n", in_msg.Type);
+ if (in_msg.Type == CoherenceRequestType:VicDirty || in_msg.Type == CoherenceRequestType:VicClean )
+ {
+ trigger(Event:StaleRequest, in_msg.addr, cache_entry, tbe);
+ } else {
+ trigger(Event:StallAccess, in_msg.addr, cache_entry, tbe);
+ }
+ } else if (isOutstanding(tbe, cache_entry, in_msg.addr)) {
+ // Only one request per block may be in flight at a time.
+ DPRINTF(RubySlicc, "Stall outstanding request %s\n", in_msg.Type);
+ trigger(Event:StallAccess, in_msg.addr, cache_entry, tbe);
+ } else {
+ if (presentOrAvail(in_msg.addr)) {
+ // Writes: demand-modified reads, write-throughs, atomics.
+ if (in_msg.Type == CoherenceRequestType:RdBlkM ) {
+ trigger(Event:CPUWrite, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:WriteThrough ) {
+ trigger(Event:CPUWrite, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:Atomic ) {
+ trigger(Event:CPUWrite, in_msg.addr, cache_entry, tbe);
+ } else {
+ if (in_msg.Type == CoherenceRequestType:VicDirty ||
+ in_msg.Type == CoherenceRequestType:VicClean) {
+ trigger(Event:CPUWriteback, in_msg.addr, cache_entry, tbe);
+ } else {
+ trigger(Event:CPURead, in_msg.addr, cache_entry, tbe);
+ }
+ }
+ } else {
+ // No room for this region: evict a victim region first.
+ Addr victim := cacheMemory.cacheProbe(getRegionBase(in_msg.addr));
+ TBE victim_tbe := getTBE(victim);
+ Entry victim_entry := getCacheEntry(victim);
+ DPRINTF(RubySlicc, "Replacing region %s for %s(%s)\n", victim, in_msg.addr, getRegionBase(in_msg.addr));
+ trigger(Event:ReplRegion, victim, victim_entry, victim_tbe);
+ }
+ }
+ }
+ }
+ }
+
+ // Actions
+ action(f_fwdReqToDir, "f", desc="Forward CPU request to directory") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ enqueue(requestNetwork_out, CPURequestMsg, toDirLatency) {
+ out_msg.addr := in_msg.addr;
+ out_msg.Type := in_msg.Type;
+ out_msg.DataBlk := in_msg.DataBlk;
+ out_msg.Dirty := in_msg.Dirty;
+ out_msg.Requestor := in_msg.Requestor;
+ out_msg.WTRequestor := in_msg.WTRequestor;
+ out_msg.Destination.add(map_Address_to_Directory(in_msg.addr));
+ out_msg.Shared := in_msg.Shared;
+ out_msg.MessageSize := in_msg.MessageSize;
+ out_msg.Private := true;
+ out_msg.InitialRequestTime := curCycle();
+ out_msg.ProbeRequestStartTime := curCycle();
+ if (getState(tbe, cache_entry, address) == State:S) {
+ out_msg.ForceShared := true;
+ }
+ DPRINTF(RubySlicc, "Fwd: %s\n", out_msg);
+ //assert(getState(tbe, cache_entry, address) == State:P || getState(tbe, cache_entry, address) == State:S);
+ if (getState(tbe, cache_entry, address) == State:NP_W) {
+ APPEND_TRANSITION_COMMENT(" fwding stale request: ");
+ APPEND_TRANSITION_COMMENT(out_msg.Type);
+ }
+ }
+ }
+ }
+
+ // Per-request bookkeeping on the region entry: outstanding-request count,
+ // region dirty bit, and the per-block valid/used vectors.
+ action(u_updateRegionEntry, "u", desc="Update the entry for profiling") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ if (is_valid(cache_entry)) {
+ if (in_msg.CtoDSinked == false) {
+ APPEND_TRANSITION_COMMENT(" incr outstanding ");
+ cache_entry.NumOutstandingReqs := 1 + cache_entry.NumOutstandingReqs;
+ assert(cache_entry.OutstandingReqs.at(getRegionOffset(address)) == false);
+ cache_entry.OutstandingReqs.at(getRegionOffset(address)) := true;
+ assert(cache_entry.NumOutstandingReqs == countBoolVec(cache_entry.OutstandingReqs));
+ } else {
+ // A sunk change-to-dirty is not tracked as outstanding; only
+ // these read types can reach this branch.
+ APPEND_TRANSITION_COMMENT(" NOT incr outstanding ");
+ assert(in_msg.Type == CoherenceRequestType:RdBlkM || in_msg.Type == CoherenceRequestType:RdBlkS);
+ }
+ APPEND_TRANSITION_COMMENT(cache_entry.NumOutstandingReqs);
+ if (in_msg.Type == CoherenceRequestType:RdBlkM || in_msg.Type == CoherenceRequestType:Atomic ||
+ in_msg.Type == CoherenceRequestType:WriteThrough )
+ {
+ cache_entry.dirty := true;
+ }
+ if (in_msg.Type == CoherenceRequestType:VicDirty ||
+ in_msg.Type == CoherenceRequestType:VicClean) {
+ DPRINTF(RubySlicc, "Got %s for addr %s\n", in_msg.Type, address);
+ //assert(cache_entry.ValidBlocks.at(getRegionOffset(address)));
+ // can in fact be inv if core got an inv after a vicclean before it got here
+ if (cache_entry.ValidBlocks.at(getRegionOffset(address))) {
+ cache_entry.clearOnDone := true;
+ cache_entry.clearOnDoneAddr := address;
+ //cache_entry.ValidBlocks.at(getRegionOffset(address)) := false;
+ //cache_entry.NumValidBlocks := cache_entry.NumValidBlocks - 1;
+ }
+ } else {
+ if (cache_entry.ValidBlocks.at(getRegionOffset(address)) == false) {
+ cache_entry.NumValidBlocks := cache_entry.NumValidBlocks + 1;
+ }
+ // BUG FIX: these two DPRINTFs passed three arguments with only
+ // two format specifiers, so the valid-block vector never made it
+ // into the trace output.
+ DPRINTF(RubySlicc, "before %s valid addr %s bits %s\n",
+ in_msg.Type, address, cache_entry.ValidBlocks);
+ cache_entry.ValidBlocks.at(getRegionOffset(address)) := true;
+ DPRINTF(RubySlicc, "after %s valid addr %s bits %s\n",
+ in_msg.Type, address, cache_entry.ValidBlocks);
+ cache_entry.UsedBlocks.at(getRegionOffset(address)) := true;
+ }
+ assert(cache_entry.NumValidBlocks <= blocksPerRegion);
+ assert(cache_entry.NumValidBlocks >= 0);
+ APPEND_TRANSITION_COMMENT(" valid blocks ");
+ APPEND_TRANSITION_COMMENT(cache_entry.ValidBlocks);
+ } else {
+ error("This shouldn't happen anymore I think");
+ //tbe.ValidBlocks.at(getRegionOffset(address)) := true;
+ assert(getState(tbe, cache_entry, address) == State:P_NP);
+ }
+ }
+ }
+
+ // On a done-ack that completed a writeback, clear the block's valid bit
+ // whose clearing u_updateRegionEntry deferred via clearOnDone/clearOnDoneAddr.
+ action(uw_updatePossibleWriteback, "uw", desc="writeback request complete") {
+ peek(unblockNetwork_in, UnblockMsg) {
+ if (is_valid(cache_entry) && in_msg.validToInvalid &&
+ cache_entry.clearOnDone && cache_entry.clearOnDoneAddr == address) {
+ DPRINTF(RubySlicc, "I have no idea what is going on here\n");
+ cache_entry.ValidBlocks.at(getRegionOffset(address)) := false;
+ cache_entry.NumValidBlocks := cache_entry.NumValidBlocks - 1;
+ cache_entry.clearOnDone := false;
+ }
+ }
+ }
+
+
+ // Ask the region directory for private (exclusive) permission on the
+ // region containing this block.
+ action(rp_requestPrivate, "rp", desc="Send private request r-dir") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ // No need to send acks on replacements
+ assert(is_invalid(tbe));
+ enqueue(requestNetwork_out, CPURequestMsg, toRegionDirLatency) {
+ out_msg.addr := address; // use the actual address so the demand request can be fulfilled
+ out_msg.DemandAddress := address;
+ out_msg.Type := CoherenceRequestType:PrivateRequest;
+ out_msg.OriginalType := in_msg.Type;
+ out_msg.Requestor := machineID;
+ out_msg.WTRequestor := in_msg.WTRequestor;
+ out_msg.InitialRequestTime := curCycle();
+ // will this always be ok? probably not for multisocket
+ out_msg.Destination.add(map_Address_to_RegionDir(address));
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ DPRINTF(RubySlicc, "Private request %s\n", out_msg);
+ }
+ cache_entry.ProbeRequestTime := curCycle();
+ cache_entry.MsgSentToDir := true;
+ APPEND_TRANSITION_COMMENT(getRegionBase(address));
+ }
+ }
+
+ // Upgrade an already-shared region to private permission.
+ action(ru_requestUpgrade, "ru", desc="Send upgrade request r-dir") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ // No need to send acks on replacements
+ assert(is_invalid(tbe));
+ enqueue(requestNetwork_out, CPURequestMsg, toRegionDirLatency) {
+ out_msg.addr := address; // use the actual address so the demand request can be fulfilled
+ out_msg.Type := CoherenceRequestType:UpgradeRequest;
+ out_msg.OriginalType := in_msg.Type;
+ out_msg.Requestor := machineID;
+ out_msg.WTRequestor := in_msg.WTRequestor;
+ out_msg.InitialRequestTime := curCycle();
+ // will this always be ok? probably not for multisocket
+ out_msg.Destination.add(map_Address_to_RegionDir(address));
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ }
+ cache_entry.ProbeRequestTime := curCycle();
+ cache_entry.MsgSentToDir := true;
+ APPEND_TRANSITION_COMMENT(getRegionBase(address));
+ }
+ }
+
+ // Tell the region directory the whole region is being written back; the
+ // TBE's dirty bit says whether any block in it was dirtied.
+ action(rw_requestWriteback, "rq", desc="Send writeback request") {
+ // No need to send acks on replacements
+ enqueue(requestNetwork_out, CPURequestMsg, toRegionDirLatency) {
+ out_msg.addr := getRegionBase(address); // region-aligned: the entire region is written back
+ out_msg.Type := CoherenceRequestType:CleanWbRequest;
+ out_msg.Requestor := machineID;
+ out_msg.Destination.add(map_Address_to_RegionDir(address)); // will this always be ok? probably not for multisocket
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ out_msg.Dirty := tbe.dirty;
+ APPEND_TRANSITION_COMMENT(getRegionBase(address));
+ }
+ }
+
+ // Ask the region directory for shared (read) permission on this region.
+ action(rs_requestShared, "rs", desc="Send shared request r-dir") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ // No need to send acks on replacements
+ assert(is_invalid(tbe));
+ enqueue(requestNetwork_out, CPURequestMsg, toRegionDirLatency) {
+ out_msg.addr := address; // use the actual address so the demand request can be fulfilled
+ out_msg.Type := CoherenceRequestType:SharedRequest;
+ out_msg.OriginalType := in_msg.Type;
+ out_msg.Requestor := machineID;
+ out_msg.WTRequestor := in_msg.WTRequestor;
+ out_msg.InitialRequestTime := curCycle();
+ // will this always be ok? probably not for multisocket
+ out_msg.Destination.add(map_Address_to_RegionDir(address));
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ }
+ cache_entry.ProbeRequestTime := curCycle();
+ cache_entry.MsgSentToDir := true;
+ APPEND_TRANSITION_COMMENT(getRegionBase(address));
+ }
+ }
+
+ // Probe response to the region directory after a region invalidation
+ // completes (used when the TBE's SendAck was set).
+ action(ai_ackRegionInv, "ai", desc="Send ack to r-dir on region inv if tbe says so") {
+ // No need to send acks on replacements
+ assert(is_valid(tbe));
+ enqueue(responseNetwork_out, ResponseMsg, toRegionDirLatency) {
+ out_msg.addr := getRegionBase(address);
+ out_msg.Type := CoherenceResponseType:CPUPrbResp;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_RegionDir(address)); // will this always be ok? probably not for multisocket
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ }
+ }
+
+ // NOTE(review): "Dircetory" is a typo for "Directory"; the name is kept
+ // because the transition tables reference the action by this identifier.
+ // When VIPER runs without a TCC directory, the TCC cannot answer a
+ // downgrade probe, so the region buffer answers the directory itself.
+ action(ad_ackDircetory, "ad", desc="send probe response to directory") {
+ if (noTCCdir && tbe.MsgType == ProbeRequestType:PrbDowngrade && isOnGPU()) { //VIPER tcc doesnt understand PrbShrData
+ assert(tbe.DemandRequest); //So, let RegionBuffer take care of sending back ack
+ enqueue(responseNetwork_out, ResponseMsg, toDirLatency) {
+ out_msg.addr := tbe.DemandAddress;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes
+ out_msg.Sender := getPeer(machineID,address);
+ out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket
+ out_msg.Dirty := false; // only true if sending back data i think
+ out_msg.Hit := false;
+ out_msg.Ntsl := false;
+ out_msg.State := CoherenceState:NA;
+ out_msg.NoAckNeeded := true;
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+ }
+
+ // Region-invalidation ack that also reports the region is no longer
+ // cached here (NotCached) and whether it was dirty.
+ action(aie_ackRegionExclusiveInv, "aie", desc="Send ack to r-dir on region inv if tbe says so") {
+ // No need to send acks on replacements
+ assert(is_valid(tbe));
+ enqueue(responseNetwork_out, ResponseMsg, toRegionDirLatency) {
+ out_msg.addr := getRegionBase(address);
+ out_msg.Type := CoherenceResponseType:CPUPrbResp;
+ out_msg.Sender := machineID;
+ out_msg.NotCached := true;
+ out_msg.Destination.add(map_Address_to_RegionDir(address)); // will this always be ok? probably not for multisocket
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ out_msg.Dirty := tbe.dirty;
+ }
+ }
+
+ // Immediate region-invalidation ack (no TBE state consulted).
+ action(ain_ackRegionInvNow, "ain", desc="Send ack to r-dir on region inv") {
+ enqueue(responseNetwork_out, ResponseMsg, toRegionDirLatency) {
+ out_msg.addr := getRegionBase(address);
+ out_msg.Type := CoherenceResponseType:CPUPrbResp;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_RegionDir(address)); // will this always be ok? probably not for multisocket
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ }
+ }
+
+ // Immediate region-invalidation ack marked NotCached.
+ // NOTE(review): "Exlusive" in the name is a typo; kept for reference safety.
+ action(aine_ackRegionInvExlusiveNow, "aine", desc="Send ack to r-dir on region inv with exlusive permission") {
+ enqueue(responseNetwork_out, ResponseMsg, toRegionDirLatency) {
+ out_msg.addr := getRegionBase(address);
+ out_msg.Type := CoherenceResponseType:CPUPrbResp;
+ out_msg.Sender := machineID;
+ out_msg.NotCached := true;
+ out_msg.Destination.add(map_Address_to_RegionDir(address)); // will this always be ok? probably not for multisocket
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ }
+ }
+
+ // Acknowledge a private-permission notification from the region directory.
+ action(ap_ackPrivateNotify, "ap", desc="Send ack to r-dir on private notify") {
+ enqueue(responseNetwork_out, ResponseMsg, toRegionDirLatency) {
+ out_msg.addr := getRegionBase(address);
+ out_msg.Type := CoherenceResponseType:PrivateAck;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_RegionDir(address)); // will this always be ok? probably not for multisocket
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ }
+ }
+
+ // Acknowledge a writeback notification — but only when the directory
+ // explicitly asked for an ack.
+ action(aw_ackWbNotify, "aw", desc="Send ack to r-dir on writeback notify") {
+ peek(notifyNetwork_in, CPURequestMsg) {
+ if (in_msg.NoAckNeeded == false) {
+ enqueue(responseNetwork_out, ResponseMsg, toRegionDirLatency) {
+ out_msg.addr := getRegionBase(address);
+ out_msg.Type := CoherenceResponseType:RegionWbAck;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_RegionDir(address)); // will this always be ok? probably not for multisocket
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ }
+ }
+ }
+ }
+
+ // One step of the region walk: probe the peer cache for the current block
+ // if it is valid here, skipping the demand block (ed_evictDemand handles it).
+ action(e_evictCurrent, "e", desc="Evict this block in the region") {
+ // send force invalidate message to directory to invalidate this block
+ // must invalidate all blocks since region buffer could have privitized it
+ if (tbe.ValidBlocks.at(getRegionOffset(address)) &&
+ (tbe.DemandRequest == false || tbe.DemandAddress != address)) {
+ DPRINTF(RubySlicc, "trying to evict address %s (base: %s, offset: %d)\n", address, getRegionBase(address), getRegionOffset(address));
+ DPRINTF(RubySlicc, "tbe valid blocks %s\n", tbe.ValidBlocks);
+
+ enqueue(probeNetwork_out, NBProbeRequestMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := tbe.MsgType;
+ out_msg.ReturnData := true;
+ if (address == tbe.DemandAddress) {
+ out_msg.DemandRequest := true;
+ }
+ out_msg.MessageSize := MessageSizeType:Control;
+ out_msg.Destination.add(getPeer(machineID,address));
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ APPEND_TRANSITION_COMMENT(" current ");
+ APPEND_TRANSITION_COMMENT(tbe.ValidBlocks.at(getRegionOffset(address)));
+ // An ack is now owed for this probe.
+ tbe.AllAcksReceived := false;
+ } else {
+ DPRINTF(RubySlicc, "Not evicting demand %s\n", address);
+ }
+ }
+
+ // Probe the demand block itself, or — on a VIPER GPU downgrade with no TCC
+ // directory — skip the walk entirely and complete immediately.
+ action(ed_evictDemand, "ed", desc="Evict the demand request if it's valid") {
+ if (noTCCdir && tbe.MsgType == ProbeRequestType:PrbDowngrade && isOnGPU()) {
+ tbe.OutstandingAcks := 0;
+ tbe.AllAcksReceived := true;
+ tbe.DoneEvicting := true;
+ enqueue(triggerQueue_out, TriggerMsg, 1) {
+ out_msg.Type := TriggerType:AcksComplete;
+ out_msg.addr := getRegionBase(address);
+ }
+ } else if (tbe.DemandRequest) {
+ enqueue(probeNetwork_out, NBProbeRequestMsg, 1) {
+ out_msg.addr := tbe.DemandAddress;
+ out_msg.Type := tbe.MsgType;
+ out_msg.ReturnData := true;
+ out_msg.DemandRequest := true;
+ out_msg.MessageSize := MessageSizeType:Control;
+ out_msg.Destination.add(getPeer(machineID,address));
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ tbe.AllAcksReceived := false;
+ }
+ // The demand block was not counted among the valid blocks, so add
+ // an extra expected ack for its probe.
+ if (tbe.ValidBlocks.at(getRegionOffset(tbe.DemandAddress)) == false) {
+ tbe.OutstandingAcks := tbe.OutstandingAcks + 1;
+ }
+ APPEND_TRANSITION_COMMENT("Evicting demand ");
+ APPEND_TRANSITION_COMMENT(tbe.DemandAddress);
+ }
+ APPEND_TRANSITION_COMMENT("waiting acks ");
+ APPEND_TRANSITION_COMMENT(tbe.OutstandingAcks);
+ }
+
+ // Answer a demand probe on the core's behalf with a clean miss response,
+ // even when we already know the core holds nothing.
+ action(adp_AckDemandProbe, "fp", desc="forward demand probe even if we know that the core is invalid") {
+ peek(probeNetwork_in, NBProbeRequestMsg) {
+ if (in_msg.DemandRequest) {
+ enqueue(responseNetwork_out, ResponseMsg, toDirLatency) {
+ out_msg.addr := in_msg.DemandAddress;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes
+ out_msg.Sender := getPeer(machineID,address);
+ out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket
+ out_msg.Dirty := false; // only true if sending back data i think
+ out_msg.Hit := false;
+ out_msg.Ntsl := false;
+ out_msg.State := CoherenceState:NA;
+ out_msg.NoAckNeeded := true;
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+ }
+ }
+
+ // Schedule eviction of the next block of the region, or finish the walk
+ // when the next address would cross into the following region.
+ action(en_enqueueNextEvict, "en", desc="Queue evict the next block in the region") {
+ // increment in_msg.addr by blockSize bytes and enqueue on triggerPort
+ // Only enqueue if the next address doesn't overrun the region bound
+ if (getRegionBase(getNextBlock(address)) == getRegionBase(address)) {
+ enqueue(triggerQueue_out, TriggerMsg, nextEvictLatency) {
+ out_msg.Type := TriggerType:InvNext;
+ out_msg.addr := getNextBlock(address);
+ }
+ } else {
+ tbe.DoneEvicting := true;
+ // BUG FIX: log message said "evicing" instead of "evicting".
+ DPRINTF(RubySlicc, "Done evicting region %s\n", getRegionBase(address));
+ DPRINTF(RubySlicc, "Waiting for %s acks\n", tbe.OutstandingAcks);
+ // If every ack already arrived, complete immediately.
+ if (tbe.AllAcksReceived == true) {
+ enqueue(triggerQueue_out, TriggerMsg, 1) {
+ out_msg.Type := TriggerType:AcksComplete;
+ out_msg.addr := getRegionBase(address);
+ }
+ }
+ }
+ }
+
+ // Kick off the region walk at the region base, unless one already ran.
+ action(ef_enqueueFirstEvict, "ef", desc="Queue the first block in the region to be evicted") {
+ if (tbe.DoneEvicting == false) {
+ enqueue(triggerQueue_out, TriggerMsg, nextEvictLatency) {
+ out_msg.Type := TriggerType:InvNext;
+ out_msg.addr := getRegionBase(address);
+ }
+ }
+ }
+
+ // Record one invalidation ack; when the count reaches zero and the walk
+ // already finished, fire the AcksComplete trigger.
+ action(ra_receiveAck, "ra", desc="Mark TBE entry as received this ack") {
+ DPRINTF(RubySlicc, "received ack for %s reg: %s vec: %s pos: %d\n",
+ address, getRegionBase(address), tbe.ValidBlocks, getRegionOffset(address));
+ peek(unblockNetwork_in, UnblockMsg) {
+ //
+ // Note the tbe ValidBlock vec will be a conservative list of the
+ // valid blocks since the cache entry ValidBlock vec is set on the
+ // request
+ //
+ if (in_msg.wasValid) {
+ assert(tbe.ValidBlocks.at(getRegionOffset(address)));
+ }
+ }
+ tbe.OutstandingAcks := tbe.OutstandingAcks - 1;
+ tbe.AcksReceived.at(getRegionOffset(address)) := true;
+ assert(tbe.OutstandingAcks >= 0);
+ if (tbe.OutstandingAcks == 0) {
+ tbe.AllAcksReceived := true;
+ if (tbe.DoneEvicting) {
+ enqueue(triggerQueue_out, TriggerMsg, 1) {
+ out_msg.Type := TriggerType:AcksComplete;
+ out_msg.addr := getRegionBase(address);
+ }
+ }
+ }
+
+ APPEND_TRANSITION_COMMENT(getRegionBase(address));
+ APPEND_TRANSITION_COMMENT(" Acks left receive ");
+ APPEND_TRANSITION_COMMENT(tbe.OutstandingAcks);
+ }
+
+ // Retire one outstanding request in both the cache entry and the TBE,
+ // keeping the count consistent with the bit vector.
+ action(do_decrementOutstanding, "do", desc="Decrement outstanding requests") {
+ APPEND_TRANSITION_COMMENT(" decr outstanding ");
+ if (is_valid(cache_entry)) {
+ cache_entry.NumOutstandingReqs := cache_entry.NumOutstandingReqs - 1;
+ assert(cache_entry.OutstandingReqs.at(getRegionOffset(address)));
+ cache_entry.OutstandingReqs.at(getRegionOffset(address)) := false;
+ assert(cache_entry.NumOutstandingReqs >= 0);
+ assert(cache_entry.NumOutstandingReqs == countBoolVec(cache_entry.OutstandingReqs));
+ APPEND_TRANSITION_COMMENT(cache_entry.NumOutstandingReqs);
+ }
+ if (is_valid(tbe)) {
+ tbe.NumOutstandingReqs := tbe.NumOutstandingReqs - 1;
+ assert(tbe.OutstandingReqs.at(getRegionOffset(address)));
+ tbe.OutstandingReqs.at(getRegionOffset(address)) := false;
+ assert(tbe.NumOutstandingReqs >= 0);
+ assert(tbe.NumOutstandingReqs == countBoolVec(tbe.OutstandingReqs));
+ APPEND_TRANSITION_COMMENT(tbe.NumOutstandingReqs);
+ }
+ }
+
+ // Fire AllOutstanding (once) when outstanding requests have drained to
+ // the TBE's threshold (0, or 1 for S_P — see so_setOutstandingCheckOne).
+ action(co_checkOutstanding, "co", desc="check if there are no more outstanding requests") {
+ assert(is_valid(tbe));
+ if ((tbe.NumOutstandingReqs <= tbe.OutstandingThreshold) &&
+ (tbe.AllOutstandingTriggered == false)) {
+ APPEND_TRANSITION_COMMENT(" no more outstanding: ");
+ APPEND_TRANSITION_COMMENT(tbe.NumOutstandingReqs);
+ APPEND_TRANSITION_COMMENT(tbe.OutstandingThreshold);
+ enqueue(triggerQueue_out, TriggerMsg, 1) {
+ out_msg.Type := TriggerType:AllOutstanding;
+ if (tbe.DemandRequest) {
+ out_msg.addr := tbe.DemandAddress;
+ } else {
+ out_msg.addr := getRegionBase(address);
+ }
+ DPRINTF(RubySlicc, "co enqueuing %s\n", out_msg);
+ tbe.AllOutstandingTriggered := true;
+ }
+ } else {
+ APPEND_TRANSITION_COMMENT(" still more outstanding ");
+ }
+ }
+
+ // Re-arm the one-shot AllOutstanding trigger.
+ action(ro_resetAllOutstanding, "ro", desc="Reset all outstanding") {
+ tbe.AllOutstandingTriggered := false;
+ }
+
+ action(so_setOutstandingCheckOne, "so", desc="Check outstanding is waiting for 1, not 0") {
+ // Need this for S_P because one request is outstanding between here and r-dir
+ tbe.OutstandingThreshold := 1;
+ }
+
+ // Allocate and zero-initialize a region entry sized to blocksPerRegion.
+ action(a_allocateRegionEntry, "a", desc="Allocate a new entry") {
+ set_cache_entry(cacheMemory.allocate(getRegionBase(address), new Entry));
+ cache_entry.ValidBlocks.clear();
+ cache_entry.ValidBlocks.resize(blocksPerRegion);
+ cache_entry.UsedBlocks.clear();
+ cache_entry.UsedBlocks.resize(blocksPerRegion);
+ cache_entry.dirty := false;
+ cache_entry.NumOutstandingReqs := 0;
+ cache_entry.OutstandingReqs.clear();
+ cache_entry.OutstandingReqs.resize(blocksPerRegion);
+ }
+
+ action(d_deallocateRegionEntry, "d", desc="Deallocate region entry") {
+ cacheMemory.deallocate(getRegionBase(address));
+ unset_cache_entry();
+ }
+
+ // Allocate a TBE for the region, seeding it from the cache entry when one
+ // exists; the expected ack count equals the number of valid blocks.
+ action(t_allocateTBE, "t", desc="allocate TBE Entry") {
+ check_allocate(TBEs);
+ TBEs.allocate(getRegionBase(address));
+ set_tbe(getTBE(address));
+ tbe.OutstandingAcks := 0;
+ tbe.AllAcksReceived := true; // starts true since the region could be empty
+ tbe.DoneEvicting := false;
+ tbe.AcksReceived.clear();
+ tbe.AcksReceived.resize(blocksPerRegion);
+ tbe.SendAck := false;
+ tbe.OutstandingThreshold := 0;
+ if (is_valid(cache_entry)) {
+ tbe.NumOutstandingReqs := cache_entry.NumOutstandingReqs;
+ tbe.OutstandingReqs := cache_entry.OutstandingReqs;
+ assert(tbe.NumOutstandingReqs == countBoolVec(tbe.OutstandingReqs));
+ tbe.dirty := cache_entry.dirty;
+ tbe.ValidBlocks := cache_entry.ValidBlocks;
+ tbe.OutstandingAcks := countBoolVec(tbe.ValidBlocks);
+ APPEND_TRANSITION_COMMENT(" tbe valid blocks ");
+ APPEND_TRANSITION_COMMENT(tbe.ValidBlocks);
+ APPEND_TRANSITION_COMMENT(" cache valid blocks ");
+ APPEND_TRANSITION_COMMENT(cache_entry.ValidBlocks);
+ } else {
+ tbe.dirty := false;
+ }
+ }
+
+ // Remember to ack the region directory when this invalidation finishes.
+ action(m_markSendAck, "m", desc="Mark TBE that we need to ack at end") {
+ assert(is_valid(tbe));
+ tbe.SendAck := true;
+ }
+
+ // Accumulate dirtiness reported by unblock messages into the TBE.
+ action(db_markDirtyBit, "db", desc="Mark TBE dirty bit") {
+ peek(unblockNetwork_in, UnblockMsg) {
+ if (is_valid(tbe)) {
+ tbe.dirty := tbe.dirty || in_msg.Dirty;
+ }
+ }
+ }
+
+ // Record an early done-ack so the matching request is recognized later
+ // (see the requestNetwork_in stale/stall check).
+ action(dr_markDoneAckReceived, "dr", desc="Mark TBE that a done ack has been received") {
+ assert(is_valid(tbe));
+ tbe.DoneAckReceived := true;
+ tbe.DoneAckAddr := address;
+ APPEND_TRANSITION_COMMENT(" marking done ack on TBE ");
+ }
+
+ // Capture the probe that starts this eviction in the TBE.
+ action(se_setTBE, "se", desc="Set msg type to evict") {
+ peek(probeNetwork_in, NBProbeRequestMsg) {
+ tbe.MsgType := in_msg.Type;
+ tbe.Requestor := in_msg.Requestor;
+ tbe.DemandAddress := in_msg.DemandAddress;
+ tbe.DemandRequest := in_msg.DemandRequest;
+ }
+ }
+
+ // Buffer a second probe that arrives mid-eviction in the "New*" fields.
+ action(sne_setNewTBE, "sne", desc="Set msg type to evict") {
+ peek(probeNetwork_in, NBProbeRequestMsg) {
+ tbe.NewMsgType := in_msg.Type;
+ tbe.NewRequestor := in_msg.Requestor;
+ tbe.NewDemandAddress := in_msg.DemandAddress;
+ tbe.NewDemandRequest := in_msg.DemandRequest;
+ }
+ }
+
+ // Promote the buffered probe and reset the walk state for a fresh pass.
+ action(soe_setOldTBE, "soe", desc="Set msg type to evict") {
+ tbe.MsgType := tbe.NewMsgType;
+ tbe.Requestor := tbe.NewRequestor;
+ tbe.DemandAddress := tbe.NewDemandAddress;
+ tbe.DemandRequest := tbe.NewDemandRequest;
+ tbe.OutstandingAcks := countBoolVec(tbe.ValidBlocks);
+ tbe.AllAcksReceived := true; // starts true since the region could be empty
+ tbe.DoneEvicting := false;
+ tbe.AcksReceived.clear();
+ tbe.AcksReceived.resize(blocksPerRegion);
+ tbe.SendAck := false;
+ }
+
+ // Replacements always walk the region with full invalidation probes.
+ action(ser_setTBE, "ser", desc="Set msg type to evict repl") {
+ tbe.MsgType := ProbeRequestType:PrbInv;
+ }
+
+ action(md_setMustDowngrade, "md", desc="When permissions finally get here, must be shared") {
+ assert(is_valid(cache_entry));
+ cache_entry.MustDowngrade := true;
+ }
+
+ action(dt_deallocateTBE, "dt", desc="deallocate TBE Entry") {
+ TBEs.deallocate(getRegionBase(address));
+ unset_tbe();
+ }
+
+ // --- Queue pop / stall / recycle actions ---
+ action(p_popRequestQueue, "p", desc="Pop the request queue") {
+ requestNetwork_in.dequeue(clockEdge());
+ }
+
+ action(pl_popUnblockQueue, "pl", desc="Pop the unblock queue") {
+ unblockNetwork_in.dequeue(clockEdge());
+ }
+
+ action(pn_popNotifyQueue, "pn", desc="Pop the notify queue") {
+ notifyNetwork_in.dequeue(clockEdge());
+ }
+
+ action(pp_popProbeQueue, "pp", desc="Pop the probe queue") {
+ probeNetwork_in.dequeue(clockEdge());
+ }
+
+ action(pt_popTriggerQueue, "pt", desc="Pop the trigger queue") {
+ DPRINTF(RubySlicc, "Trigger Before Contents: %s\n", triggerQueue_in);
+ triggerQueue_in.dequeue(clockEdge());
+ DPRINTF(RubySlicc, "Trigger After Contents: %s\n", triggerQueue_in);
+ }
+
+ // Must always use wake all, since non-region address wait on region addresses
+ action(wa_wakeUpAllDependents, "wa", desc="Wake up any requests waiting for this region") {
+ wakeUpAllBuffers();
+ }
+
+ // Stalled messages are parked on the REGION base address so a single
+ // wakeup releases every block-address request in the region.
+ action(zz_stallAndWaitRequestQueue, "\z", desc="recycle request queue") {
+ Addr regAddr := getRegionBase(address);
+ DPRINTF(RubySlicc, "Stalling address %s\n", regAddr);
+ stall_and_wait(requestNetwork_in, regAddr);
+ }
+
+ action(yy_stallAndWaitProbeQueue, "\y", desc="stall probe queue") {
+ Addr regAddr := getRegionBase(address);
+ stall_and_wait(probeNetwork_in, regAddr);
+ }
+
+ action(yyy_recycleProbeQueue, "\yy", desc="recycle probe queue") {
+ probeNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+ }
+
+ action(zzz_recycleRequestQueue, "\zz", desc="recycle request queue") {
+ requestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+ }
+
+ action(www_recycleUnblockNetwork, "\ww", desc="recycle unblock queue") {
+ unblockNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+ }
+
+ action(z_stall, "z", desc="stall request queue") {
+ // fake state
+ }
+
+ action(mru_setMRU, "mru", desc="set MRU") {
+ cacheMemory.setMRU(address, cache_entry.NumValidBlocks);
+ }
+
+ // Transitions
+ // Note: the "{}" between the event list and the action body is SLICC's
+ // (empty) resource/request-type annotation, not an empty body.
+
+ transition({NP_PS, S_P, S_NP_PS, P_NP, P_S, P_NP_O, S_NP_PS_O, P_S_O, S_O, P_NP_W, P_NP_NP, NP_W}, {CPURead, CPUWriteback, CPUWrite}) {} {
+ zz_stallAndWaitRequestQueue;
+ }
+
+ transition(SS_P, {CPURead, CPUWriteback}) {
+ zz_stallAndWaitRequestQueue;
+ }
+
+ transition({NP, S, P, NP_PS, S_P, S_NP_PS, P_NP, P_S, P_NP_O, S_NP_PS_O, P_S_O, S_O, SS_P, NP_W, P_NP_NP}, StallAccess) {} {
+ zz_stallAndWaitRequestQueue;
+ }
+
+ transition({S, P, NP_PS, S_P, S_NP_PS, P_NP, P_S, P_NP_O, S_NP_PS_O, P_S_O, S_O, SS_P, P_NP_W, P_NP_NP, NP_W}, StallDoneAck) {
+ www_recycleUnblockNetwork;
+ }
+
+ // A done-ack arriving with no region state: remember it in a fresh TBE
+ // and wait (NP_W) for the matching stale request.
+ transition(NP, StallDoneAck, NP_W) {
+ t_allocateTBE;
+ db_markDirtyBit;
+ dr_markDoneAckReceived;
+ pl_popUnblockQueue;
+ }
+
+ transition(NP_W, StaleRequest, NP) {
+ f_fwdReqToDir;
+ dt_deallocateTBE;
+ wa_wakeUpAllDependents;
+ p_popRequestQueue;
+ }
+
+ transition(P_NP_O, DowngradeRegion) {} {
+ z_stall; // should stall and wait
+ }
+
+ transition({NP_PS, S_NP_PS, S_P, P_S, P_NP_O, S_NP_PS_O, P_S_O, S_O, SS_P}, ReplRegion) {} {
+ zz_stallAndWaitRequestQueue; // can't let things get out of order!
+ }
+
+ transition({P_NP_O, S_O, SS_P}, InvRegion) {} {
+ yyy_recycleProbeQueue; // can't be z_stall because there could be a RdBlkM in the requestQueue which has the sinked flag which is blocking the inv
+ }
+
+ // Second probe while already evicting: buffer it in the New* TBE fields.
+ transition(P_NP, {InvRegion, DowngradeRegion}, P_NP_NP) {} {
+ sne_setNewTBE;
+ pp_popProbeQueue;
+ }
+
+ transition(S_P, DowngradeRegion) {} {
+ adp_AckDemandProbe;
+ ain_ackRegionInvNow;
+ pp_popProbeQueue;
+ }
+
+ transition(P_NP_W, InvRegion) {
+ adp_AckDemandProbe;
+ ain_ackRegionInvNow;
+ pp_popProbeQueue;
+ }
+
+ transition(P_NP_W, DowngradeRegion) {
+ adp_AckDemandProbe;
+ aine_ackRegionInvExlusiveNow;
+ pp_popProbeQueue;
+ }
+
+ // --- Demand hits: region permission already held, forward to directory ---
+ transition({P, S}, {CPURead, CPUWriteback}) {TagArrayRead, TagArrayWrite} {
+ mru_setMRU;
+ f_fwdReqToDir;
+ u_updateRegionEntry;
+ p_popRequestQueue;
+ }
+
+ transition(P, CPUWrite) {TagArrayRead, TagArrayWrite} {
+ mru_setMRU;
+ f_fwdReqToDir;
+ u_updateRegionEntry;
+ p_popRequestQueue;
+ }
+
+ // Write to a shared region: drain outstanding block requests first (S_O),
+ // then upgrade. The request itself is stalled, not popped.
+ transition(S, CPUWrite, S_O) {TagArrayRead} {
+ mru_setMRU;
+ t_allocateTBE;
+ co_checkOutstanding;
+ zz_stallAndWaitRequestQueue;
+ }
+
+ transition(S_O, AllOutstanding, SS_P) {
+ wa_wakeUpAllDependents;
+ ro_resetAllOutstanding;
+ pt_popTriggerQueue;
+ }
+
+ transition(SS_P, CPUWrite, S_P) {
+ mru_setMRU;
+ dt_deallocateTBE;
+ ru_requestUpgrade;
+ u_updateRegionEntry;
+ p_popRequestQueue;
+ }
+
+ // --- Region misses: allocate entry and request permission from region dir ---
+ transition(NP, {CPURead, CPUWriteback}, NP_PS) {TagArrayRead, TagArrayWrite} {
+ a_allocateRegionEntry;
+ rs_requestShared;
+ u_updateRegionEntry;
+ p_popRequestQueue;//zz_stallAndWaitRequestQueue;
+ }
+
+ transition(NP, CPUWrite, NP_PS) {TagArrayRead, TagArrayWrite} {
+ a_allocateRegionEntry;
+ rp_requestPrivate;
+ u_updateRegionEntry;
+ p_popRequestQueue;//zz_stallAndWaitRequestQueue;
+ }
+
+ transition(NP_PS, PrivateNotify, P) {} {
+ ap_ackPrivateNotify;
+ wa_wakeUpAllDependents;
+ pn_popNotifyQueue;
+ }
+
+ transition(S_P, PrivateNotify, P) {} {
+ ap_ackPrivateNotify;
+ wa_wakeUpAllDependents;
+ pn_popNotifyQueue;
+ }
+
+ transition(NP_PS, SharedNotify, S) {} {
+ ap_ackPrivateNotify;
+ wa_wakeUpAllDependents;
+ pn_popNotifyQueue;
+ }
+
+ transition(P_NP_W, WbNotify, NP) {} {
+ aw_ackWbNotify;
+ wa_wakeUpAllDependents;
+ dt_deallocateTBE;
+ pn_popNotifyQueue;
+ }
+
+ // --- Region eviction / invalidation / downgrade flows ---
+ // Replacement: no probe to ack, so ser_setTBE forces PrbInv directly.
+ transition({P, S}, ReplRegion, P_NP_O) {TagArrayRead, TagArrayWrite} {
+ t_allocateTBE;
+ ser_setTBE;
+ d_deallocateRegionEntry;
+ co_checkOutstanding;
+ }
+
+ transition({P, S}, InvRegion, P_NP_O) {TagArrayRead, TagArrayWrite} {
+ t_allocateTBE;
+ se_setTBE;
+ m_markSendAck;
+ d_deallocateRegionEntry;
+ co_checkOutstanding;
+ pp_popProbeQueue;
+ }
+
+ transition(P_NP_O, AllOutstanding, P_NP) {} {
+ ed_evictDemand;
+ ef_enqueueFirstEvict;
+ ro_resetAllOutstanding;
+ pt_popTriggerQueue;
+ }
+
+ transition(S_P, InvRegion, S_NP_PS_O) {TagArrayRead} {
+ t_allocateTBE;
+ se_setTBE;
+ m_markSendAck;
+ so_setOutstandingCheckOne;
+ co_checkOutstanding;
+ pp_popProbeQueue;
+ }
+
+ transition(S_NP_PS_O, AllOutstanding, S_NP_PS) {
+ ed_evictDemand;
+ ef_enqueueFirstEvict;
+ ro_resetAllOutstanding;
+ pt_popTriggerQueue;
+ }
+
+ transition(P, DowngradeRegion, P_S_O) {TagArrayRead, TagArrayWrite} {
+ t_allocateTBE;
+ se_setTBE;
+ m_markSendAck;
+ co_checkOutstanding;
+ pp_popProbeQueue;
+ }
+
+ transition(P_S_O, AllOutstanding, P_S) {} {
+ ed_evictDemand;
+ ef_enqueueFirstEvict;
+ ro_resetAllOutstanding;
+ pt_popTriggerQueue;
+ }
+
+ transition({P, S}, DoneAck) {TagArrayWrite} {
+ do_decrementOutstanding;
+ wa_wakeUpAllDependents;
+ db_markDirtyBit;
+ uw_updatePossibleWriteback;
+ pl_popUnblockQueue;
+ }
+
+ transition({S_P, NP_PS, S_NP_PS}, DoneAck) {TagArrayWrite} {
+ www_recycleUnblockNetwork;
+ }
+
+ transition({P_NP_O, S_NP_PS_O, P_S_O, S_O}, DoneAck) {} {
+ do_decrementOutstanding;
+ co_checkOutstanding;
+ db_markDirtyBit;
+ uw_updatePossibleWriteback;
+ pl_popUnblockQueue;
+ }
+
+ // Per-block eviction loop: evict current block, enqueue the next.
+ transition({P_NP, P_S, S_NP_PS, P_NP_NP}, Evict) {} {
+ e_evictCurrent;
+ en_enqueueNextEvict;
+ pt_popTriggerQueue;
+ }
+
+ transition({P_NP, P_S, S_NP_PS, P_NP_NP}, InvAck) {} {
+ ra_receiveAck;
+ db_markDirtyBit;
+ pl_popUnblockQueue;
+ }
+
+ transition(P_NP, LastAck_CleanWb, P_NP_W) {} {
+ rw_requestWriteback;
+ pt_popTriggerQueue;
+ }
+
+ // Finished first eviction round; restart with the buffered (New*) probe.
+ transition(P_NP_NP, LastAck_CleanWb, P_NP) {} {
+ soe_setOldTBE;
+ m_markSendAck;
+ ed_evictDemand;
+ ef_enqueueFirstEvict;
+ pt_popTriggerQueue;
+ }
+
+ transition(P_NP, LastAck_PrbResp, NP) {} {
+ aie_ackRegionExclusiveInv;
+ dt_deallocateTBE;
+ wa_wakeUpAllDependents;
+ pt_popTriggerQueue;
+ }
+
+ transition(S_NP_PS, LastAck_PrbResp, NP_PS) {} {
+ aie_ackRegionExclusiveInv;
+ dt_deallocateTBE;
+ wa_wakeUpAllDependents;
+ pt_popTriggerQueue;
+ }
+
+ // NOTE(review): "ad_ackDircetory" is misspelled, but that is the action's
+ // defined name elsewhere in this machine; renaming must be done at the
+ // definition site and all uses together.
+ transition(P_S, LastAck_PrbResp, S) {} {
+ ai_ackRegionInv;
+ ad_ackDircetory;
+ dt_deallocateTBE;
+ wa_wakeUpAllDependents;
+ pt_popTriggerQueue;
+ }
+}
+
diff --git a/src/mem/protocol/MOESI_AMD_Base-RegionDir.sm b/src/mem/protocol/MOESI_AMD_Base-RegionDir.sm
new file mode 100644
index 000000000..b392311c5
--- /dev/null
+++ b/src/mem/protocol/MOESI_AMD_Base-RegionDir.sm
@@ -0,0 +1,1187 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Jason Power
+ */
+
+machine(MachineType:RegionDir, "Region Directory for AMD_Base-like protocol")
+: CacheMemory *cacheMemory; // stores only region addresses. Must set block size same as below
+ NodeID cpuRegionBufferNum;
+ NodeID gpuRegionBufferNum;
+ int blocksPerRegion := 64; // 4k regions
+ Cycles toDirLatency := 10; // Latency to fwd requests and send invs to directory
+ bool always_migrate := "False";
+ bool sym_migrate := "False";
+ bool asym_migrate := "False";
+ bool noTCCdir := "False";
+ int TCC_select_num_bits := 1;
+
+ // To the directory
+ MessageBuffer * requestToDir, network="To", virtual_network="5", vnet_type="request";
+
+ // To the region buffers
+ MessageBuffer * notifyToRBuffer, network="To", virtual_network="7", vnet_type="request";
+ MessageBuffer * probeToRBuffer, network="To", virtual_network="8", vnet_type="request";
+
+ // From the region buffers
+ MessageBuffer * responseFromRBuffer, network="From", virtual_network="2", vnet_type="response";
+ MessageBuffer * requestFromRegBuf, network="From", virtual_network="0", vnet_type="request";
+
+ MessageBuffer * triggerQueue;
+{
+
+ // States
+ // Region-granularity states. All are AccessPermission:Invalid because the
+ // region directory tracks ownership only and never holds data blocks.
+ state_declaration(State, desc="Region states", default="RegionDir_State_NP") {
+ NP, AccessPermission:Invalid, desc="Not present in region directory";
+ P, AccessPermission:Invalid, desc="Region is private to owner";
+ S, AccessPermission:Invalid, desc="Region is shared between CPU and GPU";
+
+ // Transient states for region migration / eviction.
+ P_NP, AccessPermission:Invalid, desc="Evicting the region";
+ NP_P, AccessPermission:Invalid, desc="Must wait for ack from R-buf";
+ NP_S, AccessPermission:Invalid, desc="Must wait for ack from R-buf";
+ P_P, AccessPermission:Invalid, desc="Waiting for ack from R-buf";
+ S_S, AccessPermission:Invalid, desc="Waiting for ack from R-buf";
+ P_S, AccessPermission:Invalid, desc="Downgrading the region";
+ S_P, AccessPermission:Invalid, desc="Upgrading the region";
+ P_AS, AccessPermission:Invalid, desc="Sent invalidates, waiting for acks";
+ S_AP, AccessPermission:Invalid, desc="Sent invalidates, waiting for acks";
+ P_AP, AccessPermission:Invalid, desc="Sent invalidates, waiting for acks";
+
+ SP_NP_W, AccessPermission:Invalid, desc="Last sharer writing back, waiting for ack";
+ S_W, AccessPermission:Invalid, desc="Sharer writing back, waiting for ack";
+
+ P_AP_W, AccessPermission:Invalid, desc="Fwded request to dir, waiting for ack";
+ P_AS_W, AccessPermission:Invalid, desc="Fwded request to dir, waiting for ack";
+ S_AP_W, AccessPermission:Invalid, desc="Fwded request to dir, waiting for ack";
+ }
+
+ // Events driving the region directory state machine. Fixed typos in the
+ // generated-documentation desc strings ("eviciting" -> "evicting",
+ // "reqeust" -> "request"); desc text is documentation only and does not
+ // affect protocol behavior.
+ enumeration(Event, desc="Region directory events") {
+ SendInv, desc="Send inv message to any machine that has a region buffer";
+ SendUpgrade, desc="Send upgrade message to any machine that has a region buffer";
+ SendDowngrade, desc="Send downgrade message to any machine that has a region buffer";
+
+ Evict, desc="Evict this region";
+
+ UpgradeRequest, desc="Request from r-buf for an upgrade";
+ SharedRequest, desc="Request from r-buf for read";
+ PrivateRequest, desc="Request from r-buf for write";
+
+ InvAckCore, desc="Ack from region buffer to order the invalidate";
+ InvAckCoreNoShare, desc="Ack from region buffer to order the invalidate, and it does not have the region";
+ CPUPrivateAck, desc="Ack from region buffer to order private notification";
+
+ LastAck, desc="Done evicting all the blocks";
+
+ StaleCleanWbRequest, desc="stale clean writeback request";
+ StaleCleanWbRequestNoShare, desc="stale clean wb req from a cache which should be removed from sharers";
+ CleanWbRequest, desc="clean writeback request, multiple sharers";
+ CleanWbRequest_LastSharer, desc="clean writeback request, last sharer";
+ WritebackAck, desc="Writeback Ack from region buffer";
+ DirReadyAck, desc="Directory is ready, waiting Ack from region buffer";
+
+ TriggerInv, desc="trigger invalidate message";
+ TriggerDowngrade, desc="trigger downgrade message";
+ }
+
+ // Stat-recording request types. Fixed copy-paste defect: the TagArray
+ // entries were described as "data array"; desc text is documentation only.
+ enumeration(RequestType, desc="To communicate stats from transitions to recordStats") {
+ DataArrayRead, desc="Read the data array";
+ DataArrayWrite, desc="Write the data array";
+ TagArrayRead, desc="Read the tag array";
+ TagArrayWrite, desc="Write the tag array";
+ }
+
+ // External (C++-implemented) per-block boolean vector.
+ structure(BoolVec, external="yes") {
+ bool at(int);
+ void resize(int);
+ void clear();
+ }
+
+ // One entry per tracked region. NOTE(review): "LastWriten" is misspelled
+ // but is the field's defined name; renaming would require touching every
+ // use across the protocol files.
+ structure(Entry, desc="Region entry", interface="AbstractCacheEntry") {
+ Addr addr, desc="Base address of this region";
+ NetDest Sharers, desc="Set of machines that are sharing, but not owners";
+ State RegionState, desc="Region state";
+ DataBlock DataBlk, desc="Data for the block (always empty in region dir)";
+ MachineID Owner, desc="Machine which owns all blocks in this region";
+ Cycles ProbeStart, desc="Time when the first probe request was issued";
+ bool LastWriten, default="false", desc="The last time someone accessed this region, it wrote it";
+ bool LastWritenByCpu, default="false", desc="The last time the CPU accessed this region, it wrote it";
+ bool LastWritenByGpu, default="false", desc="The last time the GPU accessed this region, it wrote it";
+ }
+
+ // Transient request state, keyed by region base address.
+ structure(TBE, desc="...") {
+ State TBEState, desc="Transient state";
+ MachineID Owner, desc="Machine which owns all blocks in this region";
+ NetDest Sharers, desc="Set of machines to send evicts";
+ int NumValidBlocks, desc="Number of blocks valid so we don't have to count a BoolVec";
+ bool AllAcksReceived, desc="Got all necessary acks from dir";
+ CoherenceRequestType MsgType, desc="Msg type for the evicts could be inv or dwngrd";
+ Cycles ProbeRequestTime, default="Cycles(0)", desc="Start of probe request";
+ Cycles InitialRequestTime, default="Cycles(0)", desc="To forward back on out msg";
+ Addr DemandAddress, desc="Demand address from original request";
+ uint64_t probe_id, desc="probe id for lifetime profiling";
+ }
+
+ structure(TBETable, external="yes") {
+ TBE lookup(Addr);
+ void allocate(Addr);
+ void deallocate(Addr);
+ bool isPresent(Addr);
+ }
+
+ // Stores only region addresses
+ TBETable TBEs, template="<RegionDir_TBE>", constructor="m_number_of_TBEs";
+ int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()";
+
+ // Prototypes for controller-provided (C++) helpers.
+ Tick clockEdge();
+ Tick cyclesToTicks(Cycles c);
+
+ void set_cache_entry(AbstractCacheEntry b);
+ void unset_cache_entry();
+ void set_tbe(TBE b);
+ void unset_tbe();
+ void wakeUpAllBuffers();
+ void wakeUpBuffers(Addr a);
+ Cycles curCycle();
+
+ // Address-geometry constants: region = 2^regionBits blocks of 2^blockBits bytes.
+ int blockBits, default="RubySystem::getBlockSizeBits()";
+ int blockBytes, default="RubySystem::getBlockSizeBytes()";
+ int regionBits, default="log2(m_blocksPerRegion)";
+
+ // Functions
+
+ // Map a region-buffer MachineID to the core-side machine that should
+ // receive the forwarded request (CorePair for CPU; TCC or TCCdir for GPU,
+ // depending on the noTCCdir configuration).
+ MachineID getCoreMachine(MachineID rBuf, Addr address) {
+ if (machineIDToNodeID(rBuf) == cpuRegionBufferNum) {
+ return createMachineID(MachineType:CorePair, intToID(0));
+ } else if (machineIDToNodeID(rBuf) == gpuRegionBufferNum) {
+ if (noTCCdir) {
+ return mapAddressToRange(address,MachineType:TCC,
+ TCC_select_low_bit, TCC_select_num_bits);
+ } else {
+ return createMachineID(MachineType:TCCdir, intToID(0));
+ }
+ } else {
+ error("Unexpected region buffer number");
+ }
+ }
+
+ // True iff the given region buffer is the CPU-side one; error() aborts
+ // for unknown buffers, so the missing return is unreachable.
+ bool isCpuMachine(MachineID rBuf) {
+ if (machineIDToNodeID(rBuf) == cpuRegionBufferNum) {
+ return true;
+ } else if (machineIDToNodeID(rBuf) == gpuRegionBufferNum) {
+ return false;
+ } else {
+ error("Unexpected region buffer number");
+ }
+ }
+
+ // Migration policies: migrate if the last access (sym) or the requestor
+ // side's last access (asym) was a write.
+ bool symMigrate(Entry cache_entry) {
+ return cache_entry.LastWriten;
+ }
+
+ bool asymMigrate(Entry cache_entry, MachineID requestor) {
+ if (isCpuMachine(requestor)) {
+ return cache_entry.LastWritenByCpu;
+ } else {
+ return cache_entry.LastWritenByGpu;
+ }
+ }
+
+ // Block index of addr within its region (0 when regions are one block).
+ int getRegionOffset(Addr addr) {
+ if (blocksPerRegion > 1) {
+ Addr offset := bitSelect(addr, blockBits, regionBits+blockBits-1);
+ int ret := addressToInt(offset);
+ assert(ret < blocksPerRegion);
+ return ret;
+ } else {
+ return 0;
+ }
+ }
+
+ // Region base = addr with the block and region-offset bits masked off.
+ Addr getRegionBase(Addr addr) {
+ return maskLowOrderBits(addr, blockBits+regionBits);
+ }
+
+ Addr getNextBlock(Addr addr) {
+ Addr a := addr;
+ makeNextStrideAddress(a, 1);
+ return a;
+ }
+
+ bool presentOrAvail(Addr addr) {
+ DPRINTF(RubySlicc, "Present? %s, avail? %s\n", cacheMemory.isTagPresent(getRegionBase(addr)), cacheMemory.cacheAvail(getRegionBase(addr)));
+ return cacheMemory.isTagPresent(getRegionBase(addr)) || cacheMemory.cacheAvail(getRegionBase(addr));
+ }
+
+ // Returns a region entry!
+ Entry getCacheEntry(Addr addr), return_by_pointer="yes" {
+ return static_cast(Entry, "pointer", cacheMemory.lookup(getRegionBase(addr)));
+ }
+
+ TBE getTBE(Addr addr), return_by_pointer="yes" {
+ return TBEs.lookup(getRegionBase(addr));
+ }
+
+ // getCacheEntry already applies getRegionBase; the extra application here
+ // is redundant but harmless (getRegionBase is idempotent).
+ DataBlock getDataBlock(Addr addr), return_by_ref="yes" {
+ return getCacheEntry(getRegionBase(addr)).DataBlk;
+ }
+
+ // Transient (TBE) state takes precedence over the stable entry state.
+ State getState(TBE tbe, Entry cache_entry, Addr addr) {
+ if (is_valid(tbe)) {
+ return tbe.TBEState;
+ } else if (is_valid(cache_entry)) {
+ return cache_entry.RegionState;
+ }
+ return State:NP;
+ }
+
+ void setState(TBE tbe, Entry cache_entry, Addr addr, State state) {
+ if (is_valid(tbe)) {
+ tbe.TBEState := state;
+ }
+ if (is_valid(cache_entry)) {
+ cache_entry.RegionState := state;
+ }
+ }
+
+ // Permission checks mirror getState's TBE-before-entry precedence.
+ AccessPermission getAccessPermission(Addr addr) {
+ TBE tbe := getTBE(addr);
+ if(is_valid(tbe)) {
+ return RegionDir_State_to_permission(tbe.TBEState);
+ }
+ Entry cache_entry := getCacheEntry(addr);
+ if(is_valid(cache_entry)) {
+ return RegionDir_State_to_permission(cache_entry.RegionState);
+ }
+ return AccessPermission:NotPresent;
+ }
+
+ void setAccessPermission(Entry cache_entry, Addr addr, State state) {
+ if (is_valid(cache_entry)) {
+ cache_entry.changePermission(RegionDir_State_to_permission(state));
+ }
+ }
+
+ // Region dir holds no data; functional accesses go straight to memory.
+ void functionalRead(Addr addr, Packet *pkt) {
+ functionalMemoryRead(pkt);
+ }
+
+ int functionalWrite(Addr addr, Packet *pkt) {
+ if (functionalMemoryWrite(pkt)) {
+ return 1;
+ } else {
+ return 0;
+ }
+ }
+
+ void recordRequestType(RequestType request_type, Addr addr) {
+ if (request_type == RequestType:DataArrayRead) {
+ cacheMemory.recordRequestType(CacheRequestType:DataArrayRead, addr);
+ } else if (request_type == RequestType:DataArrayWrite) {
+ cacheMemory.recordRequestType(CacheRequestType:DataArrayWrite, addr);
+ } else if (request_type == RequestType:TagArrayRead) {
+ cacheMemory.recordRequestType(CacheRequestType:TagArrayRead, addr);
+ } else if (request_type == RequestType:TagArrayWrite) {
+ cacheMemory.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+ }
+ }
+
+ bool checkResourceAvailable(RequestType request_type, Addr addr) {
+ if (request_type == RequestType:DataArrayRead) {
+ return cacheMemory.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:DataArrayWrite) {
+ return cacheMemory.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:TagArrayRead) {
+ return cacheMemory.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else if (request_type == RequestType:TagArrayWrite) {
+ return cacheMemory.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else {
+ error("Invalid RequestType type in checkResourceAvailable");
+ return true;
+ }
+ }
+
+ out_port(triggerQueue_out, TriggerMsg, triggerQueue);
+
+ out_port(requestNetwork_out, CPURequestMsg, requestToDir);
+ out_port(notifyNetwork_out, CPURequestMsg, notifyToRBuffer);
+ out_port(probeNetwork_out, NBProbeRequestMsg, probeToRBuffer);
+
+ // Self-triggered events; highest rank (2) so internal completion events
+ // are serviced before external responses and requests.
+ in_port(triggerQueue_in, TriggerMsg, triggerQueue, rank=2) {
+ if (triggerQueue_in.isReady(clockEdge())) {
+ peek(triggerQueue_in, TriggerMsg) {
+ assert(in_msg.addr == getRegionBase(in_msg.addr));
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ TBE tbe := getTBE(in_msg.addr);
+ DPRINTF(RubySlicc, "trigger msg: %s (%s)\n", in_msg, getRegionBase(in_msg.addr));
+ if (in_msg.Type == TriggerType:AcksComplete) {
+ assert(is_valid(tbe));
+ trigger(Event:LastAck, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.Type == TriggerType:InvRegion) {
+ assert(is_valid(tbe));
+ trigger(Event:TriggerInv, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.Type == TriggerType:DowngradeRegion) {
+ assert(is_valid(tbe));
+ trigger(Event:TriggerDowngrade, in_msg.addr, cache_entry, tbe);
+ } else {
+ error("Unknown trigger message");
+ }
+ }
+ }
+ }
+
+ // Responses (probe acks, private/wb acks) from the region buffers.
+ in_port(responseNetwork_in, ResponseMsg, responseFromRBuffer, rank=1) {
+ if (responseNetwork_in.isReady(clockEdge())) {
+ peek(responseNetwork_in, ResponseMsg) {
+ TBE tbe := getTBE(in_msg.addr);
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ if (in_msg.Type == CoherenceResponseType:CPUPrbResp) {
+ assert(in_msg.addr == getRegionBase(in_msg.addr));
+ assert(is_valid(tbe));
+ // NotCached means the responder never held the region: drop it
+ // from sharers rather than counting its eviction acks.
+ if (in_msg.NotCached) {
+ trigger(Event:InvAckCoreNoShare, in_msg.addr, cache_entry, tbe);
+ } else {
+ trigger(Event:InvAckCore, in_msg.addr, cache_entry, tbe);
+ }
+ } else if (in_msg.Type == CoherenceResponseType:PrivateAck) {
+ assert(in_msg.addr == getRegionBase(in_msg.addr));
+ assert(is_valid(cache_entry));
+ //Fix Me...add back in: assert(cache_entry.Sharers.isElement(in_msg.Sender));
+ trigger(Event:CPUPrivateAck, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.Type == CoherenceResponseType:RegionWbAck) {
+ //Fix Me...add back in: assert(cache_entry.Sharers.isElement(in_msg.Sender) == false);
+ assert(in_msg.addr == getRegionBase(in_msg.addr));
+ trigger(Event:WritebackAck, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.Type == CoherenceResponseType:DirReadyAck) {
+ assert(is_valid(tbe));
+ trigger(Event:DirReadyAck, getRegionBase(in_msg.addr), cache_entry, tbe);
+ } else {
+ error("Invalid response type");
+ }
+ }
+ }
+ }
+
+ // In from cores
+ // NOTE: We get the cache / TBE entry based on the region address,
+ // but pass the block address to the actions
+ in_port(requestNetwork_in, CPURequestMsg, requestFromRegBuf, rank=0) {
+ if (requestNetwork_in.isReady(clockEdge())) {
+ peek(requestNetwork_in, CPURequestMsg) {
+ //assert(in_msg.addr == getRegionBase(in_msg.addr));
+ Addr address := getRegionBase(in_msg.addr);
+ DPRINTF(RubySlicc, "Got %s, base %s\n", in_msg.addr, address);
+ if (presentOrAvail(address)) {
+ TBE tbe := getTBE(address);
+ Entry cache_entry := getCacheEntry(address);
+ if (in_msg.Type == CoherenceRequestType:PrivateRequest) {
+ // A private request from a non-owner (or while shared) first
+ // invalidates the other holders.
+ if (is_valid(cache_entry) && (cache_entry.Owner != in_msg.Requestor ||
+ getState(tbe, cache_entry, address) == State:S)) {
+ trigger(Event:SendInv, address, cache_entry, tbe);
+ } else {
+ trigger(Event:PrivateRequest, address, cache_entry, tbe);
+ }
+ } else if (in_msg.Type == CoherenceRequestType:SharedRequest) {
+ if (is_invalid(cache_entry)) {
+ // If no one has ever requested this region give private permissions
+ trigger(Event:PrivateRequest, address, cache_entry, tbe);
+ } else {
+ // Migration policies may convert a shared read into an
+ // upgrade (sole sharer) or an invalidate-then-own.
+ if (always_migrate ||
+ (sym_migrate && symMigrate(cache_entry)) ||
+ (asym_migrate && asymMigrate(cache_entry, in_msg.Requestor))) {
+ if (cache_entry.Sharers.count() == 1 &&
+ cache_entry.Sharers.isElement(in_msg.Requestor)) {
+ trigger(Event:UpgradeRequest, address, cache_entry, tbe);
+ } else {
+ trigger(Event:SendInv, address, cache_entry, tbe);
+ }
+ } else { // don't migrate
+ if(cache_entry.Sharers.isElement(in_msg.Requestor) ||
+ getState(tbe, cache_entry, address) == State:S) {
+ trigger(Event:SharedRequest, address, cache_entry, tbe);
+ } else {
+ trigger(Event:SendDowngrade, address, cache_entry, tbe);
+ }
+ }
+ }
+ } else if (in_msg.Type == CoherenceRequestType:UpgradeRequest) {
+ if (is_invalid(cache_entry)) {
+ trigger(Event:PrivateRequest, address, cache_entry, tbe);
+ } else if (cache_entry.Sharers.count() == 1 && cache_entry.Sharers.isElement(in_msg.Requestor)) {
+ trigger(Event:UpgradeRequest, address, cache_entry, tbe);
+ } else {
+ trigger(Event:SendUpgrade, address, cache_entry, tbe);
+ }
+ } else if (in_msg.Type == CoherenceRequestType:CleanWbRequest) {
+ if (is_invalid(cache_entry) || cache_entry.Sharers.isElement(in_msg.Requestor) == false) {
+ trigger(Event:StaleCleanWbRequest, address, cache_entry, tbe);
+ } else {
+ DPRINTF(RubySlicc, "wb address %s(%s) owner %s sharers %s requestor %s %d %d\n", in_msg.addr, getRegionBase(in_msg.addr), cache_entry.Owner, cache_entry.Sharers, in_msg.Requestor, cache_entry.Sharers.isElement(in_msg.Requestor), cache_entry.Sharers.count());
+ if (cache_entry.Sharers.isElement(in_msg.Requestor) && cache_entry.Sharers.count() == 1) {
+ DPRINTF(RubySlicc, "last wb\n");
+ trigger(Event:CleanWbRequest_LastSharer, address, cache_entry, tbe);
+ } else {
+ DPRINTF(RubySlicc, "clean wb\n");
+ trigger(Event:CleanWbRequest, address, cache_entry, tbe);
+ }
+ }
+ } else {
+ error("unknown region dir request type");
+ }
+ } else {
+ // No entry and no free way: evict a victim region first; the
+ // incoming request stays queued until the eviction completes.
+ Addr victim := cacheMemory.cacheProbe(getRegionBase(in_msg.addr));
+ TBE victim_tbe := getTBE(victim);
+ Entry victim_entry := getCacheEntry(victim);
+ DPRINTF(RubySlicc, "Evicting address %s for new region at address %s(%s)\n", victim, in_msg.addr, getRegionBase(in_msg.addr));
+ assert(is_valid(victim_entry));
+ trigger(Event:Evict, victim, victim_entry, victim_tbe);
+ }
+ }
+ }
+ }
+
+ // Actions
+
+ // The four f_fwd* variants below are intentionally near-identical; they
+ // differ only in NoAckNeeded / ForceShared and the Acks computation.
+ // SLICC has no shared-helper mechanism to factor out the common body.
+ action(f_fwdReqToDir, "f", desc="Forward CPU request to directory") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ enqueue(requestNetwork_out, CPURequestMsg, toDirLatency) {
+ out_msg.addr := in_msg.addr; // This is the block address. "address" is the region address
+ out_msg.Type := in_msg.OriginalType;
+ out_msg.DataBlk := in_msg.DataBlk;
+ out_msg.Dirty := in_msg.Dirty;
+ out_msg.Requestor := getCoreMachine(in_msg.Requestor,address);
+ out_msg.WTRequestor := in_msg.WTRequestor;
+ out_msg.Destination.add(map_Address_to_Directory(in_msg.addr));
+ out_msg.Shared := in_msg.Shared;
+ out_msg.MessageSize := in_msg.MessageSize;
+ out_msg.Private := in_msg.Private;
+ out_msg.NoAckNeeded := true;
+ out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+ out_msg.ProbeRequestStartTime := curCycle();
+ out_msg.DemandRequest := true;
+ if (is_valid(cache_entry) && getState(tbe, cache_entry, address) != State:S) {
+ out_msg.Acks := cache_entry.Sharers.count();
+ } else {
+ out_msg.Acks := 0;
+ }
+ }
+ }
+ }
+
+ action(f_fwdReqToDirShared, "fs", desc="Forward CPU request to directory (shared)") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ enqueue(requestNetwork_out, CPURequestMsg, toDirLatency) {
+ out_msg.addr := in_msg.addr; // This is the block address. "address" is the region address
+ out_msg.Type := in_msg.OriginalType;
+ out_msg.DataBlk := in_msg.DataBlk;
+ out_msg.Dirty := in_msg.Dirty;
+ out_msg.Requestor := getCoreMachine(in_msg.Requestor,address);
+ out_msg.WTRequestor := in_msg.WTRequestor;
+ out_msg.Destination.add(map_Address_to_Directory(in_msg.addr));
+ out_msg.Shared := in_msg.Shared;
+ out_msg.MessageSize := in_msg.MessageSize;
+ out_msg.Private := in_msg.Private;
+ out_msg.NoAckNeeded := true;
+ out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+ out_msg.ProbeRequestStartTime := curCycle();
+ out_msg.DemandRequest := true;
+ out_msg.ForceShared := true;
+ if (is_valid(cache_entry) && getState(tbe, cache_entry, address) != State:S) {
+ out_msg.Acks := cache_entry.Sharers.count();
+ } else {
+ out_msg.Acks := 0;
+ }
+ }
+ }
+ }
+
+ action(f_fwdReqToDirWithAck, "fa", desc="Forward CPU request to directory with ack request") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ enqueue(requestNetwork_out, CPURequestMsg, toDirLatency) {
+ out_msg.addr := in_msg.addr; // This is the block address. "address" is the region address
+ out_msg.Type := in_msg.OriginalType;
+ out_msg.DataBlk := in_msg.DataBlk;
+ out_msg.Dirty := in_msg.Dirty;
+ out_msg.Requestor := getCoreMachine(in_msg.Requestor,address);
+ out_msg.WTRequestor := in_msg.WTRequestor;
+ out_msg.Destination.add(map_Address_to_Directory(in_msg.addr));
+ out_msg.Shared := in_msg.Shared;
+ out_msg.MessageSize := in_msg.MessageSize;
+ out_msg.Private := in_msg.Private;
+ out_msg.NoAckNeeded := false;
+ out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+ out_msg.ProbeRequestStartTime := curCycle();
+ out_msg.DemandRequest := true;
+ if (is_valid(cache_entry)) {
+ out_msg.Acks := cache_entry.Sharers.count();
+ // Don't need an ack from the requestor!
+ if (cache_entry.Sharers.isElement(in_msg.Requestor)) {
+ out_msg.Acks := out_msg.Acks - 1;
+ }
+ } else {
+ out_msg.Acks := 0;
+ }
+ }
+ }
+ }
+
+ action(f_fwdReqToDirWithAckShared, "fas", desc="Forward CPU request to directory with ack request") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ enqueue(requestNetwork_out, CPURequestMsg, toDirLatency) {
+ out_msg.addr := in_msg.addr; // This is the block address. "address" is the region address
+ out_msg.Type := in_msg.OriginalType;
+ out_msg.DataBlk := in_msg.DataBlk;
+ out_msg.Dirty := in_msg.Dirty;
+ out_msg.Requestor := getCoreMachine(in_msg.Requestor,address);
+ out_msg.WTRequestor := in_msg.WTRequestor;
+ out_msg.Destination.add(map_Address_to_Directory(in_msg.addr));
+ out_msg.Shared := in_msg.Shared;
+ out_msg.MessageSize := in_msg.MessageSize;
+ out_msg.Private := in_msg.Private;
+ out_msg.NoAckNeeded := false;
+ out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+ out_msg.ProbeRequestStartTime := curCycle();
+ out_msg.DemandRequest := true;
+ out_msg.ForceShared := true;
+ if (is_valid(cache_entry)) {
+ out_msg.Acks := cache_entry.Sharers.count();
+ // Don't need an ack from the requestor!
+ if (cache_entry.Sharers.isElement(in_msg.Requestor)) {
+ out_msg.Acks := out_msg.Acks - 1;
+ }
+ } else {
+ out_msg.Acks := 0;
+ }
+ }
+ }
+ }
+
+ action(a_allocateRegionEntry, "a", desc="Allocate a new entry") {
+ set_cache_entry(cacheMemory.allocate(getRegionBase(address), new Entry));
+ peek(requestNetwork_in, CPURequestMsg) {
+ APPEND_TRANSITION_COMMENT(in_msg.Requestor);
+ }
+ }
+
+ action(d_deallocateRegionEntry, "d", desc="Deallocate region entry") {
+ cacheMemory.deallocate(getRegionBase(address));
+ unset_cache_entry();
+ }
+
+  action(ra_receiveAck, "ra", desc="Mark TBE entry as received this ack") {
+    //assert(tbe.ValidBlocks.at(getRegionOffset(address)));
+    DPRINTF(RubySlicc, "received ack for %s reg: %s\n", address, getRegionBase(address));
+    tbe.NumValidBlocks := tbe.NumValidBlocks - 1;  // one fewer ack outstanding for this region
+    assert(tbe.NumValidBlocks >= 0);
+    if (tbe.NumValidBlocks == 0) {
+      tbe.AllAcksReceived := true;
+      enqueue(triggerQueue_out, TriggerMsg, 1) {  // all acks in: fire completion trigger
+        out_msg.Type := TriggerType:AcksComplete;
+        out_msg.addr := address;
+      }
+    }
+    APPEND_TRANSITION_COMMENT(getRegionBase(address));
+    APPEND_TRANSITION_COMMENT(" Acks left receive ");
+    APPEND_TRANSITION_COMMENT(tbe.NumValidBlocks);
+  }
+
+ action(ca_checkAcks, "ca", desc="Check to see if we need more acks") {
+ if (tbe.NumValidBlocks == 0) {
+ tbe.AllAcksReceived := true;
+ enqueue(triggerQueue_out, TriggerMsg, 1) {
+ out_msg.Type := TriggerType:AcksComplete;
+ out_msg.addr := address;
+ }
+ }
+ }
+
+  // Queue an internal trigger that starts the region-invalidate probe sequence.
+  action(ti_triggerInv, "ti", desc="Trigger invalidate of this region") {
+    enqueue(triggerQueue_out, TriggerMsg, 1) {
+      out_msg.Type := TriggerType:InvRegion;
+      out_msg.addr := address;
+    }
+  }
+
+  // Queue an internal trigger that starts the region-downgrade probe sequence.
+  action(td_triggerDowngrade, "td", desc="Trigger downgrade of this region") {
+    enqueue(triggerQueue_out, TriggerMsg, 1) {
+      out_msg.Type := TriggerType:DowngradeRegion;
+      out_msg.addr := address;
+    }
+  }
+
+  action(t_allocateTBE, "t", desc="allocate TBE Entry") {
+    check_allocate(TBEs);
+    TBEs.allocate(getRegionBase(address));  // TBEs are keyed at region granularity
+    set_tbe(getTBE(address));
+    if (is_valid(cache_entry)) {
+      tbe.Owner := cache_entry.Owner;  // snapshot owner/sharers for the probe phase
+      tbe.Sharers := cache_entry.Sharers;
+      tbe.AllAcksReceived := true; // assume no acks are required
+    }
+    tbe.ProbeRequestTime := curCycle();
+    peek(requestNetwork_in, CPURequestMsg) {
+      tbe.InitialRequestTime := in_msg.InitialRequestTime;
+      tbe.DemandAddress := in_msg.addr;  // block (not region) address of the demand request
+    }
+    APPEND_TRANSITION_COMMENT(getRegionBase(address));
+    APPEND_TRANSITION_COMMENT(" Acks left ");  // NOTE(review): NumValidBlocks printed before sns/sno set it — TODO confirm default
+    APPEND_TRANSITION_COMMENT(tbe.NumValidBlocks);
+    APPEND_TRANSITION_COMMENT(" Owner, ");
+    APPEND_TRANSITION_COMMENT(tbe.Owner);
+    APPEND_TRANSITION_COMMENT(" sharers, ");
+    APPEND_TRANSITION_COMMENT(tbe.Sharers);
+  }
+
+ action(ss_setSharers, "ss", desc="Add requestor to sharers") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ cache_entry.Sharers.add(in_msg.Requestor);
+ APPEND_TRANSITION_COMMENT(cache_entry.Sharers);
+ }
+ }
+
+  // Drop the requesting core's region buffer from this region's sharer set.
+  action(rs_removeSharer, "rs", desc="Remove requestor from sharers") {
+    peek(requestNetwork_in, CPURequestMsg) {
+      cache_entry.Sharers.remove(in_msg.Requestor);
+      APPEND_TRANSITION_COMMENT(" removing ");
+      APPEND_TRANSITION_COMMENT(in_msg.Requestor);
+      APPEND_TRANSITION_COMMENT(" sharers ");
+      APPEND_TRANSITION_COMMENT(cache_entry.Sharers);
+    }
+  }
+
+  // Drop the responding core (the probe-ack sender) from the sharer set.
+  action(rsr_removeSharerResponse, "rsr", desc="Remove sender from sharers") {
+    peek(responseNetwork_in, ResponseMsg) {
+      cache_entry.Sharers.remove(in_msg.Sender);
+      APPEND_TRANSITION_COMMENT(cache_entry.Sharers);
+    }
+  }
+
+  // Empty the region's sharer set (e.g. before granting exclusive ownership).
+  action(cs_clearSharers, "cs", desc="Clear the sharers list") {
+    cache_entry.Sharers.clear();
+  }
+
+ action(so_setOwner, "so", desc="Set the owner to the requestor") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ cache_entry.Owner := in_msg.Requestor;
+ APPEND_TRANSITION_COMMENT(" Owner now: ");
+ APPEND_TRANSITION_COMMENT(cache_entry.Owner);
+ }
+ }
+
+ action(rr_removeRequestorFromTBE, "rr", desc="Remove requestor from TBE sharers") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ tbe.Sharers.remove(in_msg.Requestor);
+ }
+ }
+
+ action(ur_updateDirtyStatusOnRequest, "ur", desc="Update dirty status on demand request") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ if (is_valid(cache_entry)) {
+ if ((in_msg.Type == CoherenceRequestType:SharedRequest) &&
+ (cache_entry.Sharers.isElement(in_msg.Requestor) == false)) {
+ cache_entry.LastWriten := false;
+ if (isCpuMachine(in_msg.Requestor)) {
+ cache_entry.LastWritenByCpu := false;
+ } else {
+ cache_entry.LastWritenByGpu := false;
+ }
+ } else if ((in_msg.Type == CoherenceRequestType:PrivateRequest) ||
+ (in_msg.Type == CoherenceRequestType:UpgradeRequest)) {
+ cache_entry.LastWriten := true;
+ if (isCpuMachine(in_msg.Requestor)) {
+ cache_entry.LastWritenByCpu := true;
+ } else {
+ cache_entry.LastWritenByGpu := true;
+ }
+ }
+ }
+ }
+ }
+
+ action(ud_updateDirtyStatusWithWb, "ud", desc="Update dirty status on writeback") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ if (is_valid(cache_entry) && in_msg.Dirty) {
+ cache_entry.LastWriten := true;
+ if (isCpuMachine(in_msg.Requestor)) {
+ cache_entry.LastWritenByCpu := true;
+ } else {
+ cache_entry.LastWritenByGpu := true;
+ }
+ }
+ }
+ }
+
+ action(sns_setNumAcksSharers, "sns", desc="Set number of acks to one per shared region buffer") {
+ assert(is_valid(tbe));
+ assert(is_valid(cache_entry));
+ tbe.NumValidBlocks := tbe.Sharers.count();
+ }
+
+  // Expect exactly one ack (single remaining sharer / single probe target).
+  action(sno_setNumAcksOne, "sno", desc="Set number of acks to one") {
+    assert(is_valid(tbe));
+    assert(is_valid(cache_entry));
+    tbe.NumValidBlocks := 1;
+  }
+
+ action(dt_deallocateTBE, "dt", desc="deallocate TBE Entry") {
+ TBEs.deallocate(getRegionBase(address));
+ APPEND_TRANSITION_COMMENT(" reg: ");
+ APPEND_TRANSITION_COMMENT(getRegionBase(address));
+ unset_tbe();
+ }
+
+ action(wb_sendWbNotice, "wb", desc="Send notice to cache that writeback is acknowledged") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ enqueue(notifyNetwork_out, CPURequestMsg, 1) {
+ out_msg.addr := getRegionBase(address);
+ out_msg.Type := CoherenceRequestType:WbNotify;
+ out_msg.Destination.add(in_msg.Requestor);
+ out_msg.Requestor := machineID;
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+ }
+ }
+ }
+
+ action(wbn_sendWbNoticeNoAck, "wbn", desc="Send notice to cache that writeback is acknowledged (no ack needed)") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ enqueue(notifyNetwork_out, CPURequestMsg, 1) {
+ out_msg.addr := getRegionBase(address);
+ out_msg.Type := CoherenceRequestType:WbNotify;
+ out_msg.Destination.add(in_msg.Requestor);
+ out_msg.Requestor := machineID;
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+ out_msg.NoAckNeeded := true;
+ }
+ }
+ }
+
+ action(b_sendPrivateNotice, "b", desc="Send notice to private cache that it has private access") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ enqueue(notifyNetwork_out, CPURequestMsg, 1) {
+ out_msg.addr := getRegionBase(address);
+ out_msg.Type := CoherenceRequestType:PrivateNotify;
+ out_msg.Destination.add(in_msg.Requestor);
+ out_msg.Requestor := machineID;
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+ }
+ }
+ }
+
+  // Notify the requesting private cache that it now holds this region shared.
+  action(bs_sendSharedNotice, "bs", desc="Send notice to private cache that it has shared access") {
+    peek(requestNetwork_in, CPURequestMsg) {
+      enqueue(notifyNetwork_out, CPURequestMsg, 1) {
+        out_msg.addr := getRegionBase(address);
+        out_msg.Type := CoherenceRequestType:SharedNotify;
+        out_msg.Destination.add(in_msg.Requestor);
+        out_msg.Requestor := machineID;
+        out_msg.MessageSize := MessageSizeType:Request_Control;
+        out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+      }
+    }
+  }
+
+ action(c_sendSharedNoticeToOrigReq, "c", desc="Send notice to private cache that it has shared access") {
+ assert(is_valid(tbe));
+ enqueue(notifyNetwork_out, CPURequestMsg, 1) {
+ out_msg.addr := getRegionBase(address);
+ out_msg.Type := CoherenceRequestType:SharedNotify;
+ out_msg.Destination.add(tbe.Owner);
+ out_msg.Requestor := machineID;
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ out_msg.ProbeRequestStartTime := tbe.ProbeRequestTime;
+ out_msg.InitialRequestTime := tbe.InitialRequestTime;
+ APPEND_TRANSITION_COMMENT("dest: ");
+ APPEND_TRANSITION_COMMENT(out_msg.Destination);
+ }
+ }
+
+ action(sp_sendPrivateNoticeToOrigReq, "sp", desc="Send notice to private cache that it has private access") {
+ assert(is_valid(tbe));
+ enqueue(notifyNetwork_out, CPURequestMsg, 1) {
+ out_msg.addr := getRegionBase(address);
+ out_msg.Type := CoherenceRequestType:PrivateNotify;
+ out_msg.Destination.add(tbe.Owner);
+ out_msg.Requestor := machineID;
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ out_msg.ProbeRequestStartTime := tbe.ProbeRequestTime;
+ out_msg.InitialRequestTime := tbe.InitialRequestTime;
+ APPEND_TRANSITION_COMMENT("dest: ");
+ APPEND_TRANSITION_COMMENT(out_msg.Destination);
+ }
+ }
+
+ action(i_RegionInvNotify, "i", desc="Send notice to private cache that it no longer has private access") {
+ enqueue(probeNetwork_out, NBProbeRequestMsg, 1) {
+ out_msg.addr := address;
+ out_msg.DemandAddress := tbe.DemandAddress;
+ //out_msg.Requestor := tbe.Requestor;
+ out_msg.Requestor := machineID;
+ out_msg.Type := ProbeRequestType:PrbInv;
+ //Fix me: assert(tbe.Sharers.count() > 0);
+ out_msg.DemandRequest := true;
+ out_msg.Destination := tbe.Sharers;
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ APPEND_TRANSITION_COMMENT("dest: ");
+ APPEND_TRANSITION_COMMENT(out_msg.Destination);
+ }
+ }
+
+ action(i0_RegionInvNotifyDemand0, "i0", desc="Send notice to private cache that it no longer has private access") {
+ enqueue(probeNetwork_out, NBProbeRequestMsg, 1) {
+ out_msg.addr := address;
+ // Demand address should default to 0 -> out_msg.DemandAddress := 0;
+ out_msg.Requestor := machineID;
+ out_msg.Type := ProbeRequestType:PrbInv;
+ out_msg.Destination := tbe.Sharers;
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ APPEND_TRANSITION_COMMENT("dest: ");
+ APPEND_TRANSITION_COMMENT(out_msg.Destination);
+ }
+ }
+
+ action(rd_RegionDowngrade, "rd", desc="Send notice to private cache that it only has shared access") {
+ enqueue(probeNetwork_out, NBProbeRequestMsg, 1) {
+ out_msg.addr := address;
+ out_msg.DemandAddress := tbe.DemandAddress;
+ out_msg.Requestor := machineID;
+ out_msg.Type := ProbeRequestType:PrbDowngrade;
+ out_msg.DemandRequest := true;
+ out_msg.Destination := tbe.Sharers;
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ APPEND_TRANSITION_COMMENT("dest: ");
+ APPEND_TRANSITION_COMMENT(out_msg.Destination);
+ }
+ }
+
+ action(p_popRequestQueue, "p", desc="Pop the request queue") {
+ requestNetwork_in.dequeue(clockEdge());
+ }
+
+ action(pt_popTriggerQueue, "pt", desc="Pop the trigger queue") {
+ triggerQueue_in.dequeue(clockEdge());
+ }
+
+ action(pr_popResponseQueue, "pr", desc="Pop the response queue") {
+ responseNetwork_in.dequeue(clockEdge());
+ }
+
+ action(s_stallAndWaitRequest, "s", desc="Stall and wait on the region address") {
+ Addr regAddr := getRegionBase(address);
+ stall_and_wait(requestNetwork_in, regAddr);
+ }
+
+ action(w_wakeUpRegionDependents, "w", desc="Wake up any requests waiting for this region") {
+ wakeUpBuffers(getRegionBase(address));
+ }
+
+ action(wa_wakeUpAllDependents, "wa", desc="Wake up any requests waiting for this region") {
+ wakeUpAllBuffers();
+ }
+
+  // Re-enqueue the head request after recycle_latency; used while evictions drain.
+  action(zz_recycleRequestQueue, "\z", desc="Recycle the request queue") {
+    requestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+  }
+
+ action(z_stall, "z", desc="stall request queue") {
+ // fake state
+ }
+
+ action(mru_setMRU, "mru", desc="set MRU") {
+ cacheMemory.setMRU(address);
+ }
+
+  // Transitions
+
+ transition({NP_P, P_P, NP_S, S_S, S_P, P_S, P_NP, S_AP, P_AS, P_AP, SP_NP_W, S_W, P_AP_W, P_AS_W, S_AP_W}, {PrivateRequest, SharedRequest, UpgradeRequest, SendInv, SendUpgrade, SendDowngrade, CleanWbRequest, CleanWbRequest_LastSharer, StaleCleanWbRequest}) {
+ s_stallAndWaitRequest
+ }
+
+ transition({NP_P, P_P, NP_S, S_S, S_P, S_W, P_S, P_NP, S_AP, P_AS, P_AP, P_AP_W, P_AS_W, S_AP_W}, Evict) {
+ zz_recycleRequestQueue;
+ }
+
+ transition(NP, {PrivateRequest, SendUpgrade}, NP_P) {TagArrayRead, TagArrayWrite} {
+ a_allocateRegionEntry;
+ ur_updateDirtyStatusOnRequest;
+ f_fwdReqToDir;
+ b_sendPrivateNotice;
+ so_setOwner;
+ ss_setSharers;
+ t_allocateTBE;
+ p_popRequestQueue;
+ }
+
+ transition(P, {PrivateRequest, UpgradeRequest}, P_P) {TagArrayRead} {
+ mru_setMRU;
+ ur_updateDirtyStatusOnRequest;
+ f_fwdReqToDir;
+ b_sendPrivateNotice;
+ t_allocateTBE;
+ p_popRequestQueue;
+ }
+
+ transition({NP_P, P_P}, CPUPrivateAck, P) {
+ dt_deallocateTBE;
+ w_wakeUpRegionDependents;
+ pr_popResponseQueue;
+ }
+
+ transition({NP, P, S}, StaleCleanWbRequest) {TagArrayRead, TagArrayWrite} {
+ wbn_sendWbNoticeNoAck;
+ ud_updateDirtyStatusWithWb;
+ p_popRequestQueue;
+ }
+
+ transition(NP, SharedRequest, NP_S) {TagArrayRead, TagArrayWrite} {
+ a_allocateRegionEntry;
+ ur_updateDirtyStatusOnRequest;
+ f_fwdReqToDirShared;
+ bs_sendSharedNotice;
+ so_setOwner;
+ ss_setSharers;
+ t_allocateTBE;
+ p_popRequestQueue;
+ }
+
+ // Could probably do this in parallel with other shared requests
+ transition(S, SharedRequest, S_S) {TagArrayRead, TagArrayWrite} {
+ mru_setMRU;
+ ur_updateDirtyStatusOnRequest;
+ f_fwdReqToDirShared;
+ bs_sendSharedNotice;
+ ss_setSharers;
+ t_allocateTBE;
+ p_popRequestQueue;
+ }
+
+ transition({P, S}, CleanWbRequest_LastSharer, SP_NP_W) {TagArrayRead, TagArrayWrite} {
+ ud_updateDirtyStatusWithWb;
+ wb_sendWbNotice;
+ rs_removeSharer;
+ t_allocateTBE;
+ d_deallocateRegionEntry;
+ p_popRequestQueue;
+ }
+
+ transition(S, CleanWbRequest, S_W) {TagArrayRead, TagArrayWrite} {
+ ud_updateDirtyStatusWithWb;
+ wb_sendWbNotice;
+ rs_removeSharer;
+ t_allocateTBE;
+ p_popRequestQueue;
+ }
+
+ transition(SP_NP_W, WritebackAck, NP) {
+ dt_deallocateTBE;
+ w_wakeUpRegionDependents;
+ pr_popResponseQueue;
+ }
+
+ transition(S_W, WritebackAck, S) {
+ dt_deallocateTBE;
+ w_wakeUpRegionDependents;
+ pr_popResponseQueue;
+ }
+
+ transition({NP_S, S_S}, CPUPrivateAck, S) {
+ dt_deallocateTBE;
+ w_wakeUpRegionDependents;
+ pr_popResponseQueue;
+ }
+
+ transition(S, UpgradeRequest, S_P) {TagArrayRead, TagArrayWrite} {
+ mru_setMRU;
+ ur_updateDirtyStatusOnRequest;
+ f_fwdReqToDir;
+ b_sendPrivateNotice;
+ so_setOwner;
+ t_allocateTBE;
+ p_popRequestQueue;
+ }
+
+ transition(S_P, CPUPrivateAck, P) {
+ dt_deallocateTBE;
+ w_wakeUpRegionDependents;
+ pr_popResponseQueue;
+ }
+
+ transition(P, SendInv, P_AP_W) {TagArrayRead, TagArrayWrite} {
+ mru_setMRU;
+ ur_updateDirtyStatusOnRequest;
+ f_fwdReqToDirWithAck;
+ so_setOwner;
+ t_allocateTBE;
+ rr_removeRequestorFromTBE;
+ sns_setNumAcksSharers;
+ cs_clearSharers;
+ ss_setSharers;
+ //i_RegionInvNotify;
+ p_popRequestQueue;
+ }
+
+ transition({P_AP_W, S_AP_W}, DirReadyAck) {
+ ti_triggerInv;
+ pr_popResponseQueue;
+ }
+
+ transition(P_AS_W, DirReadyAck) {
+ td_triggerDowngrade;
+ pr_popResponseQueue;
+ }
+
+ transition(P_AS_W, TriggerDowngrade, P_AS) {
+ rd_RegionDowngrade;
+ pt_popTriggerQueue;
+ }
+
+ transition(P_AP_W, TriggerInv, P_AP) {
+ i_RegionInvNotify;
+ pt_popTriggerQueue;
+ }
+
+ transition(S_AP_W, TriggerInv, S_AP) {
+ i_RegionInvNotify;
+ pt_popTriggerQueue;
+ }
+
+ transition(P, SendUpgrade, P_AP_W) {TagArrayRead, TagArrayWrite} {
+ mru_setMRU;
+ ur_updateDirtyStatusOnRequest;
+ f_fwdReqToDirWithAck;
+ so_setOwner;
+ t_allocateTBE;
+ rr_removeRequestorFromTBE;
+ sns_setNumAcksSharers;
+ cs_clearSharers;
+ ss_setSharers;
+ p_popRequestQueue;
+ }
+
+ transition(P, Evict, P_NP) {TagArrayRead, TagArrayWrite} {
+ t_allocateTBE;
+ sns_setNumAcksSharers;
+ i0_RegionInvNotifyDemand0;
+ d_deallocateRegionEntry;
+ }
+
+ transition(S, SendInv, P_AP_W) {TagArrayRead, TagArrayWrite} {
+ mru_setMRU;
+ ur_updateDirtyStatusOnRequest;
+ f_fwdReqToDirWithAck;
+ so_setOwner;
+ t_allocateTBE;
+ rr_removeRequestorFromTBE;
+ sns_setNumAcksSharers;
+ cs_clearSharers;
+ ss_setSharers;
+ p_popRequestQueue;
+ }
+
+ transition(S, Evict, P_NP) {TagArrayRead, TagArrayWrite} {
+ t_allocateTBE;
+ sns_setNumAcksSharers;
+ i0_RegionInvNotifyDemand0;
+ d_deallocateRegionEntry;
+ }
+
+ transition(P_NP, LastAck, NP) {
+ dt_deallocateTBE;
+ wa_wakeUpAllDependents;
+ pt_popTriggerQueue;
+ }
+
+ transition(S, SendUpgrade, S_AP_W) {TagArrayRead, TagArrayWrite} {
+ mru_setMRU;
+ ur_updateDirtyStatusOnRequest;
+ f_fwdReqToDirWithAck;
+ so_setOwner;
+ t_allocateTBE;
+ rr_removeRequestorFromTBE;
+ sns_setNumAcksSharers;
+ cs_clearSharers;
+ ss_setSharers;
+ p_popRequestQueue;
+ }
+
+ transition(S_AP, LastAck, S_P) {
+ sp_sendPrivateNoticeToOrigReq;
+ pt_popTriggerQueue;
+ }
+
+ transition(P_AP, LastAck, P_P) {
+ sp_sendPrivateNoticeToOrigReq;
+ pt_popTriggerQueue;
+ }
+
+ transition(P, SendDowngrade, P_AS_W) {TagArrayRead, TagArrayWrite} {
+ mru_setMRU;
+ ur_updateDirtyStatusOnRequest;
+ f_fwdReqToDirWithAckShared;
+ so_setOwner;
+ t_allocateTBE;
+ sns_setNumAcksSharers;
+ ss_setSharers; //why do we set the sharers before sending the downgrade? Are we sending a downgrade to the requestor?
+ p_popRequestQueue;
+ }
+
+ transition(P_AS, LastAck, P_S) {
+ c_sendSharedNoticeToOrigReq;
+ pt_popTriggerQueue;
+ }
+
+ transition(P_S, CPUPrivateAck, S) {
+ dt_deallocateTBE;
+ w_wakeUpRegionDependents;
+ pr_popResponseQueue;
+ }
+
+ transition({P_NP, P_AS, S_AP, P_AP}, InvAckCore) {} {
+ ra_receiveAck;
+ pr_popResponseQueue;
+ }
+
+ transition({P_NP, S_AP, P_AP}, InvAckCoreNoShare) {} {
+ ra_receiveAck;
+ pr_popResponseQueue;
+ }
+
+ transition(P_AS, InvAckCoreNoShare) {} {
+ ra_receiveAck;
+ rsr_removeSharerResponse;
+ pr_popResponseQueue;
+ }
+
+}
+
+
diff --git a/src/mem/protocol/MOESI_AMD_Base-dir.sm b/src/mem/protocol/MOESI_AMD_Base-dir.sm
new file mode 100644
index 000000000..52cefda66
--- /dev/null
+++ b/src/mem/protocol/MOESI_AMD_Base-dir.sm
@@ -0,0 +1,1137 @@
+/*
+ * Copyright (c) 2010-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Lisa Hsu
+ */
+
+machine(MachineType:Directory, "AMD Baseline protocol")
+: DirectoryMemory * directory;
+ CacheMemory * L3CacheMemory;
+ Cycles response_latency := 5;
+ Cycles l3_hit_latency := 50;
+ bool noTCCdir := "False";
+ bool CPUonly := "False";
+ int TCC_select_num_bits;
+ bool useL3OnWT := "False";
+ Cycles to_memory_controller_latency := 1;
+
+ // From the Cores
+ MessageBuffer * requestFromCores, network="From", virtual_network="0", vnet_type="request";
+ MessageBuffer * responseFromCores, network="From", virtual_network="2", vnet_type="response";
+ MessageBuffer * unblockFromCores, network="From", virtual_network="4", vnet_type="unblock";
+
+ MessageBuffer * probeToCore, network="To", virtual_network="0", vnet_type="request";
+ MessageBuffer * responseToCore, network="To", virtual_network="2", vnet_type="response";
+
+ MessageBuffer * triggerQueue;
+ MessageBuffer * L3triggerQueue;
+ MessageBuffer * responseFromMemory;
+{
+ // STATES
+ state_declaration(State, desc="Directory states", default="Directory_State_U") {
+ U, AccessPermission:Backing_Store, desc="unblocked";
+ BL, AccessPermission:Busy, desc="got L3 WB request";
+ // BL is Busy because it's possible for the data only to be in the network
+ // in the WB, L3 has sent it and gone on with its business in possibly I
+ // state.
+ BS_M, AccessPermission:Backing_Store, desc="blocked waiting for memory";
+ BM_M, AccessPermission:Backing_Store, desc="blocked waiting for memory";
+ B_M, AccessPermission:Backing_Store, desc="blocked waiting for memory";
+ BP, AccessPermission:Backing_Store, desc="blocked waiting for probes, no need for memory";
+ BS_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory";
+ BM_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory";
+ B_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory";
+ BS_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory";
+ BM_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory";
+ B_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory";
+ B, AccessPermission:Backing_Store, desc="sent response, Blocked til ack";
+ }
+
+ // Events
+ enumeration(Event, desc="Directory events") {
+ // CPU requests
+ RdBlkS, desc="...";
+ RdBlkM, desc="...";
+ RdBlk, desc="...";
+ CtoD, desc="...";
+ WriteThrough, desc="WriteThrough Message";
+ Atomic, desc="Atomic Message";
+
+ // writebacks
+ VicDirty, desc="...";
+ VicClean, desc="...";
+ CPUData, desc="WB data from CPU";
+ StaleWB, desc="Notification that WB has been superceded by a probe";
+
+ // probe responses
+ CPUPrbResp, desc="Probe Response Msg";
+
+ ProbeAcksComplete, desc="Probe Acks Complete";
+
+ L3Hit, desc="Hit in L3 return data to core";
+
+ // Memory Controller
+ MemData, desc="Fetched data from memory arrives";
+ WBAck, desc="Writeback Ack from memory arrives";
+
+ CoreUnblock, desc="Core received data, unblock";
+ UnblockWriteThrough, desc="Unblock because of writethrough request finishing";
+
+ StaleVicDirty, desc="Core invalidated before VicDirty processed";
+ }
+
+ enumeration(RequestType, desc="To communicate stats from transitions to recordStats") {
+ L3DataArrayRead, desc="Read the data array";
+ L3DataArrayWrite, desc="Write the data array";
+ L3TagArrayRead, desc="Read the data array";
+ L3TagArrayWrite, desc="Write the data array";
+ }
+
+ // TYPES
+
+ // DirectoryEntry
+ structure(Entry, desc="...", interface="AbstractEntry") {
+ State DirectoryState, desc="Directory state";
+ DataBlock DataBlk, desc="data for the block";
+ NetDest VicDirtyIgnore, desc="VicDirty coming from whom to ignore";
+ }
+
+ structure(CacheEntry, desc="...", interface="AbstractCacheEntry") {
+ DataBlock DataBlk, desc="data for the block";
+ MachineID LastSender, desc="Mach which this block came from";
+ }
+
+ structure(TBE, desc="...") {
+ State TBEState, desc="Transient state";
+ DataBlock DataBlk, desc="data for the block";
+ bool Dirty, desc="Is the data dirty?";
+ int NumPendingAcks, desc="num acks expected";
+ MachineID OriginalRequestor, desc="Original Requestor";
+ MachineID WTRequestor, desc="WT Requestor";
+ bool Cached, desc="data hit in Cache";
+ bool MemData, desc="Got MemData?",default="false";
+ bool wtData, desc="Got write through data?",default="false";
+ bool atomicData, desc="Got Atomic op?",default="false";
+ Cycles InitialRequestTime, desc="...";
+ Cycles ForwardRequestTime, desc="...";
+ Cycles ProbeRequestStartTime, desc="...";
+ MachineID LastSender, desc="Mach which this block came from";
+ bool L3Hit, default="false", desc="Was this an L3 hit?";
+ uint64_t probe_id, desc="probe id for lifetime profiling";
+ WriteMask writeMask, desc="outstanding write through mask";
+ }
+
+ structure(TBETable, external="yes") {
+ TBE lookup(Addr);
+ void allocate(Addr);
+ void deallocate(Addr);
+ bool isPresent(Addr);
+ }
+
+ TBETable TBEs, template="<Directory_TBE>", constructor="m_number_of_TBEs";
+
+ int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()";
+
+ Tick clockEdge();
+ Tick cyclesToTicks(Cycles c);
+
+ void set_tbe(TBE a);
+ void unset_tbe();
+ void wakeUpAllBuffers();
+ void wakeUpBuffers(Addr a);
+ Cycles curCycle();
+
+  Entry getDirectoryEntry(Addr addr), return_by_pointer="yes" {
+    Entry dir_entry := static_cast(Entry, "pointer", directory.lookup(addr));
+
+    if (is_valid(dir_entry)) {
+      return dir_entry;
+    }
+
+    dir_entry := static_cast(Entry, "pointer",
+                             directory.allocate(addr, new Entry));  // allocate-on-miss: dir entry created on first touch
+    return dir_entry;
+  }
+
+ DataBlock getDataBlock(Addr addr), return_by_ref="yes" {
+ TBE tbe := TBEs.lookup(addr);
+ if (is_valid(tbe) && tbe.MemData) {
+ DPRINTF(RubySlicc, "Returning DataBlk from TBE %s:%s\n", addr, tbe);
+ return tbe.DataBlk;
+ }
+ DPRINTF(RubySlicc, "Returning DataBlk from Dir %s:%s\n", addr, getDirectoryEntry(addr));
+ return getDirectoryEntry(addr).DataBlk;
+ }
+
+ State getState(TBE tbe, CacheEntry entry, Addr addr) {
+ return getDirectoryEntry(addr).DirectoryState;
+ }
+
+ void setState(TBE tbe, CacheEntry entry, Addr addr, State state) {
+ getDirectoryEntry(addr).DirectoryState := state;
+ }
+
+ void functionalRead(Addr addr, Packet *pkt) {
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ testAndRead(addr, tbe.DataBlk, pkt);
+ } else {
+ functionalMemoryRead(pkt);
+ }
+ }
+
+ int functionalWrite(Addr addr, Packet *pkt) {
+ int num_functional_writes := 0;
+
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ num_functional_writes := num_functional_writes +
+ testAndWrite(addr, tbe.DataBlk, pkt);
+ }
+
+ num_functional_writes := num_functional_writes
+ + functionalMemoryWrite(pkt);
+ return num_functional_writes;
+ }
+
+ AccessPermission getAccessPermission(Addr addr) {
+ // For this Directory, all permissions are just tracked in Directory, since
+ // it's not possible to have something in TBE but not Dir, just keep track
+ // of state all in one place.
+ if (directory.isPresent(addr)) {
+ return Directory_State_to_permission(getDirectoryEntry(addr).DirectoryState);
+ }
+
+ return AccessPermission:NotPresent;
+ }
+
+ void setAccessPermission(CacheEntry entry, Addr addr, State state) {
+ getDirectoryEntry(addr).changePermission(Directory_State_to_permission(state));
+ }
+
+ void recordRequestType(RequestType request_type, Addr addr) {
+ if (request_type == RequestType:L3DataArrayRead) {
+ L3CacheMemory.recordRequestType(CacheRequestType:DataArrayRead, addr);
+ } else if (request_type == RequestType:L3DataArrayWrite) {
+ L3CacheMemory.recordRequestType(CacheRequestType:DataArrayWrite, addr);
+ } else if (request_type == RequestType:L3TagArrayRead) {
+ L3CacheMemory.recordRequestType(CacheRequestType:TagArrayRead, addr);
+ } else if (request_type == RequestType:L3TagArrayWrite) {
+ L3CacheMemory.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+ }
+ }
+
+ bool checkResourceAvailable(RequestType request_type, Addr addr) {
+ if (request_type == RequestType:L3DataArrayRead) {
+ return L3CacheMemory.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:L3DataArrayWrite) {
+ return L3CacheMemory.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:L3TagArrayRead) {
+ return L3CacheMemory.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else if (request_type == RequestType:L3TagArrayWrite) {
+ return L3CacheMemory.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else {
+ error("Invalid RequestType type in checkResourceAvailable");
+ return true;
+ }
+ }
+
+ // ** OUT_PORTS **
+ out_port(probeNetwork_out, NBProbeRequestMsg, probeToCore);
+ out_port(responseNetwork_out, ResponseMsg, responseToCore);
+
+ out_port(triggerQueue_out, TriggerMsg, triggerQueue);
+ out_port(L3TriggerQueue_out, TriggerMsg, L3triggerQueue);
+
+ // ** IN_PORTS **
+
+ // Trigger Queue
+ in_port(triggerQueue_in, TriggerMsg, triggerQueue, rank=5) {
+ if (triggerQueue_in.isReady(clockEdge())) {
+ peek(triggerQueue_in, TriggerMsg) {
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr));
+ if (in_msg.Type == TriggerType:AcksComplete) {
+ trigger(Event:ProbeAcksComplete, in_msg.addr, entry, tbe);
+ }else if (in_msg.Type == TriggerType:UnblockWriteThrough) {
+ trigger(Event:UnblockWriteThrough, in_msg.addr, entry, tbe);
+ } else {
+ error("Unknown trigger msg");
+ }
+ }
+ }
+ }
+
+ in_port(L3TriggerQueue_in, TriggerMsg, L3triggerQueue, rank=4) {
+ if (L3TriggerQueue_in.isReady(clockEdge())) {
+ peek(L3TriggerQueue_in, TriggerMsg) {
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr));
+ if (in_msg.Type == TriggerType:L3Hit) {
+ trigger(Event:L3Hit, in_msg.addr, entry, tbe);
+ } else {
+ error("Unknown trigger msg");
+ }
+ }
+ }
+ }
+
+ // Unblock Network
+ in_port(unblockNetwork_in, UnblockMsg, unblockFromCores, rank=3) {
+ if (unblockNetwork_in.isReady(clockEdge())) {
+ peek(unblockNetwork_in, UnblockMsg) {
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr));
+ trigger(Event:CoreUnblock, in_msg.addr, entry, tbe);
+ }
+ }
+ }
+
+ // Core response network
+ in_port(responseNetwork_in, ResponseMsg, responseFromCores, rank=2) {
+ if (responseNetwork_in.isReady(clockEdge())) {
+ peek(responseNetwork_in, ResponseMsg) {
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr));
+ if (in_msg.Type == CoherenceResponseType:CPUPrbResp) {
+ trigger(Event:CPUPrbResp, in_msg.addr, entry, tbe);
+ } else if (in_msg.Type == CoherenceResponseType:CPUData) {
+ trigger(Event:CPUData, in_msg.addr, entry, tbe);
+ } else if (in_msg.Type == CoherenceResponseType:StaleNotif) {
+ trigger(Event:StaleWB, in_msg.addr, entry, tbe);
+ } else {
+ error("Unexpected response type");
+ }
+ }
+ }
+ }
+
+ // off-chip memory request/response is done
+ in_port(memQueue_in, MemoryMsg, responseFromMemory, rank=1) {
+ if (memQueue_in.isReady(clockEdge())) {
+ peek(memQueue_in, MemoryMsg) {
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr));
+ if (in_msg.Type == MemoryRequestType:MEMORY_READ) {
+ trigger(Event:MemData, in_msg.addr, entry, tbe);
+ DPRINTF(RubySlicc, "%s\n", in_msg);
+ } else if (in_msg.Type == MemoryRequestType:MEMORY_WB) {
+ trigger(Event:WBAck, in_msg.addr, entry, tbe); // ignore WBAcks, don't care about them.
+ } else {
+ DPRINTF(RubySlicc, "%s\n", in_msg.Type);
+ error("Invalid message");
+ }
+ }
+ }
+ }
+
+ // Incoming core requests. Reads/writes/atomics trigger their matching
+ // events; victim writebacks (VicDirty/VicClean) from a requestor on the
+ // directory entry's VicDirtyIgnore list are stale (their data was already
+ // collected by a probe) and are sunk via StaleVicDirty.
+ in_port(requestNetwork_in, CPURequestMsg, requestFromCores, rank=0) {
+ if (requestNetwork_in.isReady(clockEdge())) {
+ peek(requestNetwork_in, CPURequestMsg) {
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr));
+ if (in_msg.Type == CoherenceRequestType:RdBlk) {
+ trigger(Event:RdBlk, in_msg.addr, entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:RdBlkS) {
+ trigger(Event:RdBlkS, in_msg.addr, entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:RdBlkM) {
+ trigger(Event:RdBlkM, in_msg.addr, entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:WriteThrough) {
+ trigger(Event:WriteThrough, in_msg.addr, entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:Atomic) {
+ trigger(Event:Atomic, in_msg.addr, entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:VicDirty) {
+ if (getDirectoryEntry(in_msg.addr).VicDirtyIgnore.isElement(in_msg.Requestor)) {
+ DPRINTF(RubySlicc, "Dropping VicDirty for address %s\n", in_msg.addr);
+ trigger(Event:StaleVicDirty, in_msg.addr, entry, tbe);
+ } else {
+ DPRINTF(RubySlicc, "Got VicDirty from %s on %s\n", in_msg.Requestor, in_msg.addr);
+ trigger(Event:VicDirty, in_msg.addr, entry, tbe);
+ }
+ } else if (in_msg.Type == CoherenceRequestType:VicClean) {
+ if (getDirectoryEntry(in_msg.addr).VicDirtyIgnore.isElement(in_msg.Requestor)) {
+ DPRINTF(RubySlicc, "Dropping VicClean for address %s\n", in_msg.addr);
+ trigger(Event:StaleVicDirty, in_msg.addr, entry, tbe);
+ } else {
+ DPRINTF(RubySlicc, "Got VicClean from %s on %s\n", in_msg.Requestor, in_msg.addr);
+ trigger(Event:VicClean, in_msg.addr, entry, tbe);
+ }
+ } else {
+ error("Bad request message type");
+ }
+ }
+ }
+ }
+
+ // Actions
+ action(s_sendResponseS, "s", desc="send Shared response") {
+ enqueue(responseNetwork_out, ResponseMsg, response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:NBSysResp;
+ if (tbe.L3Hit) {
+ out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0));
+ } else {
+ out_msg.Sender := machineID;
+ }
+ out_msg.Destination.add(tbe.OriginalRequestor);
+ out_msg.DataBlk := tbe.DataBlk;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ out_msg.Dirty := false;
+ out_msg.State := CoherenceState:Shared;
+ out_msg.InitialRequestTime := tbe.InitialRequestTime;
+ out_msg.ForwardRequestTime := tbe.ForwardRequestTime;
+ out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime;
+ out_msg.OriginalResponder := tbe.LastSender;
+ out_msg.L3Hit := tbe.L3Hit;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+
+ // Send an NBSysResp granting Exclusive when no other core holds a copy
+ // (tbe.Cached false), or Shared otherwise. Dirty status is forwarded
+ // from the TBE.
+ action(es_sendResponseES, "es", desc="send Exclusive or Shared response") {
+ enqueue(responseNetwork_out, ResponseMsg, response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:NBSysResp;
+ if (tbe.L3Hit) {
+ out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0));
+ } else {
+ out_msg.Sender := machineID;
+ }
+ out_msg.Destination.add(tbe.OriginalRequestor);
+ out_msg.DataBlk := tbe.DataBlk;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ out_msg.Dirty := tbe.Dirty;
+ if (tbe.Cached) {
+ out_msg.State := CoherenceState:Shared;
+ } else {
+ out_msg.State := CoherenceState:Exclusive;
+ }
+ out_msg.InitialRequestTime := tbe.InitialRequestTime;
+ out_msg.ForwardRequestTime := tbe.ForwardRequestTime;
+ out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime;
+ out_msg.OriginalResponder := tbe.LastSender;
+ out_msg.L3Hit := tbe.L3Hit;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+
+ // Send a Modified response. Write-throughs (tbe.wtData) need no data
+ // response at all -- they only get an UnblockWriteThrough trigger so the
+ // directory can unblock itself. Atomics get BOTH the data response (with
+ // the WT requestor filled in) and the unblock trigger.
+ action(m_sendResponseM, "m", desc="send Modified response") {
+ if (tbe.wtData) {
+ enqueue(triggerQueue_out, TriggerMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := TriggerType:UnblockWriteThrough;
+ }
+ }else{
+ enqueue(responseNetwork_out, ResponseMsg, response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:NBSysResp;
+ if (tbe.L3Hit) {
+ out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0));
+ } else {
+ out_msg.Sender := machineID;
+ }
+ out_msg.Destination.add(tbe.OriginalRequestor);
+ out_msg.DataBlk := tbe.DataBlk;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ out_msg.Dirty := tbe.Dirty;
+ out_msg.State := CoherenceState:Modified;
+ out_msg.CtoD := false;
+ out_msg.InitialRequestTime := tbe.InitialRequestTime;
+ out_msg.ForwardRequestTime := tbe.ForwardRequestTime;
+ out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime;
+ out_msg.OriginalResponder := tbe.LastSender;
+ if(tbe.atomicData){
+ out_msg.WTRequestor := tbe.WTRequestor;
+ }
+ out_msg.L3Hit := tbe.L3Hit;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ if (tbe.atomicData) {
+ enqueue(triggerQueue_out, TriggerMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := TriggerType:UnblockWriteThrough;
+ }
+ }
+ }
+ }
+
+ // Ack a Change-to-Dirty: control-only NBSysResp (no data block) granting
+ // Modified with CtoD set, timed from the current cycle.
+ action(c_sendResponseCtoD, "c", desc="send CtoD Ack") {
+ enqueue(responseNetwork_out, ResponseMsg, response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:NBSysResp;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(tbe.OriginalRequestor);
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ out_msg.Dirty := false;
+ out_msg.State := CoherenceState:Modified;
+ out_msg.CtoD := true;
+ out_msg.InitialRequestTime := tbe.InitialRequestTime;
+ out_msg.ForwardRequestTime := curCycle();
+ out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+
+ // Ack a writeback (or write-through) request straight off the request
+ // queue; the WT requestor is forwarded so the ack reaches the right core.
+ action(w_sendResponseWBAck, "w", desc="send WB Ack") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ enqueue(responseNetwork_out, ResponseMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:NBSysWBAck;
+ out_msg.Destination.add(in_msg.Requestor);
+ out_msg.WTRequestor := in_msg.WTRequestor;
+ out_msg.Sender := machineID;
+ out_msg.MessageSize := MessageSizeType:Writeback_Control;
+ out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+ out_msg.ForwardRequestTime := curCycle();
+ out_msg.ProbeRequestStartTime := curCycle();
+ }
+ }
+ }
+
+ // Queue the writeback data (peeked from the response network) to memory.
+ action(l_queueMemWBReq, "lq", desc="Write WB data to memory") {
+ peek(responseNetwork_in, ResponseMsg) {
+ queueMemoryWrite(machineID, address, to_memory_controller_latency,
+ in_msg.DataBlk);
+ }
+ }
+
+ // Fetch the block: on an L3 tag hit, fire an L3Hit trigger after the L3
+ // hit latency, copy the L3 data/last-sender into the TBE (unless the TBE
+ // already holds dirtier data), and deallocate the L3 entry -- the line
+ // is presumably re-installed later via al/alwt; otherwise issue a real
+ // memory read.
+ action(l_queueMemRdReq, "lr", desc="Read data from memory") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ if (L3CacheMemory.isTagPresent(address)) {
+ enqueue(L3TriggerQueue_out, TriggerMsg, l3_hit_latency) {
+ out_msg.addr := address;
+ out_msg.Type := TriggerType:L3Hit;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address));
+ if (tbe.Dirty == false) {
+ tbe.DataBlk := entry.DataBlk;
+ }
+ tbe.LastSender := entry.LastSender;
+ tbe.L3Hit := true;
+ tbe.MemData := true;
+ L3CacheMemory.deallocate(address);
+ } else {
+ queueMemoryRead(machineID, address, to_memory_controller_latency);
+ }
+ }
+ }
+
+ // Broadcast an invalidating probe (with data return) to all CorePairs
+ // plus the relevant TCC/TCCdir node, excluding the requestor. If zero
+ // destinations remain, complete immediately via an AcksComplete trigger.
+ action(dc_probeInvCoreData, "dc", desc="probe inv cores, return data") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := ProbeRequestType:PrbInv;
+ out_msg.ReturnData := true;
+ out_msg.MessageSize := MessageSizeType:Control;
+ out_msg.Destination.broadcast(MachineType:CorePair); // won't be realistic for multisocket
+
+ // add relevant TCC node to list. This replaces all TCPs and SQCs
+ // Intentionally empty first branch: conflict-free write-throughs/
+ // atomics (or a CPU-only config) skip the GPU-side destination.
+ if (((in_msg.Type == CoherenceRequestType:WriteThrough ||
+ in_msg.Type == CoherenceRequestType:Atomic) &&
+ in_msg.NoWriteConflict) ||
+ CPUonly) {
+ } else if (noTCCdir) {
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
+ TCC_select_low_bit, TCC_select_num_bits));
+ } else {
+ out_msg.Destination.add(mapAddressToRange(address,
+ MachineType:TCCdir,
+ TCC_select_low_bit, TCC_select_num_bits));
+ }
+ out_msg.Destination.remove(in_msg.Requestor);
+ tbe.NumPendingAcks := out_msg.Destination.count();
+ if (tbe.NumPendingAcks == 0) {
+ enqueue(triggerQueue_out, TriggerMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := TriggerType:AcksComplete;
+ }
+ }
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ APPEND_TRANSITION_COMMENT(" dc: Acks remaining: ");
+ APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
+ tbe.ProbeRequestStartTime := curCycle();
+ }
+ }
+ }
+
+ // Broadcast a downgrade probe (with data return) to all CorePairs, plus
+ // TCCdir (or TCC when there is no TCCdir) for GPU configs; the requestor
+ // is excluded and an immediate AcksComplete fires if nobody is probed.
+ action(sc_probeShrCoreData, "sc", desc="probe shared cores, return data") {
+ peek(requestNetwork_in, CPURequestMsg) { // not the right network?
+ enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := ProbeRequestType:PrbDowngrade;
+ out_msg.ReturnData := true;
+ out_msg.MessageSize := MessageSizeType:Control;
+ out_msg.Destination.broadcast(MachineType:CorePair); // won't be realistic for multisocket
+ // add relevant TCC node to the list. This replaces all TCPs and SQCs
+ if (noTCCdir || CPUonly) {
+ //Don't need to notify TCC about reads
+ } else {
+ out_msg.Destination.add(mapAddressToRange(address,
+ MachineType:TCCdir,
+ TCC_select_low_bit, TCC_select_num_bits));
+ // NOTE(review): this increment is dead -- it is unconditionally
+ // overwritten by Destination.count() a few lines below.
+ tbe.NumPendingAcks := tbe.NumPendingAcks + 1;
+ }
+ if (noTCCdir && !CPUonly) {
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
+ TCC_select_low_bit, TCC_select_num_bits));
+ }
+ out_msg.Destination.remove(in_msg.Requestor);
+ tbe.NumPendingAcks := out_msg.Destination.count();
+ if (tbe.NumPendingAcks == 0) {
+ enqueue(triggerQueue_out, TriggerMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := TriggerType:AcksComplete;
+ }
+ }
+ DPRINTF(RubySlicc, "%s\n", (out_msg));
+ APPEND_TRANSITION_COMMENT(" sc: Acks remaining: ");
+ APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
+ tbe.ProbeRequestStartTime := curCycle();
+ }
+ }
+ }
+
+ // Broadcast an invalidating probe WITHOUT data return (the requestor
+ // already holds the data, e.g. CtoD) to CorePairs plus TCC/TCCdir.
+ action(ic_probeInvCore, "ic", desc="probe invalidate core, no return data needed") {
+ peek(requestNetwork_in, CPURequestMsg) { // not the right network?
+ enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := ProbeRequestType:PrbInv;
+ out_msg.ReturnData := false;
+ out_msg.MessageSize := MessageSizeType:Control;
+ out_msg.Destination.broadcast(MachineType:CorePair); // won't be realistic for multisocket
+
+ // add relevant TCC node to the list. This replaces all TCPs and SQCs
+ if (noTCCdir && !CPUonly) {
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
+ TCC_select_low_bit, TCC_select_num_bits));
+ } else {
+ if (!noTCCdir) {
+ out_msg.Destination.add(mapAddressToRange(address,
+ MachineType:TCCdir,
+ TCC_select_low_bit,
+ TCC_select_num_bits));
+ }
+ }
+ out_msg.Destination.remove(in_msg.Requestor);
+ tbe.NumPendingAcks := out_msg.Destination.count();
+ if (tbe.NumPendingAcks == 0) {
+ enqueue(triggerQueue_out, TriggerMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := TriggerType:AcksComplete;
+ }
+ }
+ APPEND_TRANSITION_COMMENT(" ic: Acks remaining: ");
+ APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ tbe.ProbeRequestStartTime := curCycle();
+ }
+ }
+ }
+
+ // Commit writeback data from the response network into the directory
+ // entry, mirroring it into the TBE while the TBE is still clean.
+ action(d_writeDataToMemory, "d", desc="Write data to memory") {
+ peek(responseNetwork_in, ResponseMsg) {
+ getDirectoryEntry(address).DataBlk := in_msg.DataBlk;
+ if (tbe.Dirty == false) {
+ // have to update the TBE, too, because of how this
+ // directory deals with functional writes
+ tbe.DataBlk := in_msg.DataBlk;
+ }
+ }
+ }
+
+ // Allocate and initialize the TBE for the request at the head of the
+ // request queue. Write-throughs and atomics record their write mask and
+ // WT requestor; a write-through additionally merges its partial data
+ // into the TBE copy and marks it dirty.
+ action(t_allocateTBE, "t", desc="allocate TBE Entry") {
+ check_allocate(TBEs);
+ peek(requestNetwork_in, CPURequestMsg) {
+ TBEs.allocate(address);
+ set_tbe(TBEs.lookup(address));
+ if (in_msg.Type == CoherenceRequestType:WriteThrough) {
+ tbe.writeMask.clear();
+ tbe.writeMask.orMask(in_msg.writeMask);
+ tbe.wtData := true;
+ tbe.WTRequestor := in_msg.WTRequestor;
+ tbe.LastSender := in_msg.Requestor;
+ }
+ if (in_msg.Type == CoherenceRequestType:Atomic) {
+ tbe.writeMask.clear();
+ tbe.writeMask.orMask(in_msg.writeMask);
+ tbe.atomicData := true;
+ tbe.WTRequestor := in_msg.WTRequestor;
+ tbe.LastSender := in_msg.Requestor;
+ }
+ tbe.DataBlk := getDirectoryEntry(address).DataBlk; // Data only for WBs
+ tbe.Dirty := false;
+ if (in_msg.Type == CoherenceRequestType:WriteThrough) {
+ tbe.DataBlk.copyPartial(in_msg.DataBlk,in_msg.writeMask);
+ tbe.Dirty := true;
+ }
+ tbe.OriginalRequestor := in_msg.Requestor;
+ tbe.NumPendingAcks := 0;
+ tbe.Cached := in_msg.ForceShared;
+ tbe.InitialRequestTime := in_msg.InitialRequestTime;
+ }
+ }
+
+ // Retire the TBE; a clean TBE writes its data back to the directory
+ // entry first (keeps functional accesses consistent).
+ action(dt_deallocateTBE, "dt", desc="deallocate TBE Entry") {
+ if (tbe.Dirty == false) {
+ getDirectoryEntry(address).DataBlk := tbe.DataBlk;
+ }
+ TBEs.deallocate(address);
+ unset_tbe();
+ }
+
+ // Fold the TBE data back into the directory entry: masked merge for
+ // write-throughs, read-modify-write for atomics, plain copy when clean.
+ action(wd_writeBackData, "wd", desc="Write back data if needed") {
+ if (tbe.wtData) {
+ getDirectoryEntry(address).DataBlk.copyPartial(tbe.DataBlk, tbe.writeMask);
+ } else if (tbe.atomicData) {
+ tbe.DataBlk.atomicPartial(getDirectoryEntry(address).DataBlk,tbe.writeMask);
+ getDirectoryEntry(address).DataBlk := tbe.DataBlk;
+ } else if (tbe.Dirty == false) {
+ getDirectoryEntry(address).DataBlk := tbe.DataBlk;
+ }
+ }
+
+ // Record arrival of memory data; the block is taken from the directory
+ // entry unless a probe/WT has already made the TBE copy authoritative.
+ action(mt_writeMemDataToTBE, "mt", desc="write Mem data to TBE") {
+ peek(memQueue_in, MemoryMsg) {
+ if (tbe.wtData == true) {
+ // do nothing
+ } else if (tbe.Dirty == false) {
+ tbe.DataBlk := getDirectoryEntry(address).DataBlk;
+ }
+ tbe.MemData := true;
+ }
+ }
+
+ // Merge a dirty probe response into the TBE. A pending write-through
+ // overlays its masked bytes on top of the probe data; duplicate dirty
+ // responses are sanity-checked for identical data.
+ action(y_writeProbeDataToTBE, "y", desc="write Probe Data to TBE") {
+ peek(responseNetwork_in, ResponseMsg) {
+ if (in_msg.Dirty) {
+ if (tbe.wtData) {
+ DataBlock tmp := in_msg.DataBlk;
+ tmp.copyPartial(tbe.DataBlk,tbe.writeMask);
+ tbe.DataBlk := tmp;
+ tbe.writeMask.fillMask();
+ } else if (tbe.Dirty) {
+ if(tbe.atomicData == false && tbe.wtData == false) {
+ DPRINTF(RubySlicc, "Got double data for %s from %s\n", address, in_msg.Sender);
+ assert(tbe.DataBlk == in_msg.DataBlk); // in case of double data
+ }
+ } else {
+ tbe.DataBlk := in_msg.DataBlk;
+ tbe.Dirty := in_msg.Dirty;
+ tbe.LastSender := in_msg.Sender;
+ }
+ }
+ if (in_msg.Hit) {
+ tbe.Cached := true;
+ }
+ }
+ }
+
+ // A probe hit a line awaiting writeback: remember the sender so its
+ // upcoming VicDirty can be sunk (see StaleVicDirty handling).
+ action(mwc_markSinkWriteCancel, "mwc", desc="Mark to sink impending VicDirty") {
+ peek(responseNetwork_in, ResponseMsg) {
+ getDirectoryEntry(address).VicDirtyIgnore.add(in_msg.Sender);
+ APPEND_TRANSITION_COMMENT(" setting bit to sink VicDirty ");
+ }
+ }
+
+ // One probe response consumed.
+ action(x_decrementAcks, "x", desc="decrement Acks pending") {
+ tbe.NumPendingAcks := tbe.NumPendingAcks - 1;
+ APPEND_TRANSITION_COMMENT(" Acks remaining: ");
+ APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
+ }
+
+ // Fire AcksComplete once the last expected probe response has arrived.
+ action(o_checkForCompletion, "o", desc="check for ack completion") {
+ if (tbe.NumPendingAcks == 0) {
+ enqueue(triggerQueue_out, TriggerMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := TriggerType:AcksComplete;
+ }
+ }
+ APPEND_TRANSITION_COMMENT(" Check: Acks remaining: ");
+ APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
+ }
+
+ // The stale victim arrived and was sunk; stop ignoring this core.
+ action(rv_removeVicDirtyIgnore, "rv", desc="Remove ignored core") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ getDirectoryEntry(address).VicDirtyIgnore.remove(in_msg.Requestor);
+ }
+ }
+
+ // Install writeback data into the L3. On a conflict miss the victim
+ // line is written to memory before being deallocated.
+ action(al_allocateL3Block, "al", desc="allocate the L3 block on WB") {
+ peek(responseNetwork_in, ResponseMsg) {
+ if (L3CacheMemory.isTagPresent(address)) {
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address));
+ APPEND_TRANSITION_COMMENT(" al wrote data to L3 (hit) ");
+ entry.DataBlk := in_msg.DataBlk;
+ entry.LastSender := in_msg.Sender;
+ } else {
+ if (L3CacheMemory.cacheAvail(address) == false) {
+ Addr victim := L3CacheMemory.cacheProbe(address);
+ CacheEntry victim_entry := static_cast(CacheEntry, "pointer",
+ L3CacheMemory.lookup(victim));
+ queueMemoryWrite(machineID, victim, to_memory_controller_latency,
+ victim_entry.DataBlk);
+ L3CacheMemory.deallocate(victim);
+ }
+ assert(L3CacheMemory.cacheAvail(address));
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.allocate(address, new CacheEntry));
+ APPEND_TRANSITION_COMMENT(" al wrote data to L3 ");
+ entry.DataBlk := in_msg.DataBlk;
+
+ entry.LastSender := in_msg.Sender;
+ }
+ }
+ }
+
+ // Same as al_allocateL3Block but for write-through/atomic completions,
+ // sourcing data from the TBE; gated by the useL3OnWT config knob.
+ action(alwt_allocateL3BlockOnWT, "alwt", desc="allocate the L3 block on WT") {
+ if ((tbe.wtData || tbe.atomicData) && useL3OnWT) {
+ if (L3CacheMemory.isTagPresent(address)) {
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address));
+ APPEND_TRANSITION_COMMENT(" al wrote data to L3 (hit) ");
+ entry.DataBlk := tbe.DataBlk;
+ entry.LastSender := tbe.LastSender;
+ } else {
+ if (L3CacheMemory.cacheAvail(address) == false) {
+ Addr victim := L3CacheMemory.cacheProbe(address);
+ CacheEntry victim_entry := static_cast(CacheEntry, "pointer",
+ L3CacheMemory.lookup(victim));
+ queueMemoryWrite(machineID, victim, to_memory_controller_latency,
+ victim_entry.DataBlk);
+ L3CacheMemory.deallocate(victim);
+ }
+ assert(L3CacheMemory.cacheAvail(address));
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.allocate(address, new CacheEntry));
+ APPEND_TRANSITION_COMMENT(" al wrote data to L3 ");
+ entry.DataBlk := tbe.DataBlk;
+ entry.LastSender := tbe.LastSender;
+ }
+ }
+ }
+
+ // Timestamp when the request is forwarded (profiling).
+ action(sf_setForwardReqTime, "sf", desc="...") {
+ tbe.ForwardRequestTime := curCycle();
+ }
+
+ action(dl_deallocateL3, "dl", desc="deallocate the L3 block") {
+ L3CacheMemory.deallocate(address);
+ }
+
+ // --- queue pop / recycle / stall helpers ---
+ action(p_popRequestQueue, "p", desc="pop request queue") {
+ requestNetwork_in.dequeue(clockEdge());
+ }
+
+ action(pr_popResponseQueue, "pr", desc="pop response queue") {
+ responseNetwork_in.dequeue(clockEdge());
+ }
+
+ action(pm_popMemQueue, "pm", desc="pop mem queue") {
+ memQueue_in.dequeue(clockEdge());
+ }
+
+ action(pt_popTriggerQueue, "pt", desc="pop trigger queue") {
+ triggerQueue_in.dequeue(clockEdge());
+ }
+
+ action(ptl_popTriggerQueue, "ptl", desc="pop L3 trigger queue") {
+ L3TriggerQueue_in.dequeue(clockEdge());
+ }
+
+ action(pu_popUnblockQueue, "pu", desc="pop unblock queue") {
+ unblockNetwork_in.dequeue(clockEdge());
+ }
+
+ action(zz_recycleRequestQueue, "zz", desc="recycle request queue") {
+ requestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+ }
+
+ action(yy_recycleResponseQueue, "yy", desc="recycle response queue") {
+ responseNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+ }
+
+ // Park the head request until the address is woken up.
+ action(st_stallAndWaitRequest, "st", desc="Stall and wait on the address") {
+ stall_and_wait(requestNetwork_in, address);
+ }
+
+ action(wa_wakeUpDependents, "wa", desc="Wake up any requests waiting for this address") {
+ wakeUpBuffers(address);
+ }
+
+ action(wa_wakeUpAllDependents, "waa", desc="Wake up any requests waiting for this region") {
+ wakeUpAllBuffers();
+ }
+
+ // Explicit no-op used where a transition must consume nothing.
+ action(z_stall, "z", desc="...") {
+ }
+
+ // TRANSITIONS
+ // Requests arriving while a transaction is already in flight for the
+ // address are parked and replayed when the line unblocks.
+ transition({BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B}, {RdBlkS, RdBlkM, RdBlk, CtoD}) {
+ st_stallAndWaitRequest;
+ }
+
+ // It may be possible to save multiple invalidations here!
+ transition({BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B}, {Atomic, WriteThrough}) {
+ st_stallAndWaitRequest;
+ }
+
+
+ // transitions from U
+ // Idle state: allocate a TBE, start the memory fetch, and probe the
+ // cores -- downgrade probes for reads, invalidating probes for writes,
+ // atomics and RdBlkM.
+ transition(U, {RdBlkS}, BS_PM) {L3TagArrayRead} {
+ t_allocateTBE;
+ l_queueMemRdReq;
+ sc_probeShrCoreData;
+ p_popRequestQueue;
+ }
+
+ transition(U, WriteThrough, BM_PM) {L3TagArrayRead, L3TagArrayWrite} {
+ t_allocateTBE;
+ w_sendResponseWBAck;
+ l_queueMemRdReq;
+ dc_probeInvCoreData;
+ p_popRequestQueue;
+ }
+
+ transition(U, Atomic, BM_PM) {L3TagArrayRead, L3TagArrayWrite} {
+ t_allocateTBE;
+ l_queueMemRdReq;
+ dc_probeInvCoreData;
+ p_popRequestQueue;
+ }
+
+ transition(U, {RdBlkM}, BM_PM) {L3TagArrayRead} {
+ t_allocateTBE;
+ l_queueMemRdReq;
+ dc_probeInvCoreData;
+ p_popRequestQueue;
+ }
+
+ transition(U, RdBlk, B_PM) {L3TagArrayRead}{
+ t_allocateTBE;
+ l_queueMemRdReq;
+ sc_probeShrCoreData;
+ p_popRequestQueue;
+ }
+
+ // CtoD needs no memory data -- probe-invalidate only.
+ transition(U, CtoD, BP) {L3TagArrayRead} {
+ t_allocateTBE;
+ ic_probeInvCore;
+ p_popRequestQueue;
+ }
+
+ transition(U, VicDirty, BL) {L3TagArrayRead} {
+ t_allocateTBE;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+ transition(U, VicClean, BL) {L3TagArrayRead} {
+ t_allocateTBE;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+ // A second victim for a line already being written back is retried.
+ transition(BL, {VicDirty, VicClean}) {
+ zz_recycleRequestQueue;
+ }
+
+ // Writeback data arrives: commit to memory, install in L3, wake waiters.
+ transition(BL, CPUData, U) {L3TagArrayWrite, L3DataArrayWrite} {
+ d_writeDataToMemory;
+ al_allocateL3Block;
+ wa_wakeUpDependents;
+ dt_deallocateTBE;
+ pr_popResponseQueue;
+ }
+
+ // The writeback was invalidated by a probe in flight -- no data to keep.
+ transition(BL, StaleWB, U) {L3TagArrayWrite} {
+ dt_deallocateTBE;
+ wa_wakeUpAllDependents;
+ pr_popResponseQueue;
+ }
+
+ transition({B, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm}, {VicDirty, VicClean}) {
+ z_stall;
+ }
+
+ // WBAcks from memory are consumed in every state.
+ transition({U, BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B}, WBAck) {
+ pm_popMemQueue;
+ }
+
+ // Sink a stale victim whose data was already collected via probe.
+ transition({U, BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B}, StaleVicDirty) {
+ rv_removeVicDirtyIgnore;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+ transition({B}, CoreUnblock, U) {
+ wa_wakeUpDependents;
+ pu_popUnblockQueue;
+ }
+
+ transition(B, UnblockWriteThrough, U) {
+ wa_wakeUpDependents;
+ pt_popTriggerQueue;
+ }
+
+ // *_PM -> *_Pm: memory data (or L3 hit) arrived but probes are still
+ // outstanding; capture the data and keep waiting for acks.
+ transition(BS_PM, MemData, BS_Pm) {} {
+ mt_writeMemDataToTBE;
+ pm_popMemQueue;
+ }
+
+ transition(BM_PM, MemData, BM_Pm){} {
+ mt_writeMemDataToTBE;
+ pm_popMemQueue;
+ }
+
+ transition(B_PM, MemData, B_Pm){} {
+ mt_writeMemDataToTBE;
+ pm_popMemQueue;
+ }
+
+ transition(BS_PM, L3Hit, BS_Pm) {} {
+ ptl_popTriggerQueue;
+ }
+
+ transition(BM_PM, L3Hit, BM_Pm) {} {
+ ptl_popTriggerQueue;
+ }
+
+ transition(B_PM, L3Hit, B_Pm) {} {
+ ptl_popTriggerQueue;
+ }
+
+ // *_M -> B: probes already complete, data was the last thing pending --
+ // respond (S / M / E-or-S), write back, optionally fill L3, retire TBE.
+ transition(BS_M, MemData, B){L3TagArrayWrite, L3DataArrayWrite} {
+ mt_writeMemDataToTBE;
+ s_sendResponseS;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ dt_deallocateTBE;
+ pm_popMemQueue;
+ }
+
+ transition(BM_M, MemData, B){L3TagArrayWrite, L3DataArrayWrite} {
+ mt_writeMemDataToTBE;
+ m_sendResponseM;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ dt_deallocateTBE;
+ pm_popMemQueue;
+ }
+
+ transition(B_M, MemData, B){L3TagArrayWrite, L3DataArrayWrite} {
+ mt_writeMemDataToTBE;
+ es_sendResponseES;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ dt_deallocateTBE;
+ pm_popMemQueue;
+ }
+
+ transition(BS_M, L3Hit, B) {L3TagArrayWrite, L3DataArrayWrite} {
+ s_sendResponseS;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ dt_deallocateTBE;
+ ptl_popTriggerQueue;
+ }
+
+ transition(BM_M, L3Hit, B) {L3DataArrayWrite, L3TagArrayWrite} {
+ m_sendResponseM;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ dt_deallocateTBE;
+ ptl_popTriggerQueue;
+ }
+
+ transition(B_M, L3Hit, B) {L3DataArrayWrite, L3TagArrayWrite} {
+ es_sendResponseES;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ dt_deallocateTBE;
+ ptl_popTriggerQueue;
+ }
+
+ // Each probe response is merged into the TBE and counted down; the last
+ // one fires ProbeAcksComplete via o_checkForCompletion.
+ transition({BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, BP}, CPUPrbResp) {
+ y_writeProbeDataToTBE;
+ x_decrementAcks;
+ o_checkForCompletion;
+ pr_popResponseQueue;
+ }
+
+ // Probes done before memory data: move to the *_M wait-for-data states.
+ transition(BS_PM, ProbeAcksComplete, BS_M) {} {
+ sf_setForwardReqTime;
+ pt_popTriggerQueue;
+ }
+
+ transition(BM_PM, ProbeAcksComplete, BM_M) {} {
+ sf_setForwardReqTime;
+ pt_popTriggerQueue;
+ }
+
+ transition(B_PM, ProbeAcksComplete, B_M){} {
+ sf_setForwardReqTime;
+ pt_popTriggerQueue;
+ }
+
+ // Probes done after data: respond immediately and retire the TBE.
+ transition(BS_Pm, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} {
+ sf_setForwardReqTime;
+ s_sendResponseS;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ dt_deallocateTBE;
+ pt_popTriggerQueue;
+ }
+
+ transition(BM_Pm, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} {
+ sf_setForwardReqTime;
+ m_sendResponseM;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ dt_deallocateTBE;
+ pt_popTriggerQueue;
+ }
+
+ transition(B_Pm, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} {
+ sf_setForwardReqTime;
+ es_sendResponseES;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ dt_deallocateTBE;
+ pt_popTriggerQueue;
+ }
+
+ // CtoD completion: probes done, send the ownership ack (control-only,
+ // no data) to the requestor, fold any probe data back, optionally fill
+ // the L3, and retire the TBE.
+ // Fix: the resource set listed L3TagArrayWrite twice; the duplicate is
+ // removed. NOTE(review): sibling ProbeAcksComplete transitions use
+ // {L3DataArrayWrite, L3TagArrayWrite} -- confirm whether the duplicate
+ // was meant to be L3DataArrayWrite, since alwt may write L3 data here.
+ transition(BP, ProbeAcksComplete, B){L3TagArrayWrite} {
+ sf_setForwardReqTime;
+ c_sendResponseCtoD;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ dt_deallocateTBE;
+ pt_popTriggerQueue;
+ }
+}
diff --git a/src/mem/protocol/MOESI_AMD_Base-msg.sm b/src/mem/protocol/MOESI_AMD_Base-msg.sm
new file mode 100644
index 000000000..ff8842369
--- /dev/null
+++ b/src/mem/protocol/MOESI_AMD_Base-msg.sm
@@ -0,0 +1,362 @@
+/*
+ * Copyright (c) 2010-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Lisa Hsu
+ */
+
+
+// Request types exchanged between cores, directories, region directories
+// and region buffers. desc strings are documentation only (surfaced in
+// generated debug output); typos fixed: "invalide" -> "invalidate",
+// "occured" -> "occurred", and UpgradeRequest's copy-pasted description.
+enumeration(CoherenceRequestType, desc="Coherence Request Types") {
+ // CPU Request Types ONLY
+ RdBlk, desc="Read Blk";
+ RdBlkM, desc="Read Blk Modified";
+ RdBlkS, desc="Read Blk Shared";
+ CtoD, desc="Change To Dirty";
+ VicClean, desc="L2 clean eviction";
+ VicDirty, desc="L2 dirty eviction";
+ Atomic, desc="Upper level atomic";
+ AtomicWriteBack, desc="Upper level atomic";
+ WriteThrough, desc="Ordered WriteThrough w/Data";
+ WriteThroughFifo, desc="WriteThrough with no data";
+ WriteThroughDummy, desc="WriteThrough with no data for atomic operation";
+ WriteFlush, desc="Release Flush";
+
+ WrCancel, desc="want to cancel WB to Memory"; // should this be here?
+
+ WBApproval, desc="WB Approval";
+
+ // Messages between Dir and R-Dir
+ ForceInv, desc="Send invalidate to the block";
+ ForceDowngrade, desc="Send downgrade to the block";
+ Unblock, desc="Used to let the dir know a message has been sunk";
+
+ // Messages between R-Dir and R-Buffer
+ PrivateNotify, desc="Let region buffer know it has private access";
+ SharedNotify, desc="Let region buffer know it has shared access";
+ WbNotify, desc="Let region buffer know it saw its wb request";
+ Downgrade, desc="Force the region buffer to downgrade to shared";
+ // Response to R-Dir (probably should be on a different network, but
+ // I need it to be ordered with respect to requests)
+ InvAck, desc="Let the R-Dir know when the inv has occurred";
+
+ PrivateRequest, desc="R-buf wants the region in private";
+ UpgradeRequest, desc="R-buf wants to upgrade the region from shared to private";
+ SharedRequest, desc="R-buf wants the region in shared (could respond with private)";
+ CleanWbRequest, desc="R-buf wants to deallocate clean region";
+
+ NA, desc="So we don't get segfaults";
+}
+
+// Probe types the directory can issue to caches (and, for the region
+// protocol variants, to region buffers).
+enumeration(ProbeRequestType, desc="Probe Request Types") {
+ PrbDowngrade, desc="Probe for Status"; // EtoS, MtoO, StoS
+ PrbInv, desc="Probe to Invalidate";
+
+ // For regions
+ PrbRepl, desc="Force the cache to do a replacement";
+ PrbRegDowngrade, desc="Probe for Status"; // EtoS, MtoO, StoS
+ PrbAtomic, desc="Forwarded Atomic Operation";
+}
+
+
+// Response types: NBSys* come from the northbridge/directory, TDSys* from
+// the TCC directory, CPU* from the cores, plus region-protocol acks.
+enumeration(CoherenceResponseType, desc="Coherence Response Types") {
+ NBSysResp, desc="Northbridge response to CPU Rd request";
+ NBSysWBAck, desc="Northbridge response ok to WB";
+ TDSysResp, desc="TCCdirectory response to CPU Rd request";
+ TDSysWBAck, desc="TCCdirectory response ok to WB";
+ TDSysWBNack, desc="TCCdirectory response ok to drop";
+ CPUPrbResp, desc="CPU Probe Response";
+ CPUData, desc="CPU Data";
+ StaleNotif, desc="Notification of Stale WBAck, No data to writeback";
+ CPUCancelWB, desc="want to cancel WB to Memory";
+ MemData, desc="Data from Memory";
+
+ // for regions
+ PrivateAck, desc="Ack that r-buf received private notify";
+ RegionWbAck, desc="Writeback Ack that r-buf completed deallocation";
+ DirReadyAck, desc="Directory (mem ctrl)<->region dir handshake";
+}
+
+// Coherence state carried in a response (what state the requestor should
+// install the line in).
+enumeration(CoherenceState, default="CoherenceState_NA", desc="Coherence State") {
+ Modified, desc="Modified";
+ Owned, desc="Owned state";
+ Exclusive, desc="Exclusive";
+ Shared, desc="Shared";
+ NA, desc="NA";
+}
+
+// Request message sent by cores (CPU and GPU) toward the directory.
+// Fields past the first group support the region-protocol variants and
+// GPU write-through/atomic plumbing.
+structure(CPURequestMsg, desc="...", interface="Message") {
+ Addr addr, desc="Physical address for this request";
+ Addr DemandAddress, desc="Physical block address for this request";
+ CoherenceRequestType Type, desc="Type of request";
+ DataBlock DataBlk, desc="data for the cache line"; // only for WB
+ bool Dirty, desc="whether WB data is dirty"; // only for WB
+ MachineID Requestor, desc="Node who initiated the request";
+ NetDest Destination, desc="Multicast destination mask";
+ bool Shared, desc="For CPU_WrVicBlk, vic is O not M. For CPU_ClVicBlk, vic is S";
+ MessageSizeType MessageSize, desc="size category of the message";
+ Cycles InitialRequestTime, desc="time the initial requests was sent from the L1Cache";
+ Cycles ForwardRequestTime, desc="time the dir forwarded the request";
+ Cycles ProbeRequestStartTime, desc="the time the dir started the probe request";
+ bool DemandRequest, default="false", desc="For profiling purposes";
+
+ NetDest Sharers, desc="Caches that may have a valid copy of the data";
+ bool ForceShared, desc="R-dir knows it is shared, pass on so it sends an S copy, not E";
+ bool Private, default="false", desc="Requestor already has private permissions, no need for dir check";
+ bool CtoDSinked, default="false", desc="This is true if the CtoD previously sent must have been sunk";
+
+ bool NoAckNeeded, default="false", desc="True if region buffer doesn't need to ack";
+ int Acks, default="0", desc="Acks that the dir (mem ctrl) should expect to receive";
+ CoherenceRequestType OriginalType, default="CoherenceRequestType_NA", desc="Type of request from core fwded through region buffer";
+ WriteMask writeMask, desc="Write Through Data";
+ MachineID WTRequestor, desc="Node who initiated the write through";
+ HSAScope scope, default="HSAScope_SYSTEM", desc="Request Scope";
+ int wfid, default="0", desc="wavefront id";
+ bool NoWriteConflict, default="true", desc="write collided with CAB entry";
+ int ProgramCounter, desc="PC that accesses to this block";
+
+ bool functionalRead(Packet *pkt) {
+ // Only VicDirty (dirty victim writeback) messages carry a valid
+ // data block, so only they can satisfy a functional read.
+ if (Type == CoherenceRequestType:VicDirty) {
+ return testAndRead(addr, DataBlk, pkt);
+ }
+
+ return false;
+ }
+
+ bool functionalWrite(Packet *pkt) {
+ // No check on message type required since the protocol should
+ // read data from those messages that contain the block
+ return testAndWrite(addr, DataBlk, pkt);
+ }
+}
+
+// Probe request issued by the northbridge/directory. Probes never carry
+// a data block, so both functional accessors return false.
+structure(NBProbeRequestMsg, desc="...", interface="Message") {
+ Addr addr, desc="Physical address for this request";
+ ProbeRequestType Type, desc="NB_PrbNxtState signal";
+ bool ReturnData, desc="Indicates CPU should return data";
+ NetDest Destination, desc="Node to whom the data is sent";
+ MessageSizeType MessageSize, desc="size category of the message";
+ bool DemandRequest, default="false", desc="demand request, requesting 3-hop transfer";
+ Addr DemandAddress, desc="Demand block address for a region request";
+ MachineID Requestor, desc="Requestor id for 3-hop requests";
+ bool NoAckNeeded, default="false", desc="For short circuting acks";
+ int ProgramCounter, desc="PC that accesses to this block";
+
+ bool functionalRead(Packet *pkt) {
+ return false;
+ }
+
+ bool functionalWrite(Packet *pkt) {
+ // No check on message type required since the protocol should
+ // read data from those messages that contain the block
+ return false;
+ }
+
+}
+
+// Probe request issued by the TCC directory (GPU side); also carries no
+// data block.
+structure(TDProbeRequestMsg, desc="...", interface="Message") {
+ Addr addr, desc="Physical address for this request";
+ ProbeRequestType Type, desc="TD_PrbNxtState signal";
+ bool ReturnData, desc="Indicates CPU should return data";
+ bool localCtoD, desc="Indicates CtoD is within the GPU hierarchy (aka TCC subtree)";
+ NetDest Destination, desc="Node to whom the data is sent";
+ MessageSizeType MessageSize, desc="size category of the message";
+ int Phase, desc="Synchronization Phase";
+ int wfid, desc="wavefront id for Release";
+ MachineID Requestor, desc="Node who initiated the request";
+
+ bool functionalRead(Packet *pkt) {
+ return false;
+ }
+
+ bool functionalWrite(Packet *pkt) {
+ // No check on message type required since the protocol should
+ // read data from those messages that contain the block
+ return false;
+ }
+}
+
+// All coherence responses (CPU probe responses, NB/memory responses,
+// write-through acks) are munged into this one message type; the fields
+// below are grouped by which kind of response actually uses them.
+structure(ResponseMsg, desc="...", interface="Message") {
+  Addr addr, desc="Physical address for this request";
+  CoherenceResponseType Type, desc="NB Sys Resp or CPU Response to Probe";
+  MachineID Sender, desc="Node who sent the data";
+  NetDest Destination, desc="Node to whom the data is sent";
+  // Begin Used Only By CPU Response
+  DataBlock DataBlk, desc="data for the cache line";
+  bool Hit, desc="probe hit valid line";
+  bool Shared, desc="True if S, or if NB Probe ReturnData==1 && O";
+  bool Dirty, desc="Is the data dirty (different than memory)?";
+  bool Ntsl, desc="indicates probed lin will be invalid after probe";
+  bool UntransferredOwner, desc="pending confirmation of ownership change";
+  // End Used Only By CPU Response
+
+  // Begin NB Response Only
+  CoherenceState State, default=CoherenceState_NA, desc="What returned data from NB should be in";
+  bool CtoD, desc="was the originator a CtoD?";
+  // End NB Response Only
+
+  // Normally if a block gets hit by a probe while waiting to be written back,
+  // you flip the NbReqShared signal (part of the CPURequest signal group).
+  // But since this is in packets and I don't want to send a separate packet,
+  // let's just send this signal back with the data instead
+  bool NbReqShared, desc="modification of Shared field from initial request, e.g. hit by shared probe";
+
+  MessageSizeType MessageSize, desc="size category of the message";
+  Cycles InitialRequestTime, desc="time the initial requests was sent from the L1Cache";
+  Cycles ForwardRequestTime, desc="time the dir forwarded the request";
+  Cycles ProbeRequestStartTime, desc="the time the dir started the probe request";
+  bool DemandRequest, default="false", desc="For profiling purposes";
+
+  bool L3Hit, default="false", desc="Did memory or L3 supply the data?";
+  MachineID OriginalResponder, desc="Mach which wrote the data to the L3";
+  MachineID WTRequestor, desc="Node who started the writethrough";
+
+  bool NotCached, default="false", desc="True when the Region buffer has already evicted the line";
+
+  bool NoAckNeeded, default="false", desc="For short circuting acks";
+  bool isValid, default="false", desc="Is acked block valid";
+  int wfid, default="0", desc="wavefront id";
+  int Phase, desc="Synchronization Phase";
+
+  int ProgramCounter, desc="PC that issues this request";
+  bool mispred, desc="tell TCP if the block should not be bypassed";
+
+
+  bool functionalRead(Packet *pkt) {
+    // Only CPUData and MemData responses carry a valid data block, so
+    // only those may satisfy a functional read.
+    if (Type == CoherenceResponseType:CPUData ||
+        Type == CoherenceResponseType:MemData) {
+      return testAndRead(addr, DataBlk, pkt);
+    }
+
+    return false;
+  }
+
+  bool functionalWrite(Packet *pkt) {
+    // No check on message type required since the protocol should
+    // read data from those messages that contain the block
+    return testAndWrite(addr, DataBlk, pkt);
+  }
+}
+
+// Unblock / done-ack message sent from a core back to the directory
+// once a response has been received (or an eviction completed).
+structure(UnblockMsg, desc="...", interface="Message") {
+  Addr addr, desc="Physical address for this request";
+  NetDest Destination, desc="Destination (always directory)";
+  MessageSizeType MessageSize, desc="size category of the message";
+  MachineID Sender, desc="Node who sent the data";
+  bool currentOwner, default="false", desc="Is the sender the current owner";
+  bool DoneAck, default="false", desc="Is this a done ack?";
+  bool Dirty, default="false", desc="Was block dirty when evicted";
+  bool wasValid, default="false", desc="Was block valid when evicted";
+  bool valid, default="false", desc="Is block valid";
+  bool validToInvalid, default="false", desc="Was block valid when evicted";
+
+  // Control-only message: no DataBlock, so functional accesses never
+  // match it.
+  bool functionalRead(Packet *pkt) {
+    return false;
+  }
+
+  bool functionalWrite(Packet *pkt) {
+    // No check on message type required since the protocol should
+    // read data from those messages that contain the block
+    return false;
+  }
+}
+
+// Internal self-trigger events posted on a controller's own trigger
+// queue (not sent over the network).
+enumeration(TriggerType, desc="Trigger Type") {
+  L2_to_L1, desc="L2 to L1 fill";
+  AcksComplete, desc="NB received all needed Acks";
+
+  // For regions
+  InvNext, desc="Invalidate the next block";
+  PrivateAck, desc="Loopback ack for machines with no Region Buffer";
+  AllOutstanding, desc="All outstanding requests have finished";
+  L3Hit, desc="L3 hit in dir";
+
+  // For region directory once the directory is blocked
+  InvRegion, desc="Invalidate region";
+  DowngradeRegion, desc="downgrade region";
+  //For writethrough
+  UnblockWriteThrough, desc="unblock";
+  WriteData, desc="Write to full cacheblock data";
+  WriteDone, desc="Sequencer says that write is done";
+  AtomicDone, desc="Atomic is done";
+}
+
+// Identifies which L1 cache within a core pair a trigger targets.
+enumeration(CacheId, desc="Which Cache in the Core") {
+  L1I, desc="L1 I-cache";
+  L1D0, desc="L1 D-cache cluster 0";
+  L1D1, desc="L1 D-cache cluster 1";
+  NA, desc="Default";
+}
+
+// Message placed on a controller's local trigger queue; purely
+// control, so it never participates in functional accesses.
+structure(TriggerMsg, desc="...", interface="Message") {
+  Addr addr, desc="Address";
+  TriggerType Type, desc="Type of trigger";
+  CacheId Dest, default="CacheId_NA", desc="Cache to invalidate";
+  int ProgramCounter, desc="PC that accesses to this block";
+
+  bool functionalRead(Packet *pkt) {
+    return false;
+  }
+
+  bool functionalWrite(Packet *pkt) {
+    // No check on message type required since the protocol should
+    // read data from those messages that contain the block
+    return false;
+  }
+
+}
+
+// Classifies entries in the write-through FIFO.
+enumeration(FifoType, desc="Fifo Type") {
+  WriteDummy, desc="Dummy Write for atomic operation";
+  WriteThrough, desc="simple writethrough request";
+  WriteFlush, desc="synchronization message";
+}
+
+// Entry for the write-through/flush FIFO; control only, no data block.
+structure(FifoMsg, desc="...", interface="Message") {
+  Addr addr, desc="Address";
+  FifoType Type, desc="WriteThrough/WriteFlush";
+  int wfid, default="0",desc="wavefront id";
+  MachineID Requestor, desc="Flush Requestor";
+  MachineID oRequestor, desc="original Flush Requestor";
+
+  bool functionalRead(Packet *pkt) {
+    return false;
+  }
+
+  bool functionalWrite(Packet *pkt) {
+    // No check on message type required since the protocol should
+    // read data from those messages that contain the block
+    return false;
+  }
+
+}
diff --git a/src/mem/protocol/MOESI_AMD_Base-probeFilter.sm b/src/mem/protocol/MOESI_AMD_Base-probeFilter.sm
new file mode 100644
index 000000000..f545c2fa7
--- /dev/null
+++ b/src/mem/protocol/MOESI_AMD_Base-probeFilter.sm
@@ -0,0 +1,1408 @@
+/*
+ * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Lisa Hsu, Sooraj Puthoor
+ */
+
+/*
+ * This file is based on MOESI_AMD_Base.sm
+ * Differences with AMD base protocol
+ * -- Uses a probe filter memory to track sharers.
+ * -- The probe filter can be inclusive or non-inclusive
+ * -- Only two sharers tracked. Sharers are a) GPU or/and b) CPU
+ * -- If sharer information available, the sharer is probed
+ * -- If sharer information not available, probes are broadcasted
+ */
+
+machine(MachineType:Directory, "AMD Baseline protocol")
+: DirectoryMemory * directory;
+ CacheMemory * L3CacheMemory;
+ CacheMemory * ProbeFilterMemory;
+ Cycles response_latency := 5;
+ Cycles l3_hit_latency := 50;
+ bool noTCCdir := "False";
+ bool CAB_TCC := "False";
+ int TCC_select_num_bits:=1;
+ bool useL3OnWT := "False";
+ bool inclusiveDir := "True";
+ Cycles to_memory_controller_latency := 1;
+
+ // From the Cores
+ MessageBuffer * requestFromCores, network="From", virtual_network="0", ordered="false", vnet_type="request";
+ MessageBuffer * responseFromCores, network="From", virtual_network="2", ordered="false", vnet_type="response";
+ MessageBuffer * unblockFromCores, network="From", virtual_network="4", ordered="false", vnet_type="unblock";
+
+ MessageBuffer * probeToCore, network="To", virtual_network="0", ordered="false", vnet_type="request";
+ MessageBuffer * responseToCore, network="To", virtual_network="2", ordered="false", vnet_type="response";
+
+ MessageBuffer * triggerQueue, ordered="true";
+ MessageBuffer * L3triggerQueue, ordered="true";
+ MessageBuffer * responseFromMemory;
+{
+ // STATES
+  // Stable states are U (unblocked) and B (sent response, waiting for
+  // unblock ack); the rest are transient blocking states named for
+  // what they wait on: _P/_PM/_Pm = probes and/or memory, BS/BM/B_ =
+  // which request class is outstanding (see comments below).
+  state_declaration(State, desc="Directory states", default="Directory_State_U") {
+    U, AccessPermission:Backing_Store,                 desc="unblocked";
+    BL, AccessPermission:Busy,                  desc="got L3 WB request";
+    // BL is Busy because it is busy waiting for the data
+    // which is possibly in the network. The cache which evicted the data
+    // might have moved to some other state after doing the eviction
+    // BS==> Received a read request; has not requested ownership
+    // B==> Received a read request; has requested ownership
+    // BM==> Received a modification request
+    B_P, AccessPermission:Backing_Store,                      desc="Back invalidation, waiting for probes";
+    BS_M, AccessPermission:Backing_Store,                     desc="blocked waiting for memory";
+    BM_M, AccessPermission:Backing_Store,                     desc="blocked waiting for memory";
+    B_M, AccessPermission:Backing_Store,                     desc="blocked waiting for memory";
+    BP, AccessPermission:Backing_Store,                     desc="blocked waiting for probes, no need for memory";
+    BS_PM, AccessPermission:Backing_Store,                   desc="blocked waiting for probes and Memory";
+    BM_PM, AccessPermission:Backing_Store,                   desc="blocked waiting for probes and Memory";
+    B_PM, AccessPermission:Backing_Store,                    desc="blocked waiting for probes and Memory";
+    BS_Pm, AccessPermission:Backing_Store,                   desc="blocked waiting for probes, already got memory";
+    BM_Pm, AccessPermission:Backing_Store,                   desc="blocked waiting for probes, already got memory";
+    B_Pm, AccessPermission:Backing_Store,                    desc="blocked waiting for probes, already got memory";
+    B, AccessPermission:Backing_Store,                       desc="sent response, Blocked til ack";
+  }
+
+ // Events
+  // Directory events, raised by the in_ports below from incoming
+  // request/response/unblock/memory/trigger messages.
+  enumeration(Event, desc="Directory events") {
+    // CPU requests
+    RdBlkS, desc="...";
+    RdBlkM, desc="...";
+    RdBlk, desc="...";
+    CtoD, desc="...";
+    WriteThrough, desc="WriteThrough Message";
+    Atomic, desc="Atomic Message";
+
+    // writebacks
+    VicDirty, desc="...";
+    VicClean, desc="...";
+    CPUData, desc="WB data from CPU";
+    StaleWB, desc="Notification that WB has been superceded by a probe";
+
+    // probe responses
+    CPUPrbResp, desc="Probe Response Msg";
+
+    ProbeAcksComplete, desc="Probe Acks Complete";
+
+    L3Hit, desc="Hit in L3 return data to core";
+
+    // Replacement
+    PF_Repl, desc="Replace address from probe filter";
+
+    // Memory Controller
+    MemData, desc="Fetched data from memory arrives";
+    WBAck, desc="Writeback Ack from memory arrives";
+
+    CoreUnblock, desc="Core received data, unblock";
+    UnblockWriteThrough, desc="Unblock because of writethrough request finishing";
+
+    StaleVicDirty, desc="Core invalidated before VicDirty processed";
+  }
+
+  // Array-access classes reported to recordRequestType for stats and
+  // to checkResourceAvailable for bank/port modeling.
+  enumeration(RequestType, desc="To communicate stats from transitions to recordStats") {
+    L3DataArrayRead, desc="Read the data array";
+    L3DataArrayWrite, desc="Write the data array";
+    // NOTE(review): the desc strings on the four Tag/PF entries below
+    // say "data array" but these are tag-array accesses (see
+    // recordRequestType) — descriptions look copy-pasted.
+    L3TagArrayRead, desc="Read the data array";
+    L3TagArrayWrite, desc="Write the data array";
+
+    PFTagArrayRead, desc="Read the data array";
+    PFTagArrayWrite, desc="Write the data array";
+  }
+
+ // TYPES
+
+  // Per-line probe filter state: a tracked (T) entry knows which
+  // complex(es) hold the line; NT means untracked (must broadcast);
+  // B marks an entry mid-replacement.
+  enumeration(ProbeFilterState, desc="") {
+    T, desc="Tracked";
+    NT, desc="Not tracked";
+    B, desc="Blocked, This entry is being replaced";
+  }
+
+  // DirectoryEntry
+  structure(Entry, desc="...", interface="AbstractEntry") {
+    State DirectoryState,          desc="Directory state";
+    DataBlock DataBlk,             desc="data for the block";
+    NetDest VicDirtyIgnore,  desc="VicDirty coming from whom to ignore";
+  }
+
+  // Probe filter entry: tracks at most two sharers, the CPU complex
+  // and the GPU complex, via the two booleans below.
+  structure(CacheEntry, desc="...", interface="AbstractCacheEntry") {
+    DataBlock DataBlk,          desc="data for the block";
+    MachineID LastSender,       desc="Mach which this block came from";
+    ProbeFilterState pfState,   desc="ProbeFilter state",default="Directory_ProbeFilterState_NT";
+    bool isOnCPU,               desc="Block valid in the CPU complex",default="false";
+    bool isOnGPU,               desc="Block valid in the GPU complex",default="false";
+  }
+
+  // Transaction buffer entry for an in-flight directory transaction.
+  structure(TBE, desc="...") {
+    State TBEState,     desc="Transient state";
+    DataBlock DataBlk,  desc="data for the block";
+    bool Dirty,         desc="Is the data dirty?";
+    int NumPendingAcks,        desc="num acks expected";
+    MachineID OriginalRequestor,        desc="Original Requestor";
+    MachineID WTRequestor,        desc="WT Requestor";
+    bool Cached,        desc="data hit in Cache";
+    bool MemData,       desc="Got MemData?",default="false";
+    bool wtData,       desc="Got write through data?",default="false";
+    bool atomicData,   desc="Got Atomic op?",default="false";
+    Cycles InitialRequestTime, desc="...";
+    Cycles ForwardRequestTime, desc="...";
+    Cycles ProbeRequestStartTime, desc="...";
+    MachineID LastSender, desc="Mach which this block came from";
+    bool L3Hit, default="false", desc="Was this an L3 hit?";
+    uint64_t probe_id,        desc="probe id for lifetime profiling";
+    WriteMask writeMask,    desc="outstanding write through mask";
+    Addr demandAddress,       desc="Address of demand request which caused probe filter eviction";
+  }
+
+  structure(TBETable, external="yes") {
+    TBE lookup(Addr);
+    void allocate(Addr);
+    void deallocate(Addr);
+    bool isPresent(Addr);
+  }
+
+  TBETable TBEs, template="<Directory_TBE>", constructor="m_number_of_TBEs";
+
+  int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()";
+
+  // Externally-provided helpers (implemented by the generated C++).
+  Tick clockEdge();
+  Tick cyclesToTicks(Cycles c);
+
+  void set_tbe(TBE a);
+  void unset_tbe();
+  void wakeUpAllBuffers();
+  void wakeUpBuffers(Addr a);
+  Cycles curCycle();
+
+  // Look up the directory entry for addr, allocating a fresh one on a
+  // miss so callers always receive a valid entry.
+  Entry getDirectoryEntry(Addr addr), return_by_pointer="yes" {
+    Entry dir_entry := static_cast(Entry, "pointer", directory.lookup(addr));
+
+    if (is_valid(dir_entry)) {
+      //DPRINTF(RubySlicc, "Getting entry %s: %s\n", addr, dir_entry.DataBlk);
+      return dir_entry;
+    }
+
+    dir_entry :=  static_cast(Entry, "pointer",
+                              directory.allocate(addr, new Entry));
+    return dir_entry;
+  }
+
+  // Prefer the in-flight TBE copy of the block (when it already holds
+  // memory data) over the directory's backing-store copy.
+  DataBlock getDataBlock(Addr addr), return_by_ref="yes" {
+    TBE tbe := TBEs.lookup(addr);
+    if (is_valid(tbe) && tbe.MemData) {
+      DPRINTF(RubySlicc, "Returning DataBlk from TBE %s:%s\n", addr, tbe);
+      return tbe.DataBlk;
+    }
+    DPRINTF(RubySlicc, "Returning DataBlk from Dir %s:%s\n", addr, getDirectoryEntry(addr));
+    return getDirectoryEntry(addr).DataBlk;
+  }
+
+  // Directory state query. With an inclusive probe filter, a blocked
+  // (mid-replacement) filter entry overrides the directory state with
+  // B_P so new requests stall until the back-invalidation finishes.
+  State getState(TBE tbe, CacheEntry entry, Addr addr) {
+    CacheEntry probeFilterEntry := static_cast(CacheEntry, "pointer", ProbeFilterMemory.lookup(addr));
+    if (inclusiveDir) {
+      if (is_valid(probeFilterEntry) && probeFilterEntry.pfState == ProbeFilterState:B) {
+        return State:B_P;
+      }
+    }
+    return getDirectoryEntry(addr).DirectoryState;
+  }
+
+  void setState(TBE tbe, CacheEntry entry, Addr addr, State state) {
+    getDirectoryEntry(addr).DirectoryState := state;
+  }
+
+  // Functional read: serve from an in-flight TBE if one exists,
+  // otherwise fall through to backing memory.
+  void functionalRead(Addr addr, Packet *pkt) {
+    TBE tbe := TBEs.lookup(addr);
+    if(is_valid(tbe)) {
+      testAndRead(addr, tbe.DataBlk, pkt);
+    } else {
+      functionalMemoryRead(pkt);
+    }
+  }
+
+  // Functional write: update both the TBE copy (if any) and backing
+  // memory; returns the number of successful writes.
+  int functionalWrite(Addr addr, Packet *pkt) {
+    int num_functional_writes := 0;
+
+    TBE tbe := TBEs.lookup(addr);
+    if(is_valid(tbe)) {
+      num_functional_writes := num_functional_writes +
+            testAndWrite(addr, tbe.DataBlk, pkt);
+    }
+
+    num_functional_writes := num_functional_writes +
+        functionalMemoryWrite(pkt);
+    return num_functional_writes;
+  }
+
+  AccessPermission getAccessPermission(Addr addr) {
+    // For this Directory, all permissions are just tracked in Directory, since
+    // it's not possible to have something in TBE but not Dir, just keep track
+    // of state all in one place.
+    if (directory.isPresent(addr)) {
+      return Directory_State_to_permission(getDirectoryEntry(addr).DirectoryState);
+    }
+
+    return AccessPermission:NotPresent;
+  }
+
+  void setAccessPermission(CacheEntry entry, Addr addr, State state) {
+    getDirectoryEntry(addr).changePermission(Directory_State_to_permission(state));
+  }
+
+  // Map protocol-level RequestTypes onto the L3 / probe filter cache
+  // arrays for statistics recording.
+  void recordRequestType(RequestType request_type, Addr addr) {
+    if (request_type == RequestType:L3DataArrayRead) {
+      L3CacheMemory.recordRequestType(CacheRequestType:DataArrayRead, addr);
+    } else if (request_type == RequestType:L3DataArrayWrite) {
+      L3CacheMemory.recordRequestType(CacheRequestType:DataArrayWrite, addr);
+    } else if (request_type == RequestType:L3TagArrayRead) {
+      L3CacheMemory.recordRequestType(CacheRequestType:TagArrayRead, addr);
+    } else if (request_type == RequestType:L3TagArrayWrite) {
+      L3CacheMemory.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+    } else if (request_type == RequestType:PFTagArrayRead) {
+      ProbeFilterMemory.recordRequestType(CacheRequestType:TagArrayRead, addr);
+    } else if (request_type == RequestType:PFTagArrayWrite) {
+      ProbeFilterMemory.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+    }
+  }
+
+  // Resource (bank/port) availability check for the same arrays;
+  // errors out on an unknown request type.
+  bool checkResourceAvailable(RequestType request_type, Addr addr) {
+    if (request_type == RequestType:L3DataArrayRead) {
+      return L3CacheMemory.checkResourceAvailable(CacheResourceType:DataArray, addr);
+    } else if (request_type == RequestType:L3DataArrayWrite) {
+      return L3CacheMemory.checkResourceAvailable(CacheResourceType:DataArray, addr);
+    } else if (request_type == RequestType:L3TagArrayRead) {
+      return L3CacheMemory.checkResourceAvailable(CacheResourceType:TagArray, addr);
+    } else if (request_type == RequestType:L3TagArrayWrite) {
+      return L3CacheMemory.checkResourceAvailable(CacheResourceType:TagArray, addr);
+    } else if (request_type == RequestType:PFTagArrayRead) {
+      return ProbeFilterMemory.checkResourceAvailable(CacheResourceType:TagArray, addr);
+    } else if (request_type == RequestType:PFTagArrayWrite) {
+      return ProbeFilterMemory.checkResourceAvailable(CacheResourceType:TagArray, addr);
+    } else {
+      error("Invalid RequestType type in checkResourceAvailable");
+      return true;
+    }
+  }
+
+  // True when the probe filter has neither an entry for this address
+  // nor a free way for one, i.e. a replacement is needed first.
+  bool isNotPresentProbeFilter(Addr address) {
+    if (ProbeFilterMemory.isTagPresent(address) ||
+        ProbeFilterMemory.cacheAvail(address)) {
+      return false;
+    }
+    return true;
+  }
+
+  // An untracked (NT) entry gives no sharer information, so both
+  // sharer queries conservatively return true in that case.
+  bool isGPUSharer(Addr address) {
+    assert(ProbeFilterMemory.isTagPresent(address));
+    CacheEntry entry := static_cast(CacheEntry, "pointer", ProbeFilterMemory.lookup(address));
+    if (entry.pfState == ProbeFilterState:NT) {
+      return true;
+    } else if (entry.isOnGPU){
+      return true;
+    }
+    return false;
+  }
+
+  bool isCPUSharer(Addr address) {
+    assert(ProbeFilterMemory.isTagPresent(address));
+    CacheEntry entry := static_cast(CacheEntry, "pointer", ProbeFilterMemory.lookup(address));
+    if (entry.pfState == ProbeFilterState:NT) {
+      return true;
+    } else if (entry.isOnCPU){
+      return true;
+    }
+    return false;
+  }
+
+
+ // ** OUT_PORTS **
+ out_port(probeNetwork_out, NBProbeRequestMsg, probeToCore);
+ out_port(responseNetwork_out, ResponseMsg, responseToCore);
+
+ out_port(triggerQueue_out, TriggerMsg, triggerQueue);
+ out_port(L3TriggerQueue_out, TriggerMsg, L3triggerQueue);
+
+ // ** IN_PORTS **
+
+ // Trigger Queue
+  // Local trigger queue: self-posted events (probe-ack completion and
+  // write-through unblocks).
+  in_port(triggerQueue_in, TriggerMsg, triggerQueue, rank=5) {
+    if (triggerQueue_in.isReady(clockEdge())) {
+      peek(triggerQueue_in, TriggerMsg) {
+        TBE tbe := TBEs.lookup(in_msg.addr);
+        CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr));
+        if (in_msg.Type == TriggerType:AcksComplete) {
+          trigger(Event:ProbeAcksComplete, in_msg.addr, entry, tbe);
+        }else if (in_msg.Type == TriggerType:UnblockWriteThrough) {
+          trigger(Event:UnblockWriteThrough, in_msg.addr, entry, tbe);
+        } else {
+          error("Unknown trigger msg");
+        }
+      }
+    }
+  }
+
+  // Delayed L3-hit notifications (enqueued with l3_hit_latency).
+  in_port(L3TriggerQueue_in, TriggerMsg, L3triggerQueue, rank=4) {
+    if (L3TriggerQueue_in.isReady(clockEdge())) {
+      peek(L3TriggerQueue_in, TriggerMsg) {
+        TBE tbe := TBEs.lookup(in_msg.addr);
+        CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr));
+        if (in_msg.Type == TriggerType:L3Hit) {
+          trigger(Event:L3Hit, in_msg.addr, entry, tbe);
+        } else {
+          error("Unknown trigger msg");
+        }
+      }
+    }
+  }
+
+  // Unblock Network
+  in_port(unblockNetwork_in, UnblockMsg, unblockFromCores, rank=3) {
+    if (unblockNetwork_in.isReady(clockEdge())) {
+      peek(unblockNetwork_in, UnblockMsg) {
+        TBE tbe := TBEs.lookup(in_msg.addr);
+        CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr));
+        trigger(Event:CoreUnblock, in_msg.addr, entry, tbe);
+      }
+    }
+  }
+
+  // Core response network
+  in_port(responseNetwork_in, ResponseMsg, responseFromCores, rank=2) {
+    if (responseNetwork_in.isReady(clockEdge())) {
+      peek(responseNetwork_in, ResponseMsg) {
+        TBE tbe := TBEs.lookup(in_msg.addr);
+        CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr));
+        if (in_msg.Type == CoherenceResponseType:CPUPrbResp) {
+          trigger(Event:CPUPrbResp, in_msg.addr, entry, tbe);
+        } else if (in_msg.Type == CoherenceResponseType:CPUData) {
+          trigger(Event:CPUData, in_msg.addr, entry, tbe);
+        } else if (in_msg.Type == CoherenceResponseType:StaleNotif) {
+            trigger(Event:StaleWB, in_msg.addr, entry, tbe);
+        } else {
+          error("Unexpected response type");
+        }
+      }
+    }
+  }
+
+  // off-chip memory request/response is done
+  in_port(memQueue_in, MemoryMsg, responseFromMemory, rank=1) {
+    if (memQueue_in.isReady(clockEdge())) {
+      peek(memQueue_in, MemoryMsg) {
+        TBE tbe := TBEs.lookup(in_msg.addr);
+        CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr));
+        if (in_msg.Type == MemoryRequestType:MEMORY_READ) {
+          trigger(Event:MemData, in_msg.addr, entry, tbe);
+          DPRINTF(RubySlicc, "%s\n", in_msg);
+        } else if (in_msg.Type == MemoryRequestType:MEMORY_WB) {
+          trigger(Event:WBAck, in_msg.addr, entry, tbe); // ignore WBAcks, don't care about them.
+        } else {
+          DPRINTF(RubySlicc, "%s\n", in_msg.Type);
+          error("Invalid message");
+        }
+      }
+    }
+  }
+
+  // Core request network (lowest rank). For an inclusive probe filter,
+  // a request whose address has no filter entry and no free way first
+  // triggers a PF_Repl on the chosen victim before the request itself
+  // can be processed.
+  in_port(requestNetwork_in, CPURequestMsg, requestFromCores, rank=0) {
+    if (requestNetwork_in.isReady(clockEdge())) {
+      peek(requestNetwork_in, CPURequestMsg) {
+        TBE tbe := TBEs.lookup(in_msg.addr);
+        CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr));
+        if (inclusiveDir && isNotPresentProbeFilter(in_msg.addr)) {
+          Addr victim := ProbeFilterMemory.cacheProbe(in_msg.addr);
+          tbe := TBEs.lookup(victim);
+          entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(victim));
+          trigger(Event:PF_Repl, victim, entry, tbe);
+        } else if (in_msg.Type == CoherenceRequestType:RdBlk) {
+          trigger(Event:RdBlk, in_msg.addr, entry, tbe);
+        } else if (in_msg.Type == CoherenceRequestType:RdBlkS) {
+          trigger(Event:RdBlkS, in_msg.addr, entry, tbe);
+        } else if (in_msg.Type == CoherenceRequestType:RdBlkM) {
+          trigger(Event:RdBlkM, in_msg.addr, entry, tbe);
+        } else if (in_msg.Type == CoherenceRequestType:WriteThrough) {
+          trigger(Event:WriteThrough, in_msg.addr, entry, tbe);
+        } else if (in_msg.Type == CoherenceRequestType:Atomic) {
+          trigger(Event:Atomic, in_msg.addr, entry, tbe);
+        } else if (in_msg.Type == CoherenceRequestType:VicDirty) {
+          // Victim writebacks from a core that has since been probed
+          // are stale and must be dropped (StaleVicDirty).
+          if (getDirectoryEntry(in_msg.addr).VicDirtyIgnore.isElement(in_msg.Requestor)) {
+            DPRINTF(RubySlicc, "Dropping VicDirty for address %s\n", in_msg.addr);
+            trigger(Event:StaleVicDirty, in_msg.addr, entry, tbe);
+          } else {
+            DPRINTF(RubySlicc, "Got VicDirty from %s on %s\n", in_msg.Requestor, in_msg.addr);
+            trigger(Event:VicDirty, in_msg.addr, entry, tbe);
+          }
+        } else if (in_msg.Type == CoherenceRequestType:VicClean) {
+          if (getDirectoryEntry(in_msg.addr).VicDirtyIgnore.isElement(in_msg.Requestor)) {
+            DPRINTF(RubySlicc, "Dropping VicClean for address %s\n", in_msg.addr);
+            trigger(Event:StaleVicDirty, in_msg.addr, entry, tbe);
+          } else {
+            DPRINTF(RubySlicc, "Got VicClean from %s on %s\n", in_msg.Requestor, in_msg.addr);
+            trigger(Event:VicClean, in_msg.addr, entry, tbe);
+          }
+        } else {
+          error("Bad request message type");
+        }
+      }
+    }
+  }
+
+ // Actions
+  // Send a Shared-state data response to the original requestor; the
+  // Sender is reported as the L3 when the TBE recorded an L3 hit.
+  action(s_sendResponseS, "s", desc="send Shared response") {
+    enqueue(responseNetwork_out, ResponseMsg, response_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceResponseType:NBSysResp;
+      if (tbe.L3Hit) {
+        out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0));
+      } else {
+        out_msg.Sender := machineID;
+      }
+      out_msg.Destination.add(tbe.OriginalRequestor);
+      out_msg.DataBlk := tbe.DataBlk;
+      out_msg.MessageSize := MessageSizeType:Response_Data;
+      out_msg.Dirty := false;
+      out_msg.State := CoherenceState:Shared;
+      out_msg.InitialRequestTime := tbe.InitialRequestTime;
+      out_msg.ForwardRequestTime := tbe.ForwardRequestTime;
+      out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime;
+      out_msg.OriginalResponder := tbe.LastSender;
+      out_msg.L3Hit := tbe.L3Hit;
+      DPRINTF(RubySlicc, "%s\n", out_msg);
+    }
+  }
+
+  // Send a data response whose state depends on whether any other
+  // cache held the line: Shared if tbe.Cached, else Exclusive.
+  action(es_sendResponseES, "es", desc="send Exclusive or Shared response") {
+    enqueue(responseNetwork_out, ResponseMsg, response_latency) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceResponseType:NBSysResp;
+      if (tbe.L3Hit) {
+        out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0));
+      } else {
+        out_msg.Sender := machineID;
+      }
+      out_msg.Destination.add(tbe.OriginalRequestor);
+      out_msg.DataBlk := tbe.DataBlk;
+      out_msg.MessageSize := MessageSizeType:Response_Data;
+      out_msg.Dirty := tbe.Dirty;
+      if (tbe.Cached) {
+        out_msg.State := CoherenceState:Shared;
+      } else {
+        out_msg.State := CoherenceState:Exclusive;
+      }
+      out_msg.InitialRequestTime := tbe.InitialRequestTime;
+      out_msg.ForwardRequestTime := tbe.ForwardRequestTime;
+      out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime;
+      out_msg.OriginalResponder := tbe.LastSender;
+      out_msg.L3Hit := tbe.L3Hit;
+      DPRINTF(RubySlicc, "%s\n", out_msg);
+    }
+  }
+
+  // write-through and atomics do not send an unblock ack back to the
+  // directory. Hence, directory has to generate a self unblocking
+  // message. Additionally, write through's does not require data
+  // in its response. Hence, write through is treated seperately from
+  // write-back and atomics
+  action(m_sendResponseM, "m", desc="send Modified response") {
+    if (tbe.wtData) {
+      // Write-through: no data response needed; just self-unblock.
+      enqueue(triggerQueue_out, TriggerMsg, 1) {
+        out_msg.addr := address;
+        out_msg.Type := TriggerType:UnblockWriteThrough;
+      }
+    }else{
+      enqueue(responseNetwork_out, ResponseMsg, response_latency) {
+        out_msg.addr := address;
+        out_msg.Type := CoherenceResponseType:NBSysResp;
+        if (tbe.L3Hit) {
+          out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0));
+        } else {
+          out_msg.Sender := machineID;
+        }
+        out_msg.Destination.add(tbe.OriginalRequestor);
+        out_msg.DataBlk := tbe.DataBlk;
+        out_msg.MessageSize := MessageSizeType:Response_Data;
+        out_msg.Dirty := tbe.Dirty;
+        out_msg.State := CoherenceState:Modified;
+        out_msg.CtoD := false;
+        out_msg.InitialRequestTime := tbe.InitialRequestTime;
+        out_msg.ForwardRequestTime := tbe.ForwardRequestTime;
+        out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime;
+        out_msg.OriginalResponder := tbe.LastSender;
+        if(tbe.atomicData){
+          out_msg.WTRequestor := tbe.WTRequestor;
+        }
+        out_msg.L3Hit := tbe.L3Hit;
+        DPRINTF(RubySlicc, "%s\n", out_msg);
+      }
+      // Atomics also never send an unblock ack; self-unblock here too.
+      if (tbe.atomicData) {
+        enqueue(triggerQueue_out, TriggerMsg, 1) {
+          out_msg.addr := address;
+          out_msg.Type := TriggerType:UnblockWriteThrough;
+        }
+      }
+    }
+  }
+
+  // Ack a Change-to-Dirty: control-only response (no data), Modified
+  // state with CtoD set.
+  action(c_sendResponseCtoD, "c", desc="send CtoD Ack") {
+      enqueue(responseNetwork_out, ResponseMsg, response_latency) {
+        out_msg.addr := address;
+        out_msg.Type := CoherenceResponseType:NBSysResp;
+        out_msg.Sender := machineID;
+        out_msg.Destination.add(tbe.OriginalRequestor);
+        out_msg.MessageSize := MessageSizeType:Response_Control;
+        out_msg.Dirty := false;
+        out_msg.State := CoherenceState:Modified;
+        out_msg.CtoD := true;
+        out_msg.InitialRequestTime := tbe.InitialRequestTime;
+        out_msg.ForwardRequestTime := curCycle();
+        out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime;
+        DPRINTF(RubySlicc, "%s\n", out_msg);
+      }
+  }
+
+  // Acknowledge a victim writeback to the requesting core.
+  action(w_sendResponseWBAck, "w", desc="send WB Ack") {
+    peek(requestNetwork_in, CPURequestMsg) {
+      enqueue(responseNetwork_out, ResponseMsg, 1) {
+        out_msg.addr := address;
+        out_msg.Type := CoherenceResponseType:NBSysWBAck;
+        out_msg.Destination.add(in_msg.Requestor);
+        out_msg.WTRequestor := in_msg.WTRequestor;
+        out_msg.Sender := machineID;
+        out_msg.MessageSize := MessageSizeType:Writeback_Control;
+        out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+        out_msg.ForwardRequestTime := curCycle();
+        out_msg.ProbeRequestStartTime := curCycle();
+      }
+    }
+  }
+
+  // Forward writeback data from the response network to memory.
+  action(l_queueMemWBReq, "lq", desc="Write WB data to memory") {
+    peek(responseNetwork_in, ResponseMsg) {
+      queueMemoryWrite(machineID, address, to_memory_controller_latency,
+                       in_msg.DataBlk);
+    }
+  }
+
+  // Satisfy a read from the L3 if the tag is present (delayed via the
+  // L3 trigger queue, and the L3 entry is consumed/deallocated);
+  // otherwise issue a memory read.
+  action(l_queueMemRdReq, "lr", desc="Read data from memory") {
+    peek(requestNetwork_in, CPURequestMsg) {
+      if (L3CacheMemory.isTagPresent(address)) {
+        enqueue(L3TriggerQueue_out, TriggerMsg, l3_hit_latency) {
+          out_msg.addr := address;
+          out_msg.Type := TriggerType:L3Hit;
+          DPRINTF(RubySlicc, "%s\n", out_msg);
+        }
+        CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address));
+        tbe.DataBlk := entry.DataBlk;
+        tbe.LastSender := entry.LastSender;
+        tbe.L3Hit := true;
+        tbe.MemData := true;
+        L3CacheMemory.deallocate(address);
+      } else {
+        queueMemoryRead(machineID, address, to_memory_controller_latency);
+      }
+    }
+  }
+
+  // Invalidating probe to all tracked sharers (CPU complex and/or GPU
+  // side), excluding the requestor. If no destination remains, a
+  // loopback AcksComplete trigger is posted immediately.
+  action(dc_probeInvCoreData, "dc", desc="probe inv cores, return data") {
+    peek(requestNetwork_in, CPURequestMsg) {
+      enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) {
+        out_msg.addr := address;
+        out_msg.Type := ProbeRequestType:PrbInv;
+        out_msg.ReturnData := true;
+        out_msg.MessageSize := MessageSizeType:Control;
+        if(isCPUSharer(address)) {
+          out_msg.Destination.broadcast(MachineType:CorePair);  // won't be realistic for multisocket
+        }
+
+        // add relevant TCC node to list. This replaces all TCPs and SQCs
+        if(isGPUSharer(address)) {
+          if ((in_msg.Type == CoherenceRequestType:WriteThrough ||
+               in_msg.Type == CoherenceRequestType:Atomic) &&
+              in_msg.NoWriteConflict) {
+            // Don't Include TCCs unless there was write-CAB conflict in the TCC
+          } else if(noTCCdir) {
+            out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
+                                    TCC_select_low_bit, TCC_select_num_bits));
+          } else {
+            out_msg.Destination.add(map_Address_to_TCCdir(address));
+          }
+        }
+        out_msg.Destination.remove(in_msg.Requestor);
+        tbe.NumPendingAcks := out_msg.Destination.count();
+        if (tbe.NumPendingAcks == 0) {
+          enqueue(triggerQueue_out, TriggerMsg, 1) {
+            out_msg.addr := address;
+            out_msg.Type := TriggerType:AcksComplete;
+          }
+        }
+        DPRINTF(RubySlicc, "%s\n", out_msg);
+        APPEND_TRANSITION_COMMENT(" dc: Acks remaining: ");
+        APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
+        tbe.ProbeRequestStartTime := curCycle();
+      }
+    }
+  }
+
+  // Back-invalidation probe used for probe filter replacements: probe
+  // every tracked sharer of the victim line (no requestor to exclude).
+  action(bp_backProbe, "bp", desc="back probe") {
+    enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) {
+      out_msg.addr := address;
+      out_msg.Type := ProbeRequestType:PrbInv;
+      out_msg.ReturnData := true;
+      out_msg.MessageSize := MessageSizeType:Control;
+      if(isCPUSharer(address)) {
+        // won't be realistic for multisocket
+        out_msg.Destination.broadcast(MachineType:CorePair);
+      }
+      // add relevant TCC node to the list. This replaces all TCPs and SQCs
+      if(isGPUSharer(address)) {
+        if (noTCCdir) {
+          //Don't need to notify TCC about reads
+        } else {
+          out_msg.Destination.add(map_Address_to_TCCdir(address));
+          // NOTE(review): this increment is unconditionally overwritten
+          // by the Destination.count() assignment below (which already
+          // includes the TCCdir just added) — it appears redundant;
+          // confirm against the other probe actions.
+          tbe.NumPendingAcks := tbe.NumPendingAcks + 1;
+        }
+        if (noTCCdir && CAB_TCC) {
+          out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
+                                  TCC_select_low_bit, TCC_select_num_bits));
+        }
+      }
+      tbe.NumPendingAcks := out_msg.Destination.count();
+      if (tbe.NumPendingAcks == 0) {
+        enqueue(triggerQueue_out, TriggerMsg, 1) {
+          out_msg.addr := address;
+          out_msg.Type := TriggerType:AcksComplete;
+        }
+      }
+      DPRINTF(RubySlicc, "%s\n", (out_msg));
+      // NOTE(review): transition comment says "sc:" but this is the
+      // "bp" (back probe) action — likely copy-pasted label.
+      APPEND_TRANSITION_COMMENT(" sc: Acks remaining: ");
+      APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
+      APPEND_TRANSITION_COMMENT(" - back probe");
+      tbe.ProbeRequestStartTime := curCycle();
+    }
+  }
+
+ // Downgrade-probe all sharers and ask for the data; used by shared-intent
+ // requests (RdBlkS, RdBlk). Sharers keep a shared copy (PrbDowngrade).
+ action(sc_probeShrCoreData, "sc", desc="probe shared cores, return data") {
+ peek(requestNetwork_in, CPURequestMsg) { // not the right network?
+ enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := ProbeRequestType:PrbDowngrade;
+ out_msg.ReturnData := true;
+ out_msg.MessageSize := MessageSizeType:Control;
+ if(isCPUSharer(address)) {
+ out_msg.Destination.broadcast(MachineType:CorePair); // won't be realistic for multisocket
+ }
+ // add relevant TCC node to the list. This replaces all TCPs and SQCs
+ if(isGPUSharer(address)) {
+ if (noTCCdir) {
+ //Don't need to notify TCC about reads
+ } else {
+ out_msg.Destination.add(map_Address_to_TCCdir(address));
+ }
+ if (noTCCdir && CAB_TCC) {
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
+ TCC_select_low_bit, TCC_select_num_bits));
+ }
+ }
+ // The requestor never probes itself.
+ out_msg.Destination.remove(in_msg.Requestor);
+ // NumPendingAcks is derived solely from the final destination set;
+ // the manual "+ 1" the original did when adding the TCCdir was dead
+ // code because it was unconditionally overwritten here.
+ tbe.NumPendingAcks := out_msg.Destination.count();
+ if (tbe.NumPendingAcks == 0) {
+ enqueue(triggerQueue_out, TriggerMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := TriggerType:AcksComplete;
+ }
+ }
+ DPRINTF(RubySlicc, "%s\n", (out_msg));
+ APPEND_TRANSITION_COMMENT(" sc: Acks remaining: ");
+ APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
+ tbe.ProbeRequestStartTime := curCycle();
+ }
+ }
+ }
+
+ // Invalidate all sharers without requesting data back; used by CtoD
+ // (change-to-dirty), where the requestor already holds valid data.
+ action(ic_probeInvCore, "ic", desc="probe invalidate core, no return data needed") {
+ peek(requestNetwork_in, CPURequestMsg) { // not the right network?
+ enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := ProbeRequestType:PrbInv;
+ out_msg.ReturnData := false;
+ out_msg.MessageSize := MessageSizeType:Control;
+ if(isCPUSharer(address)) {
+ out_msg.Destination.broadcast(MachineType:CorePair); // won't be realistic for multisocket
+ }
+
+ // add relevant TCC node to the list. This replaces all TCPs and SQCs
+ if(isGPUSharer(address)) {
+ if (noTCCdir) {
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
+ TCC_select_low_bit, TCC_select_num_bits));
+ } else {
+ out_msg.Destination.add(map_Address_to_TCCdir(address));
+ }
+ }
+ // The requestor never probes itself.
+ out_msg.Destination.remove(in_msg.Requestor);
+ tbe.NumPendingAcks := out_msg.Destination.count();
+ // No sharers left to probe: fire AcksComplete immediately.
+ if (tbe.NumPendingAcks == 0) {
+ enqueue(triggerQueue_out, TriggerMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := TriggerType:AcksComplete;
+ }
+ }
+ APPEND_TRANSITION_COMMENT(" ic: Acks remaining: ");
+ APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ tbe.ProbeRequestStartTime := curCycle();
+ }
+ }
+ }
+
+ // Bump this address's probe filter entry in the replacement order.
+ action(sm_setMRU, "sm", desc="set probe filter entry as MRU") {
+ ProbeFilterMemory.setMRU(address);
+ }
+
+ // Commit a data response (e.g. a writeback) into the directory's copy.
+ action(d_writeDataToMemory, "d", desc="Write data to memory") {
+ peek(responseNetwork_in, ResponseMsg) {
+ getDirectoryEntry(address).DataBlk := in_msg.DataBlk;
+ DPRINTF(RubySlicc, "Writing Data: %s to address %s\n", in_msg.DataBlk,
+ in_msg.addr);
+ }
+ }
+
+ // TBE for a probe-filter eviction (PF_Repl). There is no requestor
+ // message to peek here, so every field is initialized to a clean,
+ // non-write state before the back probe is launched.
+ action(te_allocateTBEForEviction, "te", desc="allocate TBE Entry") {
+ check_allocate(TBEs);
+ TBEs.allocate(address);
+ set_tbe(TBEs.lookup(address));
+ tbe.writeMask.clear();
+ tbe.wtData := false;
+ tbe.atomicData := false;
+ tbe.DataBlk := getDirectoryEntry(address).DataBlk; // Data only for WBs
+ tbe.Dirty := false;
+ tbe.NumPendingAcks := 0;
+ }
+
+ // Allocate a TBE for a demand request and snapshot everything needed to
+ // finish it: write mask and payload for WriteThrough/Atomic, requestor
+ // identity, and request timing.
+ action(t_allocateTBE, "t", desc="allocate TBE Entry") {
+ check_allocate(TBEs);
+ peek(requestNetwork_in, CPURequestMsg) {
+ TBEs.allocate(address);
+ set_tbe(TBEs.lookup(address));
+ if (in_msg.Type == CoherenceRequestType:WriteThrough) {
+ tbe.writeMask.clear();
+ tbe.writeMask.orMask(in_msg.writeMask);
+ tbe.wtData := true;
+ tbe.WTRequestor := in_msg.WTRequestor;
+ tbe.LastSender := in_msg.Requestor;
+ }
+ if (in_msg.Type == CoherenceRequestType:Atomic) {
+ tbe.writeMask.clear();
+ tbe.writeMask.orMask(in_msg.writeMask);
+ tbe.atomicData := true;
+ tbe.WTRequestor := in_msg.WTRequestor;
+ tbe.LastSender := in_msg.Requestor;
+ }
+ tbe.DataBlk := getDirectoryEntry(address).DataBlk; // Data only for WBs
+ tbe.Dirty := false;
+ // Ordering matters: the partial write is merged only after the
+ // directory data has been loaded into tbe.DataBlk above.
+ if (in_msg.Type == CoherenceRequestType:WriteThrough) {
+ tbe.DataBlk.copyPartial(in_msg.DataBlk,tbe.writeMask);
+ tbe.Dirty := false;
+ }
+ tbe.OriginalRequestor := in_msg.Requestor;
+ tbe.NumPendingAcks := 0;
+ tbe.Cached := in_msg.ForceShared;
+ tbe.InitialRequestTime := in_msg.InitialRequestTime;
+ }
+ }
+
+ // Release the TBE; if no probe returned dirty data, the TBE's block is
+ // the authoritative copy and is written back to the directory first.
+ action(dt_deallocateTBE, "dt", desc="deallocate TBE Entry") {
+ if (tbe.Dirty == false) {
+ getDirectoryEntry(address).DataBlk := tbe.DataBlk;
+ }
+ TBEs.deallocate(address);
+ unset_tbe();
+ }
+
+ // Merge the transaction's result into the directory data block:
+ // write-through merges the masked bytes, atomics apply the partial
+ // atomic op, otherwise a clean block is written back unchanged.
+ action(wd_writeBackData, "wd", desc="Write back data if needed") {
+ if (tbe.wtData) {
+ DataBlock tmp := getDirectoryEntry(address).DataBlk;
+ tmp.copyPartial(tbe.DataBlk,tbe.writeMask);
+ tbe.DataBlk := tmp;
+ getDirectoryEntry(address).DataBlk := tbe.DataBlk;
+ } else if (tbe.atomicData) {
+ tbe.DataBlk.atomicPartial(getDirectoryEntry(address).DataBlk,
+ tbe.writeMask);
+ getDirectoryEntry(address).DataBlk := tbe.DataBlk;
+ } else if (tbe.Dirty == false) {
+ getDirectoryEntry(address).DataBlk := tbe.DataBlk;
+ }
+ }
+
+ // Record that memory has responded. Write-throughs already carry their
+ // data; dirty probe data (if any) must not be overwritten.
+ action(mt_writeMemDataToTBE, "mt", desc="write Mem data to TBE") {
+ peek(memQueue_in, MemoryMsg) {
+ if (tbe.wtData == true) {
+ // DO Nothing (already have the directory data)
+ } else if (tbe.Dirty == false) {
+ tbe.DataBlk := getDirectoryEntry(address).DataBlk;
+ }
+ tbe.MemData := true;
+ }
+ }
+
+ // Fold a probe response into the TBE. Dirty responses either merge under
+ // the write mask (write-through), are cross-checked against an earlier
+ // dirty copy (double-data sanity assert), or become the new TBE data.
+ action(y_writeProbeDataToTBE, "y", desc="write Probe Data to TBE") {
+ peek(responseNetwork_in, ResponseMsg) {
+ if (in_msg.Dirty) {
+ DPRINTF(RubySlicc, "Got dirty data for %s from %s\n", address, in_msg.Sender);
+ DPRINTF(RubySlicc, "Data is %s\n", in_msg.DataBlk);
+ if (tbe.wtData) {
+ // Keep the requestor's masked bytes on top of the probed data.
+ DataBlock tmp := in_msg.DataBlk;
+ tmp.copyPartial(tbe.DataBlk,tbe.writeMask);
+ tbe.DataBlk := tmp;
+ } else if (tbe.Dirty) {
+ if(tbe.atomicData == false && tbe.wtData == false) {
+ DPRINTF(RubySlicc, "Got double data for %s from %s\n", address, in_msg.Sender);
+ assert(tbe.DataBlk == in_msg.DataBlk); // in case of double data
+ }
+ } else {
+ tbe.DataBlk := in_msg.DataBlk;
+ tbe.Dirty := in_msg.Dirty;
+ tbe.LastSender := in_msg.Sender;
+ }
+ }
+ // Any hit means some core still caches the line (forces shared grant).
+ if (in_msg.Hit) {
+ tbe.Cached := true;
+ }
+ }
+ }
+
+ // Remember that this sender's upcoming VicDirty must be sunk (it was
+ // invalidated by a probe that crossed paths with its writeback).
+ action(mwc_markSinkWriteCancel, "mwc", desc="Mark to sink impending VicDirty") {
+ peek(responseNetwork_in, ResponseMsg) {
+ DPRINTF(RubySlicc, "Write cancel bit set on address %s\n", address);
+ getDirectoryEntry(address).VicDirtyIgnore.add(in_msg.Sender);
+ APPEND_TRANSITION_COMMENT(" setting bit to sink VicDirty ");
+ }
+ }
+
+ // One probe ack consumed.
+ action(x_decrementAcks, "x", desc="decrement Acks pending") {
+ tbe.NumPendingAcks := tbe.NumPendingAcks - 1;
+ APPEND_TRANSITION_COMMENT(" Acks remaining: ");
+ APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
+ }
+
+ // If the last ack just arrived, schedule the AcksComplete trigger.
+ action(o_checkForCompletion, "o", desc="check for ack completion") {
+ if (tbe.NumPendingAcks == 0) {
+ enqueue(triggerQueue_out, TriggerMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := TriggerType:AcksComplete;
+ }
+ }
+ APPEND_TRANSITION_COMMENT(" Check: Acks remaining: ");
+ APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
+ }
+
+ // The stale VicDirty has now arrived and been sunk; stop ignoring it.
+ action(rv_removeVicDirtyIgnore, "rv", desc="Remove ignored core") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ getDirectoryEntry(address).VicDirtyIgnore.remove(in_msg.Requestor);
+ }
+ }
+
+ // Install writeback data in the L3. On a tag miss, make room first by
+ // writing the chosen victim's data out to memory.
+ action(al_allocateL3Block, "al", desc="allocate the L3 block on WB") {
+ peek(responseNetwork_in, ResponseMsg) {
+ if (L3CacheMemory.isTagPresent(address)) {
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address));
+ APPEND_TRANSITION_COMMENT(" al wrote data to L3 (hit) ");
+ entry.DataBlk := in_msg.DataBlk;
+ entry.LastSender := in_msg.Sender;
+ } else {
+ if (L3CacheMemory.cacheAvail(address) == false) {
+ Addr victim := L3CacheMemory.cacheProbe(address);
+ CacheEntry victim_entry := static_cast(CacheEntry, "pointer",
+ L3CacheMemory.lookup(victim));
+ queueMemoryWrite(machineID, victim, to_memory_controller_latency,
+ victim_entry.DataBlk);
+ L3CacheMemory.deallocate(victim);
+ }
+ assert(L3CacheMemory.cacheAvail(address));
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.allocate(address, new CacheEntry));
+ APPEND_TRANSITION_COMMENT(" al wrote data to L3 ");
+ entry.DataBlk := in_msg.DataBlk;
+
+ entry.LastSender := in_msg.Sender;
+ }
+ }
+ }
+
+ // Install write-through/atomic result data in the L3 (only when the
+ // useL3OnWT policy is enabled), evicting a victim to memory on a miss.
+ // Fixed: the debug transition comments said " al " (copy-paste from the
+ // writeback action above); they now correctly identify " alwt ".
+ action(alwt_allocateL3BlockOnWT, "alwt", desc="allocate the L3 block on WT") {
+ if ((tbe.wtData || tbe.atomicData) && useL3OnWT) {
+ if (L3CacheMemory.isTagPresent(address)) {
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address));
+ APPEND_TRANSITION_COMMENT(" alwt wrote data to L3 (hit) ");
+ entry.DataBlk := tbe.DataBlk;
+ entry.LastSender := tbe.LastSender;
+ } else {
+ if (L3CacheMemory.cacheAvail(address) == false) {
+ Addr victim := L3CacheMemory.cacheProbe(address);
+ CacheEntry victim_entry := static_cast(CacheEntry, "pointer",
+ L3CacheMemory.lookup(victim));
+ queueMemoryWrite(machineID, victim, to_memory_controller_latency,
+ victim_entry.DataBlk);
+ L3CacheMemory.deallocate(victim);
+ }
+ assert(L3CacheMemory.cacheAvail(address));
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.allocate(address, new CacheEntry));
+ APPEND_TRANSITION_COMMENT(" alwt wrote data to L3 ");
+ entry.DataBlk := tbe.DataBlk;
+ entry.LastSender := tbe.LastSender;
+ }
+ }
+ }
+
+ // Allocate a probe filter entry for a new address if one is not already
+ // present. (Fixed typo in desc: "filte" -> "filter".)
+ action(apf_allocateProbeFilterEntry, "apf", desc="Allocate probe filter entry") {
+ if (!ProbeFilterMemory.isTagPresent(address)) {
+ // In inclusive mode a free way is expected here — presumably room is
+ // guaranteed via the PF_Repl back-probe path; TODO confirm. In
+ // non-inclusive mode a victim entry can simply be dropped.
+ if (inclusiveDir) {
+ assert(ProbeFilterMemory.cacheAvail(address));
+ } else if (ProbeFilterMemory.cacheAvail(address) == false) {
+ Addr victim := ProbeFilterMemory.cacheProbe(address);
+ ProbeFilterMemory.deallocate(victim);
+ }
+ assert(ProbeFilterMemory.cacheAvail(address));
+ CacheEntry entry := static_cast(CacheEntry, "pointer", ProbeFilterMemory.allocate(address, new CacheEntry));
+ APPEND_TRANSITION_COMMENT(" allocating a new probe filter entry");
+ // Fresh entries start in NT; inclusive directories track every line,
+ // so they start in T instead.
+ entry.pfState := ProbeFilterState:NT;
+ if (inclusiveDir) {
+ entry.pfState := ProbeFilterState:T;
+ }
+ entry.isOnCPU := false;
+ entry.isOnGPU := false;
+ }
+ }
+
+ // Move the PF entry to the transient B state while its back probe is in
+ // flight, and remember the demand address that triggered the eviction so
+ // its stalled request can be woken later.
+ action(mpfe_markPFEntryForEviction, "mpfe", desc="Mark this PF entry is being evicted") {
+ assert(ProbeFilterMemory.isTagPresent(address));
+ CacheEntry entry := static_cast(CacheEntry, "pointer", ProbeFilterMemory.lookup(address));
+ entry.pfState := ProbeFilterState:B;
+ peek(requestNetwork_in, CPURequestMsg) {
+ tbe.demandAddress := in_msg.addr;
+ }
+ }
+
+ // Wake requests stalled on both the victim line and the demand line.
+ action(we_wakeUpEvictionDependents, "we", desc="Wake up requests waiting for demand address and victim address") {
+ wakeUpBuffers(address);
+ wakeUpBuffers(tbe.demandAddress);
+ }
+
+ // Drop the probe filter entry once the eviction is complete.
+ action(dpf_deallocateProbeFilter, "dpf", desc="deallocate PF entry") {
+ assert(ProbeFilterMemory.isTagPresent(address));
+ ProbeFilterMemory.deallocate(address);
+ }
+
+ // Update the probe filter entry for an incoming request: exclusive-intent
+ // requests move the entry to T and clear the sharer bits, then the
+ // requestor's side (CPU or GPU) is recorded as a sharer.
+ action(upf_updateProbeFilter, "upf", desc="update probe filter on a request") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ assert(ProbeFilterMemory.isTagPresent(address));
+ CacheEntry entry := static_cast(CacheEntry, "pointer", ProbeFilterMemory.lookup(address));
+ // The original had four byte-identical branches for these four
+ // request types; collapsed into a single condition.
+ if (in_msg.Type == CoherenceRequestType:WriteThrough ||
+ in_msg.Type == CoherenceRequestType:Atomic ||
+ in_msg.Type == CoherenceRequestType:RdBlkM ||
+ in_msg.Type == CoherenceRequestType:CtoD) {
+ entry.pfState := ProbeFilterState:T;
+ entry.isOnCPU := false;
+ entry.isOnGPU := false;
+ }
+ if(machineIDToMachineType(in_msg.Requestor) == MachineType:CorePair) {
+ entry.isOnCPU := true;
+ } else {
+ entry.isOnGPU := true;
+ }
+ }
+ }
+
+ // On a CPU victimization (VicDirty/VicClean) clear the CPU sharer bit —
+ // the CorePair's L2 is inclusive, so eviction means no CPU copy remains.
+ // The original's two identical Vic branches are collapsed into one.
+ action(rmcd_removeSharerConditional, "rmcd", desc="remove sharer from probe Filter, conditional") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ if (ProbeFilterMemory.isTagPresent(address)) {
+ CacheEntry entry := static_cast(CacheEntry, "pointer", ProbeFilterMemory.lookup(address));
+ if(machineIDToMachineType(in_msg.Requestor) == MachineType:CorePair) {//CorePair has inclusive L2
+ if (in_msg.Type == CoherenceRequestType:VicDirty ||
+ in_msg.Type == CoherenceRequestType:VicClean) {
+ entry.isOnCPU := false;
+ }
+ }
+ }
+ }
+ }
+
+ // --- Small bookkeeping actions: timing, L3 drop, queue pops/recycles,
+ // stall/wake helpers. ---
+ action(sf_setForwardReqTime, "sf", desc="record when probes were forwarded") {
+ tbe.ForwardRequestTime := curCycle();
+ }
+
+ action(dl_deallocateL3, "dl", desc="deallocate the L3 block") {
+ L3CacheMemory.deallocate(address);
+ }
+
+ action(p_popRequestQueue, "p", desc="pop request queue") {
+ requestNetwork_in.dequeue(clockEdge());
+ }
+
+ action(pr_popResponseQueue, "pr", desc="pop response queue") {
+ responseNetwork_in.dequeue(clockEdge());
+ }
+
+ action(pm_popMemQueue, "pm", desc="pop mem queue") {
+ memQueue_in.dequeue(clockEdge());
+ }
+
+ action(pt_popTriggerQueue, "pt", desc="pop trigger queue") {
+ triggerQueue_in.dequeue(clockEdge());
+ }
+
+ action(ptl_popTriggerQueue, "ptl", desc="pop L3 trigger queue") {
+ L3TriggerQueue_in.dequeue(clockEdge());
+ }
+
+ action(pu_popUnblockQueue, "pu", desc="pop unblock queue") {
+ unblockNetwork_in.dequeue(clockEdge());
+ }
+
+ // recycle = retry the head message after recycle_latency, in arrival
+ // order; contrast with st_stallAndWaitRequest, which parks the message
+ // until an explicit wakeUpBuffers call.
+ action(zz_recycleRequestQueue, "zz", desc="recycle request queue") {
+ requestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+ }
+
+ action(yy_recycleResponseQueue, "yy", desc="recycle response queue") {
+ responseNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+ }
+
+ action(st_stallAndWaitRequest, "st", desc="Stall and wait on the address") {
+ stall_and_wait(requestNetwork_in, address);
+ }
+
+ action(wa_wakeUpDependents, "wa", desc="Wake up any requests waiting for this address") {
+ wakeUpBuffers(address);
+ }
+
+ action(wa_wakeUpAllDependents, "waa", desc="Wake up any requests waiting for this region") {
+ wakeUpAllBuffers();
+ }
+
+ // Deliberate no-op: leave the message at the head of its queue.
+ action(z_stall, "z", desc="...") {
+ }
+
+ // TRANSITIONS
+ // Any demand request that arrives while the line is mid-transaction is
+ // parked until the finishing transition calls wa_wakeUpDependents.
+ transition({BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B_P, B}, {RdBlkS, RdBlkM, RdBlk, CtoD}) {
+ st_stallAndWaitRequest;
+ }
+
+ // It may be possible to save multiple invalidations here!
+ transition({BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B_P, B}, {Atomic, WriteThrough}) {
+ st_stallAndWaitRequest;
+ }
+
+
+ // transitions from U
+ // Probe filter replacement: back-probe the victim's sharers; B_P blocks
+ // both the victim address and the demand address that triggered it.
+ transition(U, PF_Repl, B_P) {PFTagArrayRead, PFTagArrayWrite}{
+ te_allocateTBEForEviction;
+ apf_allocateProbeFilterEntry;
+ bp_backProbe;
+ sm_setMRU;
+ mpfe_markPFEntryForEviction;
+ }
+
+ // Shared read: downgrade-probe sharers and fetch from memory/L3 in
+ // parallel ("_PM" = probes and memory both outstanding).
+ transition(U, {RdBlkS}, BS_PM) {L3TagArrayRead, PFTagArrayRead, PFTagArrayWrite} {
+ t_allocateTBE;
+ apf_allocateProbeFilterEntry;
+ l_queueMemRdReq;
+ sc_probeShrCoreData;
+ sm_setMRU;
+ upf_updateProbeFilter;
+ p_popRequestQueue;
+ }
+
+ // Write-through: ack the writer immediately, invalidate-probe sharers.
+ transition(U, WriteThrough, BM_PM) {L3TagArrayRead, L3TagArrayWrite, PFTagArrayRead, PFTagArrayWrite} {
+ t_allocateTBE;
+ apf_allocateProbeFilterEntry;
+ w_sendResponseWBAck;
+ l_queueMemRdReq;
+ dc_probeInvCoreData;
+ sm_setMRU;
+ upf_updateProbeFilter;
+ p_popRequestQueue;
+ }
+
+ transition(U, Atomic, BM_PM) {L3TagArrayRead, L3TagArrayWrite, PFTagArrayRead, PFTagArrayWrite} {
+ t_allocateTBE;
+ apf_allocateProbeFilterEntry;
+ l_queueMemRdReq;
+ dc_probeInvCoreData;
+ sm_setMRU;
+ upf_updateProbeFilter;
+ p_popRequestQueue;
+ }
+
+ transition(U, {RdBlkM}, BM_PM) {L3TagArrayRead, PFTagArrayRead, PFTagArrayWrite} {
+ t_allocateTBE;
+ apf_allocateProbeFilterEntry;
+ l_queueMemRdReq;
+ dc_probeInvCoreData;
+ sm_setMRU;
+ upf_updateProbeFilter;
+ p_popRequestQueue;
+ }
+
+ transition(U, RdBlk, B_PM) {L3TagArrayRead, PFTagArrayRead, PFTagArrayWrite}{
+ t_allocateTBE;
+ apf_allocateProbeFilterEntry;
+ l_queueMemRdReq;
+ sc_probeShrCoreData;
+ sm_setMRU;
+ upf_updateProbeFilter;
+ p_popRequestQueue;
+ }
+
+ // CtoD: requestor already has the data, so probes carry no return data
+ // and no memory read is issued.
+ transition(U, CtoD, BP) {L3TagArrayRead, PFTagArrayRead, PFTagArrayWrite} {
+ t_allocateTBE;
+ apf_allocateProbeFilterEntry;
+ ic_probeInvCore;
+ sm_setMRU;
+ upf_updateProbeFilter;
+ p_popRequestQueue;
+ }
+
+ // Victimizations: ack the core and wait in BL for the data (CPUData).
+ transition(U, VicDirty, BL) {L3TagArrayRead} {
+ t_allocateTBE;
+ w_sendResponseWBAck;
+ rmcd_removeSharerConditional;
+ p_popRequestQueue;
+ }
+
+ transition(U, VicClean, BL) {L3TagArrayRead} {
+ t_allocateTBE;
+ w_sendResponseWBAck;
+ rmcd_removeSharerConditional;
+ p_popRequestQueue;
+ }
+
+ // A second victimization while one is in flight is retried later.
+ transition(BL, {VicDirty, VicClean}) {
+ zz_recycleRequestQueue;
+ }
+
+ // Writeback data arrives: commit to directory and L3, wake waiters.
+ transition(BL, CPUData, U) {L3TagArrayWrite, L3DataArrayWrite} {
+ d_writeDataToMemory;
+ al_allocateL3Block;
+ wa_wakeUpDependents;
+ dt_deallocateTBE;
+ //l_queueMemWBReq; // why need an ack? esp. with DRAMSim, just put it in queue no ack needed
+ pr_popResponseQueue;
+ }
+
+ // The expected writeback was cancelled by a crossing probe.
+ transition(BL, StaleWB, U) {L3TagArrayWrite} {
+ dt_deallocateTBE;
+ wa_wakeUpAllDependents;
+ pr_popResponseQueue;
+ }
+
+ transition({B, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B_P}, {VicDirty, VicClean}) {
+ z_stall;
+ }
+
+ transition({U, BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B_P, B}, WBAck) {
+ pm_popMemQueue;
+ }
+
+ // A PF replacement cannot start while the line is busy; retry later.
+ transition({BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B_P, B}, PF_Repl) {
+ zz_recycleRequestQueue;
+ }
+
+ // Sink a VicDirty that was already cancelled (see mwc/rv actions).
+ transition({U, BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B_P, B}, StaleVicDirty) {
+ rv_removeVicDirtyIgnore;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+ transition({B}, CoreUnblock, U) {
+ wa_wakeUpDependents;
+ pu_popUnblockQueue;
+ }
+
+ transition(B, UnblockWriteThrough, U) {
+ wa_wakeUpDependents;
+ pt_popTriggerQueue;
+ }
+
+ // Memory (or L3) responded while probes are still outstanding:
+ // "_PM" (probes + memory pending) -> "_Pm" (probes pending only).
+ transition(BS_PM, MemData, BS_Pm) {} {
+ mt_writeMemDataToTBE;
+ pm_popMemQueue;
+ }
+
+ transition(BM_PM, MemData, BM_Pm){} {
+ mt_writeMemDataToTBE;
+ pm_popMemQueue;
+ }
+
+ transition(B_PM, MemData, B_Pm){} {
+ mt_writeMemDataToTBE;
+ pm_popMemQueue;
+ }
+
+ transition(BS_PM, L3Hit, BS_Pm) {} {
+ ptl_popTriggerQueue;
+ }
+
+ transition(BM_PM, L3Hit, BM_Pm) {} {
+ ptl_popTriggerQueue;
+ }
+
+ transition(B_PM, L3Hit, B_Pm) {} {
+ ptl_popTriggerQueue;
+ }
+
+ // Probes already done, memory data arrives last: respond (S/M/E-or-S),
+ // write back, optionally install in L3, and finish the transaction.
+ transition(BS_M, MemData, B){L3TagArrayWrite, L3DataArrayWrite} {
+ mt_writeMemDataToTBE;
+ s_sendResponseS;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ dt_deallocateTBE;
+ pm_popMemQueue;
+ }
+
+ transition(BM_M, MemData, B){L3TagArrayWrite, L3DataArrayWrite} {
+ mt_writeMemDataToTBE;
+ m_sendResponseM;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ dt_deallocateTBE;
+ pm_popMemQueue;
+ }
+
+ transition(B_M, MemData, B){L3TagArrayWrite, L3DataArrayWrite} {
+ mt_writeMemDataToTBE;
+ es_sendResponseES;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ dt_deallocateTBE;
+ pm_popMemQueue;
+ }
+
+ transition(BS_M, L3Hit, B) {L3TagArrayWrite, L3DataArrayWrite} {
+ s_sendResponseS;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ dt_deallocateTBE;
+ ptl_popTriggerQueue;
+ }
+
+ transition(BM_M, L3Hit, B) {L3DataArrayWrite, L3TagArrayWrite} {
+ m_sendResponseM;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ dt_deallocateTBE;
+ ptl_popTriggerQueue;
+ }
+
+ transition(B_M, L3Hit, B) {L3DataArrayWrite, L3TagArrayWrite} {
+ es_sendResponseES;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ dt_deallocateTBE;
+ ptl_popTriggerQueue;
+ }
+
+ // Each probe response is folded into the TBE; the last one schedules
+ // the ProbeAcksComplete trigger (see o_checkForCompletion).
+ transition({BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B_P, BP}, CPUPrbResp) {
+ y_writeProbeDataToTBE;
+ x_decrementAcks;
+ o_checkForCompletion;
+ pr_popResponseQueue;
+ }
+
+ // Probes finished before memory: "_PM" -> "_M" (memory pending only).
+ transition(BS_PM, ProbeAcksComplete, BS_M) {} {
+ sf_setForwardReqTime;
+ pt_popTriggerQueue;
+ }
+
+ transition(BM_PM, ProbeAcksComplete, BM_M) {} {
+ sf_setForwardReqTime;
+ pt_popTriggerQueue;
+ }
+
+ transition(B_PM, ProbeAcksComplete, B_M){} {
+ sf_setForwardReqTime;
+ pt_popTriggerQueue;
+ }
+
+ // Probes finished last: respond to the requestor and complete.
+ transition(BS_Pm, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} {
+ sf_setForwardReqTime;
+ s_sendResponseS;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ dt_deallocateTBE;
+ pt_popTriggerQueue;
+ }
+
+ transition(BM_Pm, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} {
+ sf_setForwardReqTime;
+ m_sendResponseM;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ dt_deallocateTBE;
+ pt_popTriggerQueue;
+ }
+
+ transition(B_Pm, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} {
+ sf_setForwardReqTime;
+ es_sendResponseES;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ dt_deallocateTBE;
+ pt_popTriggerQueue;
+ }
+
+ // Back-probe for a PF eviction done: retire the PF entry and wake both
+ // the victim's and the demand request's stalled messages.
+ transition(B_P, ProbeAcksComplete, U) {
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ we_wakeUpEvictionDependents;
+ dpf_deallocateProbeFilter;
+ dt_deallocateTBE;
+ pt_popTriggerQueue;
+ }
+
+ // CtoD probes complete: grant ownership. Access-permission set fixed —
+ // the original listed L3TagArrayWrite twice; alwt_allocateL3BlockOnWT
+ // also writes L3 data, so the second entry should be L3DataArrayWrite,
+ // matching the sibling ProbeAcksComplete transitions above.
+ transition(BP, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} {
+ sf_setForwardReqTime;
+ c_sendResponseCtoD;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ dt_deallocateTBE;
+ pt_popTriggerQueue;
+ }
+}
diff --git a/src/mem/protocol/MOESI_AMD_Base.slicc b/src/mem/protocol/MOESI_AMD_Base.slicc
new file mode 100644
index 000000000..b38145246
--- /dev/null
+++ b/src/mem/protocol/MOESI_AMD_Base.slicc
@@ -0,0 +1,6 @@
+protocol "MOESI_AMD_Base";
+include "RubySlicc_interfaces.slicc";
+include "MOESI_AMD_Base-msg.sm";
+include "MOESI_AMD_Base-CorePair.sm";
+include "MOESI_AMD_Base-L3cache.sm";
+include "MOESI_AMD_Base-dir.sm";
diff --git a/src/mem/protocol/RubySlicc_ComponentMapping.sm b/src/mem/protocol/RubySlicc_ComponentMapping.sm
index a72492b42..e1d7c4399 100644
--- a/src/mem/protocol/RubySlicc_ComponentMapping.sm
+++ b/src/mem/protocol/RubySlicc_ComponentMapping.sm
@@ -37,7 +37,10 @@ MachineID mapAddressToRange(Addr addr, MachineType type,
NetDest broadcast(MachineType type);
MachineID map_Address_to_DMA(Addr addr);
MachineID map_Address_to_Directory(Addr addr);
+MachineID map_Address_to_RegionDir(Addr addr);
NodeID map_Address_to_DirectoryNode(Addr addr);
+MachineID map_Address_to_TCCdir(Addr addr);
+NodeID map_Address_to_TCCdirNode(Addr addr);
NodeID machineIDToNodeID(MachineID machID);
NodeID machineIDToVersion(MachineID machID);
MachineType machineIDToMachineType(MachineID machID);
diff --git a/src/mem/protocol/RubySlicc_Exports.sm b/src/mem/protocol/RubySlicc_Exports.sm
index 5ee26d65c..c743ebe28 100644
--- a/src/mem/protocol/RubySlicc_Exports.sm
+++ b/src/mem/protocol/RubySlicc_Exports.sm
@@ -62,7 +62,7 @@ bool testAndWrite(Addr addr, DataBlock datablk, Packet *pkt);
// AccessPermission
// The following five states define the access permission of all memory blocks.
-// These permissions have multiple uses. They coordinate locking and
+// These permissions have multiple uses. They coordinate locking and
// synchronization primitives, as well as enable functional accesses.
// One should not need to add any additional permission values and it is very
// risky to do so.
@@ -73,7 +73,7 @@ enumeration(AccessPermission, desc="...", default="AccessPermission_NotPresent")
Read_Write, desc="block is Read/Write";
// Possibly Invalid data
- // The maybe stale permission indicates that accordingly to the protocol,
+ // The maybe stale permission indicates that accordingly to the protocol,
// there is no guarantee the block contains valid data. However, functional
// writes should update the block because a dataless PUT request may
// revalidate the block's data.
@@ -227,6 +227,13 @@ enumeration(MachineType, desc="...", default="MachineType_NULL") {
Collector, desc="Collector Mach";
L1Cache_wCC, desc="L1 Cache Mach to track cache-to-cache transfer (used for miss latency profile)";
L2Cache_wCC, desc="L2 Cache Mach to track cache-to-cache transfer (used for miss latency profile)";
+ CorePair, desc="Cache Mach (2 cores, Private L1Ds, Shared L1I & L2)";
+ TCP, desc="GPU L1 Data Cache (Texture Cache per Pipe)";
+ TCC, desc="GPU L2 Shared Cache (Texture Cache per Channel)";
+ TCCdir, desc="Directory at the GPU L2 Cache (TCC)";
+ SQC, desc="GPU L1 Instr Cache (Sequencer Cache)";
+ RegionDir, desc="Region-granular directory";
+ RegionBuffer,desc="Region buffer for CPU and GPU";
NULL, desc="null mach type";
}
diff --git a/src/mem/protocol/RubySlicc_Types.sm b/src/mem/protocol/RubySlicc_Types.sm
index a6c57e1b0..b8d284725 100644
--- a/src/mem/protocol/RubySlicc_Types.sm
+++ b/src/mem/protocol/RubySlicc_Types.sm
@@ -31,8 +31,8 @@
//
// **PLEASE NOTE!** When adding objects to this file you must also add a line
-// in the src/mem/ruby/SConscript file. Otherwise the external object's .hh
-// file will not be copied to the protocol directory and you will encounter a
+// in the src/mem/ruby/SConscript file. Otherwise the external object's .hh
+// file will not be copied to the protocol directory and you will encounter a
// undefined declaration error.
//
@@ -95,6 +95,8 @@ structure (NetDest, external = "yes", non_obj="yes") {
bool intersectionIsEmpty(Set);
bool intersectionIsEmpty(NetDest);
MachineID smallestElement(MachineType);
+ NetDest OR(NetDest);
+ NetDest AND(NetDest);
}
structure (Sequencer, external = "yes") {
@@ -117,6 +119,44 @@ structure (Sequencer, external = "yes") {
void invalidateSC(Addr);
}
+structure (GPUCoalescer, external = "yes") {
+ void readCallback(Addr, DataBlock);
+ void readCallback(Addr, MachineType, DataBlock);
+ void readCallback(Addr, MachineType, DataBlock,
+ Cycles, Cycles, Cycles);
+ void readCallback(Addr, MachineType, DataBlock,
+ Cycles, Cycles, Cycles, bool);
+ void writeCallback(Addr, DataBlock);
+ void writeCallback(Addr, MachineType, DataBlock);
+ void writeCallback(Addr, MachineType, DataBlock,
+ Cycles, Cycles, Cycles);
+ void writeCallback(Addr, MachineType, DataBlock,
+ Cycles, Cycles, Cycles, bool);
+ void checkCoherence(Addr);
+ void evictionCallback(Addr);
+ void recordCPReadCallBack(MachineID, MachineID);
+ void recordCPWriteCallBack(MachineID, MachineID);
+}
+
+structure (VIPERCoalescer, external = "yes") {
+ void readCallback(Addr, DataBlock);
+ void readCallback(Addr, MachineType, DataBlock);
+ void readCallback(Addr, MachineType, DataBlock,
+ Cycles, Cycles, Cycles);
+ void readCallback(Addr, MachineType, DataBlock,
+ Cycles, Cycles, Cycles, bool);
+ void writeCallback(Addr, DataBlock);
+ void writeCallback(Addr, MachineType, DataBlock);
+ void writeCallback(Addr, MachineType, DataBlock,
+ Cycles, Cycles, Cycles);
+ void writeCallback(Addr, MachineType, DataBlock,
+ Cycles, Cycles, Cycles, bool);
+ void invCallback(Addr);
+ void wbCallback(Addr);
+ void checkCoherence(Addr);
+ void evictionCallback(Addr);
+}
+
structure(RubyRequest, desc="...", interface="Message", external="yes") {
Addr LineAddress, desc="Line address for this request";
Addr PhysicalAddress, desc="Physical address for this request";
@@ -161,6 +201,7 @@ structure (CacheMemory, external = "yes") {
Cycles getTagLatency();
Cycles getDataLatency();
void setMRU(Addr);
+ void setMRU(Addr, int);
void setMRU(AbstractCacheEntry);
void recordRequestType(CacheRequestType, Addr);
bool checkResourceAvailable(CacheResourceType, Addr);
diff --git a/src/mem/protocol/SConsopts b/src/mem/protocol/SConsopts
index ca432a73e..47b36e276 100644
--- a/src/mem/protocol/SConsopts
+++ b/src/mem/protocol/SConsopts
@@ -33,6 +33,11 @@ import os
Import('*')
all_protocols.extend([
+ 'GPU_VIPER',
+ 'GPU_VIPER_Baseline',
+ 'GPU_VIPER_Region',
+ 'GPU_RfO',
+ 'MOESI_AMD_Base',
'MESI_Two_Level',
'MESI_Three_Level',
'MI_example',
diff --git a/src/mem/ruby/SConscript b/src/mem/ruby/SConscript
index 16e932432..82a16c9b0 100644
--- a/src/mem/ruby/SConscript
+++ b/src/mem/ruby/SConscript
@@ -124,13 +124,20 @@ MakeInclude('common/Set.hh')
MakeInclude('common/WriteMask.hh')
MakeInclude('filters/AbstractBloomFilter.hh')
MakeInclude('network/MessageBuffer.hh')
-MakeInclude('structures/Prefetcher.hh')
MakeInclude('structures/CacheMemory.hh')
-MakeInclude('system/DMASequencer.hh')
MakeInclude('structures/DirectoryMemory.hh')
-MakeInclude('structures/WireBuffer.hh')
MakeInclude('structures/PerfectCacheMemory.hh')
MakeInclude('structures/PersistentTable.hh')
-MakeInclude('system/Sequencer.hh')
+MakeInclude('structures/Prefetcher.hh')
MakeInclude('structures/TBETable.hh')
MakeInclude('structures/TimerTable.hh')
+MakeInclude('structures/WireBuffer.hh')
+MakeInclude('system/DMASequencer.hh')
+MakeInclude('system/Sequencer.hh')
+
+# External types : Group "mem/protocol" : include "header.hh" to the bottom
+# of this MakeIncludes if it is referenced as
+# <# include "mem/protocol/header.hh"> in any file
+# generated_dir = Dir('../protocol')
+MakeInclude('system/GPUCoalescer.hh')
+MakeInclude('system/VIPERCoalescer.hh')
diff --git a/src/mem/ruby/profiler/Profiler.cc b/src/mem/ruby/profiler/Profiler.cc
index b3b37e5a6..7d3f20982 100644
--- a/src/mem/ruby/profiler/Profiler.cc
+++ b/src/mem/ruby/profiler/Profiler.cc
@@ -269,7 +269,7 @@ Profiler::collateStats()
it != m_ruby_system->m_abstract_controls[i].end(); ++it) {
AbstractController *ctr = (*it).second;
- Sequencer *seq = ctr->getSequencer();
+ Sequencer *seq = ctr->getCPUSequencer();
if (seq != NULL) {
m_outstandReqHist.add(seq->getOutstandReqHist());
}
@@ -282,7 +282,7 @@ Profiler::collateStats()
it != m_ruby_system->m_abstract_controls[i].end(); ++it) {
AbstractController *ctr = (*it).second;
- Sequencer *seq = ctr->getSequencer();
+ Sequencer *seq = ctr->getCPUSequencer();
if (seq != NULL) {
// add all the latencies
m_latencyHist.add(seq->getLatencyHist());
diff --git a/src/mem/ruby/slicc_interface/AbstractCacheEntry.hh b/src/mem/ruby/slicc_interface/AbstractCacheEntry.hh
index 926556781..cbd068c04 100644
--- a/src/mem/ruby/slicc_interface/AbstractCacheEntry.hh
+++ b/src/mem/ruby/slicc_interface/AbstractCacheEntry.hh
@@ -56,6 +56,12 @@ class AbstractCacheEntry : public AbstractEntry
virtual DataBlock& getDataBlk()
{ panic("getDataBlk() not implemented!"); }
+ int validBlocks;
+ virtual int& getNumValidBlocks()
+ {
+ return validBlocks;
+ }
+
// Functions for locking and unlocking the cache entry. These are required
// for supporting atomic memory accesses.
void setLocked(int context);
diff --git a/src/mem/ruby/slicc_interface/AbstractController.cc b/src/mem/ruby/slicc_interface/AbstractController.cc
index 93fe50c88..458fde5bc 100644
--- a/src/mem/ruby/slicc_interface/AbstractController.cc
+++ b/src/mem/ruby/slicc_interface/AbstractController.cc
@@ -200,6 +200,12 @@ AbstractController::unblock(Addr addr)
}
}
+bool
+AbstractController::isBlocked(Addr addr)
+{
+ return (m_block_map.count(addr) > 0);
+}
+
BaseMasterPort &
AbstractController::getMasterPort(const std::string &if_name,
PortID idx)
diff --git a/src/mem/ruby/slicc_interface/AbstractController.hh b/src/mem/ruby/slicc_interface/AbstractController.hh
index 383507eed..4488ee3f4 100644
--- a/src/mem/ruby/slicc_interface/AbstractController.hh
+++ b/src/mem/ruby/slicc_interface/AbstractController.hh
@@ -73,6 +73,7 @@ class AbstractController : public MemObject, public Consumer
// return instance name
void blockOnQueue(Addr, MessageBuffer*);
void unblock(Addr);
+ bool isBlocked(Addr);
virtual MessageBuffer* getMandatoryQueue() const = 0;
virtual MessageBuffer* getMemoryQueue() const = 0;
@@ -84,7 +85,7 @@ class AbstractController : public MemObject, public Consumer
virtual void regStats();
virtual void recordCacheTrace(int cntrl, CacheRecorder* tr) = 0;
- virtual Sequencer* getSequencer() const = 0;
+ virtual Sequencer* getCPUSequencer() const = 0;
//! These functions are used by ruby system to read/write the data blocks
//! that exist with in the controller.
diff --git a/src/mem/ruby/slicc_interface/RubySlicc_ComponentMapping.hh b/src/mem/ruby/slicc_interface/RubySlicc_ComponentMapping.hh
index 46071335e..cdedc2e14 100644
--- a/src/mem/ruby/slicc_interface/RubySlicc_ComponentMapping.hh
+++ b/src/mem/ruby/slicc_interface/RubySlicc_ComponentMapping.hh
@@ -43,6 +43,12 @@ map_Address_to_DirectoryNode(Addr addr)
return DirectoryMemory::mapAddressToDirectoryVersion(addr);
}
+inline NodeID
+map_Address_to_TCCdirNode(Addr addr)
+{
+ return DirectoryMemory::mapAddressToDirectoryVersion(addr);
+}
+
// used to determine the home directory
// returns a value between 0 and total_directories_within_the_system
inline MachineID
@@ -53,6 +59,22 @@ map_Address_to_Directory(Addr addr)
return mach;
}
+inline MachineID
+map_Address_to_RegionDir(Addr addr)
+{
+ MachineID mach = {MachineType_RegionDir,
+ map_Address_to_DirectoryNode(addr)};
+ return mach;
+}
+
+inline MachineID
+map_Address_to_TCCdir(Addr addr)
+{
+ MachineID mach =
+ {MachineType_TCCdir, map_Address_to_TCCdirNode(addr)};
+ return mach;
+}
+
inline NetDest
broadcast(MachineType type)
{
@@ -102,4 +124,11 @@ createMachineID(MachineType type, NodeID id)
return mach;
}
+inline MachineID
+MachineTypeAndNodeIDToMachineID(MachineType type, NodeID node)
+{
+ MachineID mach = {type, node};
+ return mach;
+}
+
#endif // __MEM_RUBY_SLICC_INTERFACE_COMPONENTMAPPINGS_HH__
diff --git a/src/mem/ruby/structures/CacheMemory.cc b/src/mem/ruby/structures/CacheMemory.cc
index a8a3ba949..45fb85d05 100644
--- a/src/mem/ruby/structures/CacheMemory.cc
+++ b/src/mem/ruby/structures/CacheMemory.cc
@@ -35,6 +35,7 @@
#include "mem/protocol/AccessPermission.hh"
#include "mem/ruby/structures/CacheMemory.hh"
#include "mem/ruby/system/RubySystem.hh"
+#include "mem/ruby/system/WeightedLRUPolicy.hh"
using namespace std;
@@ -66,29 +67,27 @@ CacheMemory::CacheMemory(const Params *p)
m_start_index_bit = p->start_index_bit;
m_is_instruction_only_cache = p->is_icache;
m_resource_stalls = p->resourceStalls;
+ m_block_size = p->block_size; // may be 0 at this point. Updated in init()
}
void
CacheMemory::init()
{
- m_cache_num_sets = (m_cache_size / m_cache_assoc) /
- RubySystem::getBlockSizeBytes();
+ if (m_block_size == 0) {
+ m_block_size = RubySystem::getBlockSizeBytes();
+ }
+ m_cache_num_sets = (m_cache_size / m_cache_assoc) / m_block_size;
assert(m_cache_num_sets > 1);
m_cache_num_set_bits = floorLog2(m_cache_num_sets);
assert(m_cache_num_set_bits > 0);
- m_cache.resize(m_cache_num_sets);
- for (int i = 0; i < m_cache_num_sets; i++) {
- m_cache[i].resize(m_cache_assoc);
- for (int j = 0; j < m_cache_assoc; j++) {
- m_cache[i][j] = NULL;
- }
- }
+ m_cache.resize(m_cache_num_sets,
+ std::vector<AbstractCacheEntry*>(m_cache_assoc, nullptr));
}
CacheMemory::~CacheMemory()
{
- if (m_replacementPolicy_ptr != NULL)
+ if (m_replacementPolicy_ptr)
delete m_replacementPolicy_ptr;
for (int i = 0; i < m_cache_num_sets; i++) {
for (int j = 0; j < m_cache_assoc; j++) {
@@ -359,6 +358,37 @@ CacheMemory::setMRU(const AbstractCacheEntry *e)
}
void
+CacheMemory::setMRU(Addr address, int occupancy)
+{
+ int64_t cacheSet = addressToCacheSet(address);
+ int loc = findTagInSet(cacheSet, address);
+
+ if(loc != -1) {
+ if (m_replacementPolicy_ptr->useOccupancy()) {
+ (static_cast<WeightedLRUPolicy*>(m_replacementPolicy_ptr))->
+ touch(cacheSet, loc, curTick(), occupancy);
+ } else {
+ m_replacementPolicy_ptr->
+ touch(cacheSet, loc, curTick());
+ }
+ }
+}
+
+int
+CacheMemory::getReplacementWeight(int64_t set, int64_t loc)
+{
+ assert(set < m_cache_num_sets);
+ assert(loc < m_cache_assoc);
+ int ret = 0;
+ if(m_cache[set][loc] != NULL) {
+ ret = m_cache[set][loc]->getNumValidBlocks();
+ assert(ret >= 0);
+ }
+
+ return ret;
+}
+
+void
CacheMemory::recordCacheContents(int cntrl, CacheRecorder* tr) const
{
uint64_t warmedUpBlocks = 0;
diff --git a/src/mem/ruby/structures/CacheMemory.hh b/src/mem/ruby/structures/CacheMemory.hh
index 72805b32b..5b30505d3 100644
--- a/src/mem/ruby/structures/CacheMemory.hh
+++ b/src/mem/ruby/structures/CacheMemory.hh
@@ -106,7 +106,8 @@ class CacheMemory : public SimObject
// Set this address to most recently used
void setMRU(Addr address);
- // Set this entry to most recently used
+ void setMRU(Addr addr, int occupancy);
+ int getReplacementWeight(int64_t set, int64_t loc);
void setMRU(const AbstractCacheEntry *e);
// Functions for locking and unlocking cache lines corresponding to the
@@ -146,6 +147,7 @@ class CacheMemory : public SimObject
Stats::Scalar numDataArrayStalls;
int getCacheSize() const { return m_cache_size; }
+ int getCacheAssoc() const { return m_cache_assoc; }
int getNumBlocks() const { return m_cache_num_sets * m_cache_assoc; }
Addr getAddressAtIdx(int idx) const;
@@ -182,6 +184,7 @@ class CacheMemory : public SimObject
int m_cache_assoc;
int m_start_index_bit;
bool m_resource_stalls;
+ int m_block_size;
};
std::ostream& operator<<(std::ostream& out, const CacheMemory& obj);
diff --git a/src/mem/ruby/structures/RubyCache.py b/src/mem/ruby/structures/RubyCache.py
index 4eb87ac74..9fc4726b0 100644
--- a/src/mem/ruby/structures/RubyCache.py
+++ b/src/mem/ruby/structures/RubyCache.py
@@ -42,6 +42,7 @@ class RubyCache(SimObject):
"")
start_index_bit = Param.Int(6, "index start, default 6 for 64-byte line");
is_icache = Param.Bool(False, "is instruction only cache");
+ block_size = Param.MemorySize("0B", "block size in bytes. 0 means default RubyBlockSize")
dataArrayBanks = Param.Int(1, "Number of banks for the data array")
tagArrayBanks = Param.Int(1, "Number of banks for the tag array")
diff --git a/src/mem/ruby/system/GPUCoalescer.cc b/src/mem/ruby/system/GPUCoalescer.cc
new file mode 100644
index 000000000..db279bd3a
--- /dev/null
+++ b/src/mem/ruby/system/GPUCoalescer.cc
@@ -0,0 +1,1397 @@
+/*
+ * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#include "base/misc.hh"
+#include "base/str.hh"
+#include "config/the_isa.hh"
+
+#if THE_ISA == X86_ISA
+#include "arch/x86/insts/microldstop.hh"
+
+#endif // X86_ISA
+#include "mem/ruby/system/GPUCoalescer.hh"
+
+#include "cpu/testers/rubytest/RubyTester.hh"
+#include "debug/GPUCoalescer.hh"
+#include "debug/MemoryAccess.hh"
+#include "debug/ProtocolTrace.hh"
+#include "debug/RubyPort.hh"
+#include "debug/RubyStats.hh"
+#include "gpu-compute/shader.hh"
+#include "mem/packet.hh"
+#include "mem/ruby/common/DataBlock.hh"
+#include "mem/ruby/common/SubBlock.hh"
+#include "mem/ruby/network/MessageBuffer.hh"
+#include "mem/ruby/profiler/Profiler.hh"
+#include "mem/ruby/slicc_interface/AbstractController.hh"
+#include "mem/ruby/slicc_interface/RubyRequest.hh"
+#include "mem/ruby/structures/CacheMemory.hh"
+#include "mem/ruby/system/RubySystem.hh"
+#include "params/RubyGPUCoalescer.hh"
+
+using namespace std;
+
+GPUCoalescer *
+RubyGPUCoalescerParams::create()
+{
+ return new GPUCoalescer(this);
+}
+
+HSAScope
+reqScopeToHSAScope(Request* req)
+{
+ HSAScope accessScope = HSAScope_UNSPECIFIED;
+ if (req->isScoped()) {
+ if (req->isWavefrontScope()) {
+ accessScope = HSAScope_WAVEFRONT;
+ } else if (req->isWorkgroupScope()) {
+ accessScope = HSAScope_WORKGROUP;
+ } else if (req->isDeviceScope()) {
+ accessScope = HSAScope_DEVICE;
+ } else if (req->isSystemScope()) {
+ accessScope = HSAScope_SYSTEM;
+ } else {
+ fatal("Bad scope type");
+ }
+ }
+ return accessScope;
+}
+
+HSASegment
+reqSegmentToHSASegment(Request* req)
+{
+ HSASegment accessSegment = HSASegment_GLOBAL;
+
+ if (req->isGlobalSegment()) {
+ accessSegment = HSASegment_GLOBAL;
+ } else if (req->isGroupSegment()) {
+ accessSegment = HSASegment_GROUP;
+ } else if (req->isPrivateSegment()) {
+ accessSegment = HSASegment_PRIVATE;
+ } else if (req->isKernargSegment()) {
+ accessSegment = HSASegment_KERNARG;
+ } else if (req->isReadonlySegment()) {
+ accessSegment = HSASegment_READONLY;
+ } else if (req->isSpillSegment()) {
+ accessSegment = HSASegment_SPILL;
+ } else if (req->isArgSegment()) {
+ accessSegment = HSASegment_ARG;
+ } else {
+ fatal("Bad segment type");
+ }
+
+ return accessSegment;
+}
+
+GPUCoalescer::GPUCoalescer(const Params *p)
+ : RubyPort(p), issueEvent(this), deadlockCheckEvent(this)
+{
+ m_store_waiting_on_load_cycles = 0;
+ m_store_waiting_on_store_cycles = 0;
+ m_load_waiting_on_store_cycles = 0;
+ m_load_waiting_on_load_cycles = 0;
+
+ m_outstanding_count = 0;
+
+ m_max_outstanding_requests = 0;
+ m_deadlock_threshold = 0;
+ m_instCache_ptr = nullptr;
+ m_dataCache_ptr = nullptr;
+
+ m_instCache_ptr = p->icache;
+ m_dataCache_ptr = p->dcache;
+ m_max_outstanding_requests = p->max_outstanding_requests;
+ m_deadlock_threshold = p->deadlock_threshold;
+
+ assert(m_max_outstanding_requests > 0);
+ assert(m_deadlock_threshold > 0);
+ assert(m_instCache_ptr);
+ assert(m_dataCache_ptr);
+
+ m_data_cache_hit_latency = p->dcache_hit_latency;
+
+ m_usingNetworkTester = p->using_network_tester;
+ assumingRfOCoherence = p->assume_rfo;
+}
+
+GPUCoalescer::~GPUCoalescer()
+{
+}
+
+void
+GPUCoalescer::wakeup()
+{
+ // Check for deadlock of any of the requests
+ Cycles current_time = curCycle();
+
+ // Check across all outstanding requests
+ int total_outstanding = 0;
+
+ RequestTable::iterator read = m_readRequestTable.begin();
+ RequestTable::iterator read_end = m_readRequestTable.end();
+ for (; read != read_end; ++read) {
+ GPUCoalescerRequest* request = read->second;
+ if (current_time - request->issue_time < m_deadlock_threshold)
+ continue;
+
+ panic("Possible Deadlock detected. Aborting!\n"
+ "version: %d request.paddr: 0x%x m_readRequestTable: %d "
+ "current time: %u issue_time: %d difference: %d\n", m_version,
+ request->pkt->getAddr(), m_readRequestTable.size(),
+ current_time * clockPeriod(), request->issue_time * clockPeriod(),
+ (current_time - request->issue_time)*clockPeriod());
+ }
+
+ RequestTable::iterator write = m_writeRequestTable.begin();
+ RequestTable::iterator write_end = m_writeRequestTable.end();
+ for (; write != write_end; ++write) {
+ GPUCoalescerRequest* request = write->second;
+ if (current_time - request->issue_time < m_deadlock_threshold)
+ continue;
+
+ panic("Possible Deadlock detected. Aborting!\n"
+ "version: %d request.paddr: 0x%x m_writeRequestTable: %d "
+ "current time: %u issue_time: %d difference: %d\n", m_version,
+ request->pkt->getAddr(), m_writeRequestTable.size(),
+ current_time * clockPeriod(), request->issue_time * clockPeriod(),
+ (current_time - request->issue_time) * clockPeriod());
+ }
+
+ total_outstanding += m_writeRequestTable.size();
+ total_outstanding += m_readRequestTable.size();
+
+ assert(m_outstanding_count == total_outstanding);
+
+ if (m_outstanding_count > 0) {
+ // If there are still outstanding requests, keep checking
+ schedule(deadlockCheckEvent,
+ m_deadlock_threshold * clockPeriod() +
+ curTick());
+ }
+}
+
+void
+GPUCoalescer::resetStats()
+{
+ m_latencyHist.reset();
+ m_missLatencyHist.reset();
+ for (int i = 0; i < RubyRequestType_NUM; i++) {
+ m_typeLatencyHist[i]->reset();
+ m_missTypeLatencyHist[i]->reset();
+ for (int j = 0; j < MachineType_NUM; j++) {
+ m_missTypeMachLatencyHist[i][j]->reset();
+ }
+ }
+
+ for (int i = 0; i < MachineType_NUM; i++) {
+ m_missMachLatencyHist[i]->reset();
+
+ m_IssueToInitialDelayHist[i]->reset();
+ m_InitialToForwardDelayHist[i]->reset();
+ m_ForwardToFirstResponseDelayHist[i]->reset();
+ m_FirstResponseToCompletionDelayHist[i]->reset();
+ }
+}
+
+void
+GPUCoalescer::printProgress(ostream& out) const
+{
+}
+
+RequestStatus
+GPUCoalescer::getRequestStatus(PacketPtr pkt, RubyRequestType request_type)
+{
+ Addr line_addr = makeLineAddress(pkt->getAddr());
+
+ if (!m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge())) {
+ return RequestStatus_BufferFull;
+ }
+
+ if(m_controller->isBlocked(line_addr) &&
+ request_type != RubyRequestType_Locked_RMW_Write) {
+ return RequestStatus_Aliased;
+ }
+
+ if ((request_type == RubyRequestType_ST) ||
+ (request_type == RubyRequestType_ATOMIC) ||
+ (request_type == RubyRequestType_ATOMIC_RETURN) ||
+ (request_type == RubyRequestType_ATOMIC_NO_RETURN) ||
+ (request_type == RubyRequestType_RMW_Read) ||
+ (request_type == RubyRequestType_RMW_Write) ||
+ (request_type == RubyRequestType_Load_Linked) ||
+ (request_type == RubyRequestType_Store_Conditional) ||
+ (request_type == RubyRequestType_Locked_RMW_Read) ||
+ (request_type == RubyRequestType_Locked_RMW_Write) ||
+ (request_type == RubyRequestType_FLUSH)) {
+
+ // Check if there is any outstanding read request for the same
+ // cache line.
+ if (m_readRequestTable.count(line_addr) > 0) {
+ m_store_waiting_on_load_cycles++;
+ return RequestStatus_Aliased;
+ }
+
+ if (m_writeRequestTable.count(line_addr) > 0) {
+ // There is an outstanding write request for the cache line
+ m_store_waiting_on_store_cycles++;
+ return RequestStatus_Aliased;
+ }
+ } else {
+ // Check if there is any outstanding write request for the same
+ // cache line.
+ if (m_writeRequestTable.count(line_addr) > 0) {
+ m_load_waiting_on_store_cycles++;
+ return RequestStatus_Aliased;
+ }
+
+ if (m_readRequestTable.count(line_addr) > 0) {
+ // There is an outstanding read request for the cache line
+ m_load_waiting_on_load_cycles++;
+ return RequestStatus_Aliased;
+ }
+ }
+
+ return RequestStatus_Ready;
+
+}
+
+
+
+// sets the kernelEndList
+void
+GPUCoalescer::insertKernel(int wavefront_id, PacketPtr pkt)
+{
+ // Don't know if this will happen or is possible
+ // but I just want to be careful and not have it become
+ // simulator hang in the future
+ DPRINTF(GPUCoalescer, "inserting wf: %d to kernelEndlist\n", wavefront_id);
+ assert(kernelEndList.count(wavefront_id) == 0);
+
+ kernelEndList[wavefront_id] = pkt;
+ DPRINTF(GPUCoalescer, "kernelEndList->size() = %d\n",
+ kernelEndList.size());
+}
+
+
+// Insert the request on the correct request table. Return true if
+// the entry was already present.
+bool
+GPUCoalescer::insertRequest(PacketPtr pkt, RubyRequestType request_type)
+{
+ assert(getRequestStatus(pkt, request_type) == RequestStatus_Ready ||
+ pkt->req->isLockedRMW() ||
+ !m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge()));
+
+ int total_outstanding M5_VAR_USED =
+ m_writeRequestTable.size() + m_readRequestTable.size();
+
+ assert(m_outstanding_count == total_outstanding);
+
+ // See if we should schedule a deadlock check
+ if (deadlockCheckEvent.scheduled() == false) {
+ schedule(deadlockCheckEvent, m_deadlock_threshold + curTick());
+ }
+
+ Addr line_addr = makeLineAddress(pkt->getAddr());
+ if ((request_type == RubyRequestType_ST) ||
+ (request_type == RubyRequestType_ATOMIC) ||
+ (request_type == RubyRequestType_ATOMIC_RETURN) ||
+ (request_type == RubyRequestType_ATOMIC_NO_RETURN) ||
+ (request_type == RubyRequestType_RMW_Read) ||
+ (request_type == RubyRequestType_RMW_Write) ||
+ (request_type == RubyRequestType_Load_Linked) ||
+ (request_type == RubyRequestType_Store_Conditional) ||
+ (request_type == RubyRequestType_Locked_RMW_Read) ||
+ (request_type == RubyRequestType_Locked_RMW_Write) ||
+ (request_type == RubyRequestType_FLUSH)) {
+
+ pair<RequestTable::iterator, bool> r =
+ m_writeRequestTable.insert(RequestTable::value_type(line_addr,
+ (GPUCoalescerRequest*) NULL));
+ if (r.second) {
+ RequestTable::iterator i = r.first;
+ i->second = new GPUCoalescerRequest(pkt, request_type,
+ curCycle());
+ DPRINTF(GPUCoalescer,
+ "Inserting write request for paddr %#x for type %d\n",
+ pkt->req->getPaddr(), i->second->m_type);
+ m_outstanding_count++;
+ } else {
+ return true;
+ }
+ } else {
+ pair<RequestTable::iterator, bool> r =
+ m_readRequestTable.insert(RequestTable::value_type(line_addr,
+ (GPUCoalescerRequest*) NULL));
+
+ if (r.second) {
+ RequestTable::iterator i = r.first;
+ i->second = new GPUCoalescerRequest(pkt, request_type,
+ curCycle());
+ DPRINTF(GPUCoalescer,
+ "Inserting read request for paddr %#x for type %d\n",
+ pkt->req->getPaddr(), i->second->m_type);
+ m_outstanding_count++;
+ } else {
+ return true;
+ }
+ }
+
+ m_outstandReqHist.sample(m_outstanding_count);
+
+ total_outstanding = m_writeRequestTable.size() + m_readRequestTable.size();
+ assert(m_outstanding_count == total_outstanding);
+
+ return false;
+}
+
+void
+GPUCoalescer::markRemoved()
+{
+ m_outstanding_count--;
+ assert(m_outstanding_count ==
+ m_writeRequestTable.size() + m_readRequestTable.size());
+}
+
+void
+GPUCoalescer::removeRequest(GPUCoalescerRequest* srequest)
+{
+ assert(m_outstanding_count ==
+ m_writeRequestTable.size() + m_readRequestTable.size());
+
+ Addr line_addr = makeLineAddress(srequest->pkt->getAddr());
+ if ((srequest->m_type == RubyRequestType_ST) ||
+ (srequest->m_type == RubyRequestType_RMW_Read) ||
+ (srequest->m_type == RubyRequestType_RMW_Write) ||
+ (srequest->m_type == RubyRequestType_Load_Linked) ||
+ (srequest->m_type == RubyRequestType_Store_Conditional) ||
+ (srequest->m_type == RubyRequestType_Locked_RMW_Read) ||
+ (srequest->m_type == RubyRequestType_Locked_RMW_Write)) {
+ m_writeRequestTable.erase(line_addr);
+ } else {
+ m_readRequestTable.erase(line_addr);
+ }
+
+ markRemoved();
+}
+
+bool
+GPUCoalescer::handleLlsc(Addr address, GPUCoalescerRequest* request)
+{
+ //
+ // The success flag indicates whether the LLSC operation was successful.
+ // LL ops will always succeed, but SC may fail if the cache line is no
+ // longer locked.
+ //
+ bool success = true;
+ if (request->m_type == RubyRequestType_Store_Conditional) {
+ if (!m_dataCache_ptr->isLocked(address, m_version)) {
+ //
+ // For failed SC requests, indicate the failure to the cpu by
+ // setting the extra data to zero.
+ //
+ request->pkt->req->setExtraData(0);
+ success = false;
+ } else {
+ //
+ // For successful SC requests, indicate the success to the cpu by
+ // setting the extra data to one.
+ //
+ request->pkt->req->setExtraData(1);
+ }
+ //
+ // Independent of success, all SC operations must clear the lock
+ //
+ m_dataCache_ptr->clearLocked(address);
+ } else if (request->m_type == RubyRequestType_Load_Linked) {
+ //
+ // Note: To fully follow Alpha LLSC semantics, should the LL clear any
+ // previously locked cache lines?
+ //
+ m_dataCache_ptr->setLocked(address, m_version);
+ } else if ((m_dataCache_ptr->isTagPresent(address)) &&
+ (m_dataCache_ptr->isLocked(address, m_version))) {
+ //
+ // Normal writes should clear the locked address
+ //
+ m_dataCache_ptr->clearLocked(address);
+ }
+ return success;
+}
+
+void
+GPUCoalescer::writeCallback(Addr address, DataBlock& data)
+{
+ writeCallback(address, MachineType_NULL, data);
+}
+
+void
+GPUCoalescer::writeCallback(Addr address,
+ MachineType mach,
+ DataBlock& data)
+{
+ writeCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
+}
+
+void
+GPUCoalescer::writeCallback(Addr address,
+ MachineType mach,
+ DataBlock& data,
+ Cycles initialRequestTime,
+ Cycles forwardRequestTime,
+ Cycles firstResponseTime)
+{
+ writeCallback(address, mach, data,
+ initialRequestTime, forwardRequestTime, firstResponseTime,
+ false);
+}
+
+void
+GPUCoalescer::writeCallback(Addr address,
+ MachineType mach,
+ DataBlock& data,
+ Cycles initialRequestTime,
+ Cycles forwardRequestTime,
+ Cycles firstResponseTime,
+ bool isRegion)
+{
+ assert(address == makeLineAddress(address));
+
+ DPRINTF(GPUCoalescer, "write callback for address %#x\n", address);
+ assert(m_writeRequestTable.count(makeLineAddress(address)));
+
+ RequestTable::iterator i = m_writeRequestTable.find(address);
+ assert(i != m_writeRequestTable.end());
+ GPUCoalescerRequest* request = i->second;
+
+ m_writeRequestTable.erase(i);
+ markRemoved();
+
+ assert((request->m_type == RubyRequestType_ST) ||
+ (request->m_type == RubyRequestType_ATOMIC) ||
+ (request->m_type == RubyRequestType_ATOMIC_RETURN) ||
+ (request->m_type == RubyRequestType_ATOMIC_NO_RETURN) ||
+ (request->m_type == RubyRequestType_RMW_Read) ||
+ (request->m_type == RubyRequestType_RMW_Write) ||
+ (request->m_type == RubyRequestType_Load_Linked) ||
+ (request->m_type == RubyRequestType_Store_Conditional) ||
+ (request->m_type == RubyRequestType_Locked_RMW_Read) ||
+ (request->m_type == RubyRequestType_Locked_RMW_Write) ||
+ (request->m_type == RubyRequestType_FLUSH));
+
+
+ //
+ // For Alpha, properly handle LL, SC, and write requests with respect to
+ // locked cache blocks.
+ //
+ // Not valid for Network_test protocol
+ //
+ bool success = true;
+ if(!m_usingNetworkTester)
+ success = handleLlsc(address, request);
+
+ if (request->m_type == RubyRequestType_Locked_RMW_Read) {
+ m_controller->blockOnQueue(address, m_mandatory_q_ptr);
+ } else if (request->m_type == RubyRequestType_Locked_RMW_Write) {
+ m_controller->unblock(address);
+ }
+
+ hitCallback(request, mach, data, success,
+ request->issue_time, forwardRequestTime, firstResponseTime,
+ isRegion);
+}
+
+void
+GPUCoalescer::readCallback(Addr address, DataBlock& data)
+{
+ readCallback(address, MachineType_NULL, data);
+}
+
+void
+GPUCoalescer::readCallback(Addr address,
+ MachineType mach,
+ DataBlock& data)
+{
+ readCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
+}
+
+void
+GPUCoalescer::readCallback(Addr address,
+ MachineType mach,
+ DataBlock& data,
+ Cycles initialRequestTime,
+ Cycles forwardRequestTime,
+ Cycles firstResponseTime)
+{
+
+ readCallback(address, mach, data,
+ initialRequestTime, forwardRequestTime, firstResponseTime,
+ false);
+}
+
+void
+GPUCoalescer::readCallback(Addr address,
+ MachineType mach,
+ DataBlock& data,
+ Cycles initialRequestTime,
+ Cycles forwardRequestTime,
+ Cycles firstResponseTime,
+ bool isRegion)
+{
+ assert(address == makeLineAddress(address));
+ assert(m_readRequestTable.count(makeLineAddress(address)));
+
+ DPRINTF(GPUCoalescer, "read callback for address %#x\n", address);
+ RequestTable::iterator i = m_readRequestTable.find(address);
+ assert(i != m_readRequestTable.end());
+ GPUCoalescerRequest* request = i->second;
+
+ m_readRequestTable.erase(i);
+ markRemoved();
+
+ assert((request->m_type == RubyRequestType_LD) ||
+ (request->m_type == RubyRequestType_IFETCH));
+
+ hitCallback(request, mach, data, true,
+ request->issue_time, forwardRequestTime, firstResponseTime,
+ isRegion);
+}
+
+void
+GPUCoalescer::hitCallback(GPUCoalescerRequest* srequest,
+ MachineType mach,
+ DataBlock& data,
+ bool success,
+ Cycles initialRequestTime,
+ Cycles forwardRequestTime,
+ Cycles firstResponseTime,
+ bool isRegion)
+{
+ PacketPtr pkt = srequest->pkt;
+ Addr request_address = pkt->getAddr();
+ Addr request_line_address = makeLineAddress(request_address);
+
+ RubyRequestType type = srequest->m_type;
+
+ // Set this cache entry to the most recently used
+ if (type == RubyRequestType_IFETCH) {
+ if (m_instCache_ptr->isTagPresent(request_line_address))
+ m_instCache_ptr->setMRU(request_line_address);
+ } else {
+ if (m_dataCache_ptr->isTagPresent(request_line_address))
+ m_dataCache_ptr->setMRU(request_line_address);
+ }
+
+ recordMissLatency(srequest, mach,
+ initialRequestTime,
+ forwardRequestTime,
+ firstResponseTime,
+ success, isRegion);
+ // update the data
+ //
+ // MUST DO THIS FOR EACH REQUEST IN THE COALESCER
+ int len = reqCoalescer[request_line_address].size();
+ std::vector<PacketPtr> mylist;
+ for (int i = 0; i < len; ++i) {
+ PacketPtr pkt = reqCoalescer[request_line_address][i].first;
+ assert(type ==
+ reqCoalescer[request_line_address][i].second[PrimaryType]);
+ request_address = pkt->getAddr();
+ request_line_address = makeLineAddress(pkt->getAddr());
+ if (pkt->getPtr<uint8_t>()) {
+ if ((type == RubyRequestType_LD) ||
+ (type == RubyRequestType_ATOMIC) ||
+ (type == RubyRequestType_ATOMIC_RETURN) ||
+ (type == RubyRequestType_IFETCH) ||
+ (type == RubyRequestType_RMW_Read) ||
+ (type == RubyRequestType_Locked_RMW_Read) ||
+ (type == RubyRequestType_Load_Linked)) {
+ memcpy(pkt->getPtr<uint8_t>(),
+ data.getData(getOffset(request_address),
+ pkt->getSize()),
+ pkt->getSize());
+ } else {
+ data.setData(pkt->getPtr<uint8_t>(),
+ getOffset(request_address), pkt->getSize());
+ }
+ } else {
+ DPRINTF(MemoryAccess,
+ "WARNING. Data not transfered from Ruby to M5 for type " \
+ "%s\n",
+ RubyRequestType_to_string(type));
+ }
+
+ // If using the RubyTester, update the RubyTester sender state's
+ // subBlock with the received data. The tester will later access
+ // this state.
+ // Note: RubyPort will access its sender state before the
+ // RubyTester.
+ if (m_usingRubyTester) {
+ RubyPort::SenderState *requestSenderState =
+ safe_cast<RubyPort::SenderState*>(pkt->senderState);
+ RubyTester::SenderState* testerSenderState =
+ safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor);
+ testerSenderState->subBlock.mergeFrom(data);
+ }
+
+ mylist.push_back(pkt);
+ }
+ delete srequest;
+ reqCoalescer.erase(request_line_address);
+ assert(!reqCoalescer.count(request_line_address));
+
+
+
+ completeHitCallback(mylist, len);
+}
+
+bool
+GPUCoalescer::empty() const
+{
+ return m_writeRequestTable.empty() && m_readRequestTable.empty();
+}
+
+// Analyzes the packet to see if this request can be coalesced.
+// If request can be coalesced, this request is added to the reqCoalescer table
+// and makeRequest returns RequestStatus_Issued;
+// If this is the first request to a cacheline, request is added to both
+// newRequests queue and to the reqCoalescer table; makeRequest
+// returns RequestStatus_Issued.
+// If there is a pending request to this cacheline and this request
+// can't be coalesced, RequestStatus_Aliased is returned and
+// the packet needs to be reissued.
+RequestStatus
+GPUCoalescer::makeRequest(PacketPtr pkt)
+{
+ // Check for GPU Barrier Kernel End or Kernel Begin
+ // Leave these to be handled by the child class
+ // Kernel End/Barrier = isFlush + isRelease
+ // Kernel Begin = isFlush + isAcquire
+ if (pkt->req->isKernel()) {
+ if (pkt->req->isAcquire()){
+ // This is a Kernel Begin leave handling to
+ // virtual xCoalescer::makeRequest
+ return RequestStatus_Issued;
+ }else if(pkt->req->isRelease()) {
+ // This is a Kernel End leave handling to
+ // virtual xCoalescer::makeRequest
+ // If we are here then we didn't call
+ // a virtual version of this function
+ // so we will also schedule the callback
+ int wf_id = 0;
+ if (pkt->req->hasContextId()) {
+ wf_id = pkt->req->contextId();
+ }
+ insertKernel(wf_id, pkt);
+ newKernelEnds.push_back(wf_id);
+ if (!issueEvent.scheduled()) {
+ schedule(issueEvent, curTick());
+ }
+ return RequestStatus_Issued;
+ }
+ }
+
+ // If number of outstanding requests greater than the max allowed,
+ // return RequestStatus_BufferFull. This logic can be extended to
+ // support proper backpressure.
+ if (m_outstanding_count >= m_max_outstanding_requests) {
+ return RequestStatus_BufferFull;
+ }
+
+ RubyRequestType primary_type = RubyRequestType_NULL;
+ RubyRequestType secondary_type = RubyRequestType_NULL;
+
+ if (pkt->isLLSC()) {
+ //
+ // Alpha LL/SC instructions need to be handled carefully by the cache
+ // coherence protocol to ensure they follow the proper semantics. In
+ // particular, by identifying the operations as atomic, the protocol
+ // should understand that migratory sharing optimizations should not
+ // be performed (i.e. a load between the LL and SC should not steal
+ // away exclusive permission).
+ //
+ if (pkt->isWrite()) {
+ primary_type = RubyRequestType_Store_Conditional;
+ } else {
+ assert(pkt->isRead());
+ primary_type = RubyRequestType_Load_Linked;
+ }
+ secondary_type = RubyRequestType_ATOMIC;
+ } else if (pkt->req->isLockedRMW()) {
+ //
+ // x86 locked instructions are translated to store cache coherence
+ // requests because these requests should always be treated as read
+ // exclusive operations and should leverage any migratory sharing
+ // optimization built into the protocol.
+ //
+ if (pkt->isWrite()) {
+ primary_type = RubyRequestType_Locked_RMW_Write;
+ } else {
+ assert(pkt->isRead());
+ primary_type = RubyRequestType_Locked_RMW_Read;
+ }
+ secondary_type = RubyRequestType_ST;
+ } else if (pkt->isAtomicOp()) {
+ //
+ // GPU Atomic Operation
+ //
+ primary_type = RubyRequestType_ATOMIC;
+ secondary_type = RubyRequestType_ATOMIC;
+ } else {
+ if (pkt->isRead()) {
+ if (pkt->req->isInstFetch()) {
+ primary_type = secondary_type = RubyRequestType_IFETCH;
+ } else {
+#if THE_ISA == X86_ISA
+ uint32_t flags = pkt->req->getFlags();
+ bool storeCheck = flags &
+ (TheISA::StoreCheck << TheISA::FlagShift);
+#else
+ bool storeCheck = false;
+#endif // X86_ISA
+ if (storeCheck) {
+ primary_type = RubyRequestType_RMW_Read;
+ secondary_type = RubyRequestType_ST;
+ } else {
+ primary_type = secondary_type = RubyRequestType_LD;
+ }
+ }
+ } else if (pkt->isWrite()) {
+ //
+ // Note: M5 packets do not differentiate ST from RMW_Write
+ //
+ primary_type = secondary_type = RubyRequestType_ST;
+ } else if (pkt->isFlush()) {
+ primary_type = secondary_type = RubyRequestType_FLUSH;
+ } else if (pkt->req->isRelease() || pkt->req->isAcquire()) {
+ if (assumingRfOCoherence) {
+ // If we reached here, this request must be a memFence
+ // and the protocol implements RfO, the coalescer can
+ // assume sequential consistency and schedule the callback
+ // immediately.
+ // Currently the code implements fence callbacks
+ // by reusing the mechanism for kernel completions.
+ // This should be fixed.
+ int wf_id = 0;
+ if (pkt->req->hasContextId()) {
+ wf_id = pkt->req->contextId();
+ }
+ insertKernel(wf_id, pkt);
+ newKernelEnds.push_back(wf_id);
+ if (!issueEvent.scheduled()) {
+ schedule(issueEvent, curTick());
+ }
+ return RequestStatus_Issued;
+ } else {
+ // If not RfO, return issued here and let the child coalescer
+ // take care of it.
+ return RequestStatus_Issued;
+ }
+ } else {
+ panic("Unsupported ruby packet type\n");
+ }
+ }
+
+ // Check if there is any pending request to this cache line from
+ // previous cycles.
+ // If there is a pending request, return aliased. Since coalescing
+ // across time is not permitted, aliased requests are not coalesced.
+ // If a request for this address has already been issued, we must block
+ RequestStatus status = getRequestStatus(pkt, primary_type);
+ if (status != RequestStatus_Ready)
+ return status;
+
+ Addr line_addr = makeLineAddress(pkt->getAddr());
+
+ // Check if this request can be coalesced with previous
+ // requests from this cycle.
+ if (!reqCoalescer.count(line_addr)) {
+ // This is the first access to this cache line.
+ // A new request to the memory subsystem has to be
+ // made in the next cycle for this cache line, so
+ // add this line addr to the "newRequests" queue
+ newRequests.push_back(line_addr);
+
+ // There was a request to this cache line in this cycle,
+ // let us see if we can coalesce this request with the previous
+ // requests from this cycle
+ } else if (primary_type !=
+ reqCoalescer[line_addr][0].second[PrimaryType]) {
+ // can't coalesce loads, stores and atomics!
+ return RequestStatus_Aliased;
+ } else if (pkt->req->isLockedRMW() ||
+ reqCoalescer[line_addr][0].first->req->isLockedRMW()) {
+ // can't coalesce locked accesses, but can coalesce atomics!
+ return RequestStatus_Aliased;
+ } else if (pkt->req->hasContextId() && pkt->req->isRelease() &&
+ pkt->req->contextId() !=
+ reqCoalescer[line_addr][0].first->req->contextId()) {
+ // can't coalesce releases from different wavefronts
+ return RequestStatus_Aliased;
+ }
+
+ // in addition to the packet, we need to save both request types
+ reqCoalescer[line_addr].push_back(
+ RequestDesc(pkt, std::vector<RubyRequestType>()) );
+ reqCoalescer[line_addr].back().second.push_back(primary_type);
+ reqCoalescer[line_addr].back().second.push_back(secondary_type);
+ if (!issueEvent.scheduled())
+ schedule(issueEvent, curTick());
+ // TODO: issue hardware prefetches here
+ return RequestStatus_Issued;
+}
+
+// Issue one RubyRequest to the cache controller for the given packet,
+// folding in the data, the per-byte access mask, and the atomic operations
+// of all requests coalesced onto the same cache line during this cycle.
+void
+GPUCoalescer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type)
+{
+
+    // Forward the requesting context id (if any) to Ruby as the proc id.
+    // NOTE(review): pkt is NULL-checked here but dereferenced
+    // unconditionally below; callers appear to always pass a valid pkt.
+    int proc_id = -1;
+    if (pkt != NULL && pkt->req->hasContextId()) {
+        proc_id = pkt->req->contextId();
+    }
+
+    // If valid, copy the pc to the ruby request
+    Addr pc = 0;
+    if (pkt->req->hasPC()) {
+        pc = pkt->req->getPC();
+    }
+
+    // At the moment setting scopes only counts
+    // for GPU spill space accesses
+    // which is pkt->req->isStack()
+    // this scope is REPLACE since it
+    // does not need to be flushed at the end
+    // of a kernel Private and local may need
+    // to be visible at the end of the kernel
+    HSASegment accessSegment = reqSegmentToHSASegment(pkt->req);
+    HSAScope accessScope = reqScopeToHSAScope(pkt->req);
+
+    Addr line_addr = makeLineAddress(pkt->getAddr());
+
+    // Creating WriteMask that records written bytes
+    // and atomic operations. This enables partial writes
+    // and partial reads of those writes
+    DataBlock dataBlock;
+    dataBlock.clear();
+    uint32_t blockSize = RubySystem::getBlockSizeBytes();
+    std::vector<bool> accessMask(blockSize,false);
+    std::vector< std::pair<int,AtomicOpFunctor*> > atomicOps;
+    uint32_t tableSize = reqCoalescer[line_addr].size();
+    // Walk every request coalesced onto this line: collect atomic op
+    // functors, copy written bytes into the data block, and mark every
+    // touched byte in the access mask.
+    // NOTE(review): the index is int while tableSize is uint32_t; the
+    // signed/unsigned comparison is benign for realistic table sizes.
+    for (int i = 0; i < tableSize; i++) {
+        PacketPtr tmpPkt = reqCoalescer[line_addr][i].first;
+        uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr;
+        uint32_t tmpSize = tmpPkt->getSize();
+        if (tmpPkt->isAtomicOp()) {
+            std::pair<int,AtomicOpFunctor *> tmpAtomicOp(tmpOffset,
+                                                         tmpPkt->getAtomicOp());
+            atomicOps.push_back(tmpAtomicOp);
+        } else if(tmpPkt->isWrite()) {
+            dataBlock.setData(tmpPkt->getPtr<uint8_t>(),
+                              tmpOffset, tmpSize);
+        }
+        for (int j = 0; j < tmpSize; j++) {
+            accessMask[tmpOffset + j] = true;
+        }
+    }
+    // Atomic requests carry their op functors in the message; all other
+    // requests use the RubyRequest overload without atomicOps.
+    std::shared_ptr<RubyRequest> msg;
+    if (pkt->isAtomicOp()) {
+        msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
+                              pkt->getPtr<uint8_t>(),
+                              pkt->getSize(), pc, secondary_type,
+                              RubyAccessMode_Supervisor, pkt,
+                              PrefetchBit_No, proc_id, 100,
+                              blockSize, accessMask,
+                              dataBlock, atomicOps,
+                              accessScope, accessSegment);
+    } else {
+        msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
+                              pkt->getPtr<uint8_t>(),
+                              pkt->getSize(), pc, secondary_type,
+                              RubyAccessMode_Supervisor, pkt,
+                              PrefetchBit_No, proc_id, 100,
+                              blockSize, accessMask,
+                              dataBlock,
+                              accessScope, accessSegment);
+    }
+    DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n",
+             curTick(), m_version, "Coal", "Begin", "", "",
+             printAddress(msg->getPhysicalAddress()),
+             RubyRequestType_to_string(secondary_type));
+
+    fatal_if(secondary_type == RubyRequestType_IFETCH,
+             "there should not be any I-Fetch requests in the GPU Coalescer");
+
+    // Send the message to the cache controller
+    fatal_if(m_data_cache_hit_latency == 0,
+             "should not have a latency of zero");
+
+    assert(m_mandatory_q_ptr);
+    m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency);
+}
+
+// Stream an unordered_map as "[ key=value key=value ... ]"; used by
+// GPUCoalescer::print to dump the read/write request tables.
+template <class KEY, class VALUE>
+std::ostream &
+operator<<(ostream &out, const std::unordered_map<KEY, VALUE> &map)
+{
+    out << "[";
+    for (auto i = map.begin(); i != map.end(); ++i)
+        out << " " << i->first << "=" << i->second;
+    out << " ]";
+
+    return out;
+}
+
+// Dump a one-line summary of coalescer state: version, the number of
+// outstanding requests, and the contents of both request tables.
+void
+GPUCoalescer::print(ostream& out) const
+{
+    out << "[GPUCoalescer: " << m_version
+        << ", outstanding requests: " << m_outstanding_count
+        << ", read request table: " << m_readRequestTable
+        << ", write request table: " << m_writeRequestTable
+        << "]";
+}
+
+// This can be called from setState whenever coherence permissions are
+// upgraded; when invoked, coherence violations will be checked for the
+// given block. Compiles to a no-op unless CHECK_COHERENCE is defined.
+void
+GPUCoalescer::checkCoherence(Addr addr)
+{
+#ifdef CHECK_COHERENCE
+    m_ruby_system->checkGlobalCoherenceInvariant(addr);
+#endif
+}
+
+// Log the given sequencer request type to the RubyStats debug stream.
+void
+GPUCoalescer::recordRequestType(SequencerRequestType requestType) {
+    DPRINTF(RubyStats, "Recorded statistic: %s\n",
+            SequencerRequestType_to_string(requestType));
+}
+
+// Construct the issue event, scheduled at Progress_Event_Pri priority,
+// bound to the owning coalescer.
+GPUCoalescer::IssueEvent::IssueEvent(GPUCoalescer* _seq)
+    : Event(Progress_Event_Pri), seq(_seq)
+{
+}
+
+
+// Issue all requests queued up during this cycle: one memory-subsystem
+// request per cache line (the first coalesced request for that line), then
+// fire any pending kernel-end callbacks.
+void
+GPUCoalescer::completeIssue()
+{
+    // newRequests has the cacheline addresses of all the
+    // requests which need to be issued to the memory subsystem
+    // in this cycle
+    int len = newRequests.size();
+    DPRINTF(GPUCoalescer, "Completing issue for %d new requests.\n", len);
+    for (int i = 0; i < len; ++i) {
+        // Get the requests from reqCoalescer table. Get only the
+        // first request for each cacheline, the remaining requests
+        // can be coalesced with the first request. So, only
+        // one request is issued per cacheline.
+        RequestDesc info = reqCoalescer[newRequests[i]][0];
+        PacketPtr pkt = info.first;
+        DPRINTF(GPUCoalescer, "Completing for newReq %d: paddr %#x\n",
+                i, pkt->req->getPaddr());
+        // Insert this request to the read/writeRequestTables. These tables
+        // are used to track aliased requests in makeRequest subroutine
+        bool found = insertRequest(pkt, info.second[PrimaryType]);
+
+        if (found) {
+            panic("GPUCoalescer::makeRequest should never be called if the "
+                  "request is already outstanding\n");
+        }
+
+        // Issue request to ruby subsystem
+        issueRequest(pkt, info.second[SecondaryType]);
+    }
+    newRequests.clear();
+
+    // have Kernel End releases been issued this cycle
+    len = newKernelEnds.size();
+    for (int i = 0; i < len; i++) {
+        kernelCallback(newKernelEnds[i]);
+    }
+    newKernelEnds.clear();
+}
+
+// Event body: delegate to the coalescer's completeIssue.
+void
+GPUCoalescer::IssueEvent::process()
+{
+    seq->completeIssue();
+}
+
+// Human-readable event name for tracing/debug output.
+const char *
+GPUCoalescer::IssueEvent::description() const
+{
+    return "Issue coalesced request";
+}
+
+// Forward a cache-line eviction notification to the RubyPort base class.
+void
+GPUCoalescer::evictionCallback(Addr address)
+{
+    ruby_eviction_callback(address);
+}
+
+// Complete the kernel-end packet registered for the given wavefront:
+// deliver the hit callback and drop the entry. The wavefront id must have
+// been registered earlier via insertKernel.
+void
+GPUCoalescer::kernelCallback(int wavefront_id)
+{
+    assert(kernelEndList.count(wavefront_id));
+
+    ruby_hit_callback(kernelEndList[wavefront_id]);
+
+    kernelEndList.erase(wavefront_id);
+}
+
+// Completion callback for an atomic on a cache line: remove the
+// outstanding entry from the write request table, copy the returned data
+// into each coalesced packet (unless the type is ATOMIC_NO_RETURN), and
+// deliver hit callbacks for every request coalesced onto the line.
+void
+GPUCoalescer::atomicCallback(Addr address,
+                             MachineType mach,
+                             const DataBlock& data)
+{
+    assert(address == makeLineAddress(address));
+
+    DPRINTF(GPUCoalescer, "atomic callback for address %#x\n", address);
+    assert(m_writeRequestTable.count(makeLineAddress(address)));
+
+    RequestTable::iterator i = m_writeRequestTable.find(address);
+    assert(i != m_writeRequestTable.end());
+    GPUCoalescerRequest* srequest = i->second;
+
+    m_writeRequestTable.erase(i);
+    markRemoved();
+
+    assert((srequest->m_type == RubyRequestType_ATOMIC) ||
+           (srequest->m_type == RubyRequestType_ATOMIC_RETURN) ||
+           (srequest->m_type == RubyRequestType_ATOMIC_NO_RETURN));
+
+
+    // Atomics don't write to cache, so there is no MRU update...
+
+    recordMissLatency(srequest, mach,
+                      srequest->issue_time, Cycles(0), Cycles(0), true, false);
+
+    PacketPtr pkt = srequest->pkt;
+    Addr request_address = pkt->getAddr();
+    Addr request_line_address = makeLineAddress(pkt->getAddr());
+
+    // NOTE(review): the loop below shadows both 'pkt' and the iterator 'i'
+    // declared above; the shadowed outer values are not read afterwards.
+    int len = reqCoalescer[request_line_address].size();
+    std::vector<PacketPtr> mylist;
+    for (int i = 0; i < len; ++i) {
+        PacketPtr pkt = reqCoalescer[request_line_address][i].first;
+        assert(srequest->m_type ==
+               reqCoalescer[request_line_address][i].second[PrimaryType]);
+        request_address = (pkt->getAddr());
+        request_line_address = makeLineAddress(request_address);
+        if (pkt->getPtr<uint8_t>() &&
+            srequest->m_type != RubyRequestType_ATOMIC_NO_RETURN) {
+            /* atomics are done in memory, and return the data *before* the atomic op... */
+            memcpy(pkt->getPtr<uint8_t>(),
+                   data.getData(getOffset(request_address),
+                                pkt->getSize()),
+                   pkt->getSize());
+        } else {
+            DPRINTF(MemoryAccess,
+                    "WARNING. Data not transfered from Ruby to M5 for type " \
+                    "%s\n",
+                    RubyRequestType_to_string(srequest->m_type));
+        }
+
+        // If using the RubyTester, update the RubyTester sender state's
+        // subBlock with the received data. The tester will later access
+        // this state.
+        // Note: RubyPort will access its sender state before the
+        // RubyTester.
+        if (m_usingRubyTester) {
+            RubyPort::SenderState *requestSenderState =
+                safe_cast<RubyPort::SenderState*>(pkt->senderState);
+            RubyTester::SenderState* testerSenderState =
+                safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor);
+            testerSenderState->subBlock.mergeFrom(data);
+        }
+
+        mylist.push_back(pkt);
+    }
+    delete srequest;
+    reqCoalescer.erase(request_line_address);
+    assert(!reqCoalescer.count(request_line_address));
+
+    completeHitCallback(mylist, len);
+}
+
+// Classify a CP (command processor) read completion by where the data came
+// from -- own TCP, a peer TCP, the TCC, or a miss -- and bump the matching
+// statistic.
+void
+GPUCoalescer::recordCPReadCallBack(MachineID myMachID, MachineID senderMachID)
+{
+    if(myMachID == senderMachID) {
+        CP_TCPLdHits++;
+    } else if(machineIDToMachineType(senderMachID) == MachineType_TCP) {
+        CP_TCPLdTransfers++;
+    } else if(machineIDToMachineType(senderMachID) == MachineType_TCC) {
+        CP_TCCLdHits++;
+    } else {
+        CP_LdMiss++;
+    }
+}
+
+// Classify a CP (command processor) write completion by responder -- own
+// TCP, a peer TCP, the TCC, or a miss -- and bump the matching statistic.
+void
+GPUCoalescer::recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID)
+{
+    if(myMachID == senderMachID) {
+        CP_TCPStHits++;
+    } else if(machineIDToMachineType(senderMachID) == MachineType_TCP) {
+        CP_TCPStTransfers++;
+    } else if(machineIDToMachineType(senderMachID) == MachineType_TCC) {
+        CP_TCCStHits++;
+    } else {
+        CP_StMiss++;
+    }
+}
+
+// Deliver hit callbacks for a list of coalesced packets: restore each
+// packet's original sender state, notify its slave port, retry any stalled
+// ports, and finally check whether a pending drain can complete.
+void
+GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist, int len)
+{
+    for (int i = 0; i < len; ++i) {
+        RubyPort::SenderState *ss =
+            safe_cast<RubyPort::SenderState *>(mylist[i]->senderState);
+        MemSlavePort *port = ss->port;
+        assert(port != NULL);
+
+        // Pop our sender state off the packet before handing it back.
+        mylist[i]->senderState = ss->predecessor;
+        delete ss;
+        port->hitCallback(mylist[i]);
+        trySendRetries();
+    }
+
+    testDrainComplete();
+}
+
+// Return the packet of the outstanding READ request for this address.
+// NOTE(review): only the read request table is consulted; the address must
+// have an entry there (asserted).
+PacketPtr
+GPUCoalescer::mapAddrToPkt(Addr address)
+{
+    RequestTable::iterator i = m_readRequestTable.find(address);
+    assert(i != m_readRequestTable.end());
+    GPUCoalescerRequest* request = i->second;
+    return request->pkt;
+}
+
+// Record latency statistics for a completed request: bump the per-machine
+// hit/miss counters, sample the total latency histograms, and -- when the
+// per-stage timestamps are monotonically ordered -- sample the latency
+// breakdown histograms as well.
+void
+GPUCoalescer::recordMissLatency(GPUCoalescerRequest* srequest,
+                                MachineType mach,
+                                Cycles initialRequestTime,
+                                Cycles forwardRequestTime,
+                                Cycles firstResponseTime,
+                                bool success, bool isRegion)
+{
+    RubyRequestType type = srequest->m_type;
+    Cycles issued_time = srequest->issue_time;
+    Cycles completion_time = curCycle();
+    assert(completion_time >= issued_time);
+    Cycles total_lat = completion_time - issued_time;
+
+    // cache stats (valid for RfO protocol only)
+    // NOTE(review): any type other than LD is counted as a store here.
+    if (mach == MachineType_TCP) {
+        if (type == RubyRequestType_LD) {
+            GPU_TCPLdHits++;
+        } else {
+            GPU_TCPStHits++;
+        }
+    } else if (mach == MachineType_L1Cache_wCC) {
+        if (type == RubyRequestType_LD) {
+            GPU_TCPLdTransfers++;
+        } else {
+            GPU_TCPStTransfers++;
+        }
+    } else if (mach == MachineType_TCC) {
+        if (type == RubyRequestType_LD) {
+            GPU_TCCLdHits++;
+        } else {
+            GPU_TCCStHits++;
+        }
+    } else {
+        if (type == RubyRequestType_LD) {
+            GPU_LdMiss++;
+        } else {
+            GPU_StMiss++;
+        }
+    }
+
+    // Profile all access latency, even zero latency accesses
+    m_latencyHist.sample(total_lat);
+    m_typeLatencyHist[type]->sample(total_lat);
+
+    // Profile the miss latency for all non-zero demand misses
+    if (total_lat != Cycles(0)) {
+        m_missLatencyHist.sample(total_lat);
+        m_missTypeLatencyHist[type]->sample(total_lat);
+
+        if (mach != MachineType_NUM) {
+            m_missMachLatencyHist[mach]->sample(total_lat);
+            m_missTypeMachLatencyHist[type][mach]->sample(total_lat);
+
+            // Only sample the stage breakdown when the timestamps are
+            // consistent (non-decreasing through the request's lifetime).
+            if ((issued_time <= initialRequestTime) &&
+                (initialRequestTime <= forwardRequestTime) &&
+                (forwardRequestTime <= firstResponseTime) &&
+                (firstResponseTime <= completion_time)) {
+
+                m_IssueToInitialDelayHist[mach]->sample(
+                    initialRequestTime - issued_time);
+                m_InitialToForwardDelayHist[mach]->sample(
+                    forwardRequestTime - initialRequestTime);
+                m_ForwardToFirstResponseDelayHist[mach]->sample(
+                    firstResponseTime - forwardRequestTime);
+                m_FirstResponseToCompletionDelayHist[mach]->sample(
+                    completion_time - firstResponseTime);
+            }
+        }
+
+    }
+
+    DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %d cycles\n",
+             curTick(), m_version, "Coal",
+             success ? "Done" : "SC_Failed", "", "",
+             printAddress(srequest->pkt->getAddr()), total_lat);
+}
+
+// Register all statistics: latency histograms (overall, per request type,
+// per machine type, and per type x machine), the miss-latency breakdown
+// histograms, and the scalar GPU/CP hit/miss counters.
+void
+GPUCoalescer::regStats()
+{
+    // These statistical variables are not for display.
+    // The profiler will collate these across different
+    // coalescers and display those collated statistics.
+    m_outstandReqHist.init(10);
+    m_latencyHist.init(10);
+    m_missLatencyHist.init(10);
+
+    for (int i = 0; i < RubyRequestType_NUM; i++) {
+        m_typeLatencyHist.push_back(new Stats::Histogram());
+        m_typeLatencyHist[i]->init(10);
+
+        m_missTypeLatencyHist.push_back(new Stats::Histogram());
+        m_missTypeLatencyHist[i]->init(10);
+    }
+
+    for (int i = 0; i < MachineType_NUM; i++) {
+        m_missMachLatencyHist.push_back(new Stats::Histogram());
+        m_missMachLatencyHist[i]->init(10);
+
+        m_IssueToInitialDelayHist.push_back(new Stats::Histogram());
+        m_IssueToInitialDelayHist[i]->init(10);
+
+        m_InitialToForwardDelayHist.push_back(new Stats::Histogram());
+        m_InitialToForwardDelayHist[i]->init(10);
+
+        m_ForwardToFirstResponseDelayHist.push_back(new Stats::Histogram());
+        m_ForwardToFirstResponseDelayHist[i]->init(10);
+
+        m_FirstResponseToCompletionDelayHist.push_back(new Stats::Histogram());
+        m_FirstResponseToCompletionDelayHist[i]->init(10);
+    }
+
+    for (int i = 0; i < RubyRequestType_NUM; i++) {
+        m_missTypeMachLatencyHist.push_back(std::vector<Stats::Histogram *>());
+
+        for (int j = 0; j < MachineType_NUM; j++) {
+            m_missTypeMachLatencyHist[i].push_back(new Stats::Histogram());
+            m_missTypeMachLatencyHist[i][j]->init(10);
+        }
+    }
+
+    // GPU cache stats
+    GPU_TCPLdHits
+        .name(name() + ".gpu_tcp_ld_hits")
+        .desc("loads that hit in the TCP")
+        ;
+    GPU_TCPLdTransfers
+        .name(name() + ".gpu_tcp_ld_transfers")
+        .desc("TCP to TCP load transfers")
+        ;
+    GPU_TCCLdHits
+        .name(name() + ".gpu_tcc_ld_hits")
+        .desc("loads that hit in the TCC")
+        ;
+    GPU_LdMiss
+        .name(name() + ".gpu_ld_misses")
+        .desc("loads that miss in the GPU")
+        ;
+
+    GPU_TCPStHits
+        .name(name() + ".gpu_tcp_st_hits")
+        .desc("stores that hit in the TCP")
+        ;
+    GPU_TCPStTransfers
+        .name(name() + ".gpu_tcp_st_transfers")
+        .desc("TCP to TCP store transfers")
+        ;
+    GPU_TCCStHits
+        .name(name() + ".gpu_tcc_st_hits")
+        .desc("stores that hit in the TCC")
+        ;
+    GPU_StMiss
+        .name(name() + ".gpu_st_misses")
+        .desc("stores that miss in the GPU")
+        ;
+
+    // CP cache stats
+    CP_TCPLdHits
+        .name(name() + ".cp_tcp_ld_hits")
+        .desc("loads that hit in the TCP")
+        ;
+    CP_TCPLdTransfers
+        .name(name() + ".cp_tcp_ld_transfers")
+        .desc("TCP to TCP load transfers")
+        ;
+    CP_TCCLdHits
+        .name(name() + ".cp_tcc_ld_hits")
+        .desc("loads that hit in the TCC")
+        ;
+    CP_LdMiss
+        .name(name() + ".cp_ld_misses")
+        .desc("loads that miss in the GPU")
+        ;
+
+    CP_TCPStHits
+        .name(name() + ".cp_tcp_st_hits")
+        .desc("stores that hit in the TCP")
+        ;
+    CP_TCPStTransfers
+        .name(name() + ".cp_tcp_st_transfers")
+        .desc("TCP to TCP store transfers")
+        ;
+    CP_TCCStHits
+        .name(name() + ".cp_tcc_st_hits")
+        .desc("stores that hit in the TCC")
+        ;
+    CP_StMiss
+        .name(name() + ".cp_st_misses")
+        .desc("stores that miss in the GPU")
+        ;
+}
diff --git a/src/mem/ruby/system/GPUCoalescer.hh b/src/mem/ruby/system/GPUCoalescer.hh
new file mode 100644
index 000000000..dbd47059c
--- /dev/null
+++ b/src/mem/ruby/system/GPUCoalescer.hh
@@ -0,0 +1,368 @@
+/*
+ * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#ifndef __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__
+#define __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__
+
+#include <iostream>
+#include <unordered_map>
+
+#include "base/statistics.hh"
+#include "mem/protocol/HSAScope.hh"
+#include "mem/protocol/HSASegment.hh"
+#include "mem/protocol/PrefetchBit.hh"
+#include "mem/protocol/RubyAccessMode.hh"
+#include "mem/protocol/RubyRequestType.hh"
+#include "mem/protocol/SequencerRequestType.hh"
+#include "mem/request.hh"
+#include "mem/ruby/common/Address.hh"
+#include "mem/ruby/common/Consumer.hh"
+#include "mem/ruby/system/RubyPort.hh"
+
+class DataBlock;
+class CacheMsg;
+class MachineID;
+class CacheMemory;
+
+class RubyGPUCoalescerParams;
+
+HSAScope reqScopeToHSAScope(Request* req);
+HSASegment reqSegmentToHSASegment(Request* req);
+
+// Bookkeeping for one outstanding request tracked by the coalescer: the
+// originating packet, its Ruby request type, and the cycle it was issued
+// (used later for latency statistics).
+struct GPUCoalescerRequest
+{
+    PacketPtr pkt;
+    RubyRequestType m_type;
+    Cycles issue_time;
+
+    GPUCoalescerRequest(PacketPtr _pkt, RubyRequestType _m_type,
+                        Cycles _issue_time)
+        : pkt(_pkt), m_type(_m_type), issue_time(_issue_time)
+    {}
+};
+
+std::ostream& operator<<(std::ostream& out, const GPUCoalescerRequest& obj);
+
+// RubyPort that coalesces same-cycle GPU memory requests to the same cache
+// line into a single Ruby request. Tracks outstanding reads/writes,
+// kernel-end completions, and a large set of latency/hit statistics.
+// VIPERCoalescer derives from this class and overrides makeRequest and
+// issueRequest.
+class GPUCoalescer : public RubyPort
+{
+  public:
+    typedef RubyGPUCoalescerParams Params;
+    GPUCoalescer(const Params *);
+    ~GPUCoalescer();
+
+    // Public Methods
+    void wakeup(); // Used only for deadlock detection
+
+    void printProgress(std::ostream& out) const;
+    void resetStats();
+    void collateStats();
+    void regStats();
+
+    // Completion callbacks invoked by the generated SLICC protocol code;
+    // the overloads differ in how much timing information the protocol
+    // can supply.
+    void writeCallback(Addr address, DataBlock& data);
+
+    void writeCallback(Addr address,
+                       MachineType mach,
+                       DataBlock& data);
+
+    void writeCallback(Addr address,
+                       MachineType mach,
+                       DataBlock& data,
+                       Cycles initialRequestTime,
+                       Cycles forwardRequestTime,
+                       Cycles firstResponseTime,
+                       bool isRegion);
+
+    void writeCallback(Addr address,
+                       MachineType mach,
+                       DataBlock& data,
+                       Cycles initialRequestTime,
+                       Cycles forwardRequestTime,
+                       Cycles firstResponseTime);
+
+    void readCallback(Addr address, DataBlock& data);
+
+    void readCallback(Addr address,
+                      MachineType mach,
+                      DataBlock& data);
+
+    void readCallback(Addr address,
+                      MachineType mach,
+                      DataBlock& data,
+                      Cycles initialRequestTime,
+                      Cycles forwardRequestTime,
+                      Cycles firstResponseTime);
+
+    void readCallback(Addr address,
+                      MachineType mach,
+                      DataBlock& data,
+                      Cycles initialRequestTime,
+                      Cycles forwardRequestTime,
+                      Cycles firstResponseTime,
+                      bool isRegion);
+    /* atomics need their own callback because the data
+       might be const coming from SLICC */
+    void atomicCallback(Addr address,
+                        MachineType mach,
+                        const DataBlock& data);
+
+    void recordCPReadCallBack(MachineID myMachID, MachineID senderMachID);
+    void recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID);
+
+    // Alternate implementations in VIPER Coalescer
+    virtual RequestStatus makeRequest(PacketPtr pkt);
+
+    int outstandingCount() const { return m_outstanding_count; }
+
+    bool
+    isDeadlockEventScheduled() const
+    {
+        return deadlockCheckEvent.scheduled();
+    }
+
+    void
+    descheduleDeadlockEvent()
+    {
+        deschedule(deadlockCheckEvent);
+    }
+
+    bool empty() const;
+
+    void print(std::ostream& out) const;
+    void checkCoherence(Addr address);
+
+    void markRemoved();
+    void removeRequest(GPUCoalescerRequest* request);
+    void evictionCallback(Addr address);
+    void completeIssue();
+
+    void insertKernel(int wavefront_id, PacketPtr pkt);
+
+    void recordRequestType(SequencerRequestType requestType);
+
+    // Accessors for the statistics histograms; used by the profiler to
+    // collate stats across coalescers.
+    Stats::Histogram& getOutstandReqHist() { return m_outstandReqHist; }
+
+    Stats::Histogram& getLatencyHist() { return m_latencyHist; }
+    Stats::Histogram& getTypeLatencyHist(uint32_t t)
+    { return *m_typeLatencyHist[t]; }
+
+    Stats::Histogram& getMissLatencyHist()
+    { return m_missLatencyHist; }
+    Stats::Histogram& getMissTypeLatencyHist(uint32_t t)
+    { return *m_missTypeLatencyHist[t]; }
+
+    Stats::Histogram& getMissMachLatencyHist(uint32_t t) const
+    { return *m_missMachLatencyHist[t]; }
+
+    Stats::Histogram&
+    getMissTypeMachLatencyHist(uint32_t r, uint32_t t) const
+    { return *m_missTypeMachLatencyHist[r][t]; }
+
+    Stats::Histogram& getIssueToInitialDelayHist(uint32_t t) const
+    { return *m_IssueToInitialDelayHist[t]; }
+
+    Stats::Histogram&
+    getInitialToForwardDelayHist(const MachineType t) const
+    { return *m_InitialToForwardDelayHist[t]; }
+
+    Stats::Histogram&
+    getForwardRequestToFirstResponseHist(const MachineType t) const
+    { return *m_ForwardToFirstResponseDelayHist[t]; }
+
+    Stats::Histogram&
+    getFirstResponseToCompletionDelayHist(const MachineType t) const
+    { return *m_FirstResponseToCompletionDelayHist[t]; }
+
+    // Changed to protected to enable inheritance by VIPER Coalescer
+  protected:
+    bool tryCacheAccess(Addr addr, RubyRequestType type,
+                        Addr pc, RubyAccessMode access_mode,
+                        int size, DataBlock*& data_ptr);
+    // Alternate implementations in VIPER Coalescer
+    virtual void issueRequest(PacketPtr pkt, RubyRequestType type);
+
+    void kernelCallback(int wavfront_id);
+
+    void hitCallback(GPUCoalescerRequest* request,
+                     MachineType mach,
+                     DataBlock& data,
+                     bool success,
+                     Cycles initialRequestTime,
+                     Cycles forwardRequestTime,
+                     Cycles firstResponseTime,
+                     bool isRegion);
+    void recordMissLatency(GPUCoalescerRequest* request,
+                           MachineType mach,
+                           Cycles initialRequestTime,
+                           Cycles forwardRequestTime,
+                           Cycles firstResponseTime,
+                           bool success, bool isRegion);
+    void completeHitCallback(std::vector<PacketPtr> & mylist, int len);
+    PacketPtr mapAddrToPkt(Addr address);
+
+
+    RequestStatus getRequestStatus(PacketPtr pkt,
+                                   RubyRequestType request_type);
+    bool insertRequest(PacketPtr pkt, RubyRequestType request_type);
+
+    bool handleLlsc(Addr address, GPUCoalescerRequest* request);
+
+    // Private copy constructor and assignment operator
+    GPUCoalescer(const GPUCoalescer& obj);
+    GPUCoalescer& operator=(const GPUCoalescer& obj);
+
+    // Event that drains the per-cycle coalescing tables by calling
+    // completeIssue.
+    class IssueEvent : public Event
+    {
+      private:
+        GPUCoalescer *seq;
+      public:
+        IssueEvent(GPUCoalescer *_seq);
+        void process();
+        const char *description() const;
+    };
+
+    IssueEvent issueEvent;
+
+
+    // Changed to protected to enable inheritance by VIPER Coalescer
+  protected:
+    int m_max_outstanding_requests;
+    int m_deadlock_threshold;
+
+    CacheMemory* m_dataCache_ptr;
+    CacheMemory* m_instCache_ptr;
+
+    // The cache access latency for this GPU data cache. This is assessed at the
+    // beginning of each access. This should be very similar to the
+    // implementation in Sequencer() as this is very much like a Sequencer
+    Cycles m_data_cache_hit_latency;
+
+    // We need to track both the primary and secondary request types.
+    // The secondary request type comprises a subset of RubyRequestTypes that
+    // are understood by the L1 Controller. A primary request type can be any
+    // RubyRequestType.
+    enum {PrimaryType, SecondaryType};
+    typedef std::pair<PacketPtr, std::vector<RubyRequestType> > RequestDesc;
+    typedef std::unordered_map<Addr, std::vector<RequestDesc> > CoalescingTable;
+    CoalescingTable reqCoalescer;
+    std::vector<Addr> newRequests;
+
+    typedef std::unordered_map<Addr, GPUCoalescerRequest*> RequestTable;
+    RequestTable m_writeRequestTable;
+    RequestTable m_readRequestTable;
+    // Global outstanding request count, across all request tables
+    int m_outstanding_count;
+    bool m_deadlock_check_scheduled;
+    // Kernel-end packets, keyed by wavefront id, awaiting completion.
+    std::unordered_map<int, PacketPtr> kernelEndList;
+    std::vector<int> newKernelEnds;
+
+    int m_store_waiting_on_load_cycles;
+    int m_store_waiting_on_store_cycles;
+    int m_load_waiting_on_store_cycles;
+    int m_load_waiting_on_load_cycles;
+
+    bool m_usingNetworkTester;
+
+    // Periodic event used only for deadlock detection (see wakeup()).
+    class GPUCoalescerWakeupEvent : public Event
+    {
+      private:
+        GPUCoalescer *m_GPUCoalescer_ptr;
+
+      public:
+        GPUCoalescerWakeupEvent(GPUCoalescer *_seq) :
+            m_GPUCoalescer_ptr(_seq) {}
+        void process() { m_GPUCoalescer_ptr->wakeup(); }
+        const char *description() const
+        {
+            return "GPUCoalescer deadlock check";
+        }
+    };
+
+    GPUCoalescerWakeupEvent deadlockCheckEvent;
+    bool assumingRfOCoherence;
+
+    // m5 style stats for TCP hit/miss counts
+    Stats::Scalar GPU_TCPLdHits;
+    Stats::Scalar GPU_TCPLdTransfers;
+    Stats::Scalar GPU_TCCLdHits;
+    Stats::Scalar GPU_LdMiss;
+
+    Stats::Scalar GPU_TCPStHits;
+    Stats::Scalar GPU_TCPStTransfers;
+    Stats::Scalar GPU_TCCStHits;
+    Stats::Scalar GPU_StMiss;
+
+    Stats::Scalar CP_TCPLdHits;
+    Stats::Scalar CP_TCPLdTransfers;
+    Stats::Scalar CP_TCCLdHits;
+    Stats::Scalar CP_LdMiss;
+
+    Stats::Scalar CP_TCPStHits;
+    Stats::Scalar CP_TCPStTransfers;
+    Stats::Scalar CP_TCCStHits;
+    Stats::Scalar CP_StMiss;
+
+    //! Histogram for number of outstanding requests per cycle.
+    Stats::Histogram m_outstandReqHist;
+
+    //! Histogram for holding latency profile of all requests.
+    Stats::Histogram m_latencyHist;
+    std::vector<Stats::Histogram *> m_typeLatencyHist;
+
+    //! Histogram for holding latency profile of all requests that
+    //! miss in the controller connected to this sequencer.
+    Stats::Histogram m_missLatencyHist;
+    std::vector<Stats::Histogram *> m_missTypeLatencyHist;
+
+    //! Histograms for profiling the latencies for requests that
+    //! required external messages.
+    std::vector<Stats::Histogram *> m_missMachLatencyHist;
+    std::vector< std::vector<Stats::Histogram *> > m_missTypeMachLatencyHist;
+
+    //! Histograms for recording the breakdown of miss latency
+    std::vector<Stats::Histogram *> m_IssueToInitialDelayHist;
+    std::vector<Stats::Histogram *> m_InitialToForwardDelayHist;
+    std::vector<Stats::Histogram *> m_ForwardToFirstResponseDelayHist;
+    std::vector<Stats::Histogram *> m_FirstResponseToCompletionDelayHist;
+};
+
+inline std::ostream&
+operator<<(std::ostream& out, const GPUCoalescer& obj)
+{
+ obj.print(out);
+ out << std::flush;
+ return out;
+}
+
+#endif // __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__
+
diff --git a/src/mem/ruby/system/GPUCoalescer.py b/src/mem/ruby/system/GPUCoalescer.py
new file mode 100644
index 000000000..0c19f875d
--- /dev/null
+++ b/src/mem/ruby/system/GPUCoalescer.py
@@ -0,0 +1,48 @@
+# Copyright (c) 2015 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Authors: Steve Reinhardt
+# Brad Beckmann
+
+from m5.params import *
+from m5.proxy import *
+from Sequencer import *
+
+class RubyGPUCoalescer(RubySequencer):
+ type = 'RubyGPUCoalescer'
+ cxx_class = 'GPUCoalescer'
+ cxx_header = "mem/ruby/system/GPUCoalescer.hh"
+
+ # max_outstanding_requests = (wave front slots) x (wave front size)
+ max_outstanding_requests = Param.Int(40*64,
+ "max requests (incl. prefetches) outstanding")
+ assume_rfo = Param.Bool(True, "assume protocol implementes Read for "
+ "Ownership coherence");
diff --git a/src/mem/ruby/system/RubyPort.cc b/src/mem/ruby/system/RubyPort.cc
index 5a5f528bb..bf4002126 100644
--- a/src/mem/ruby/system/RubyPort.cc
+++ b/src/mem/ruby/system/RubyPort.cc
@@ -60,7 +60,8 @@ RubyPort::RubyPort(const Params *p)
memSlavePort(csprintf("%s-mem-slave-port", name()), this,
p->ruby_system->getAccessBackingStore(), -1,
p->no_retry_on_stall),
- gotAddrRanges(p->port_master_connection_count)
+ gotAddrRanges(p->port_master_connection_count),
+ m_isCPUSequencer(p->is_cpu_sequencer)
{
assert(m_version != -1);
diff --git a/src/mem/ruby/system/RubyPort.hh b/src/mem/ruby/system/RubyPort.hh
index 07e0fde5a..6bd92b654 100644
--- a/src/mem/ruby/system/RubyPort.hh
+++ b/src/mem/ruby/system/RubyPort.hh
@@ -167,6 +167,8 @@ class RubyPort : public MemObject
uint32_t getId() { return m_version; }
DrainState drain() override;
+ bool isCPUSequencer() { return m_isCPUSequencer; }
+
protected:
void trySendRetries();
void ruby_hit_callback(PacketPtr pkt);
@@ -218,6 +220,8 @@ class RubyPort : public MemObject
// that should be called when the Sequencer becomes available after a stall.
//
std::vector<MemSlavePort *> retryList;
+
+ bool m_isCPUSequencer;
};
#endif // __MEM_RUBY_SYSTEM_RUBYPORT_HH__
diff --git a/src/mem/ruby/system/RubySystem.cc b/src/mem/ruby/system/RubySystem.cc
index 1ecd2e098..e1717e519 100644
--- a/src/mem/ruby/system/RubySystem.cc
+++ b/src/mem/ruby/system/RubySystem.cc
@@ -107,7 +107,7 @@ RubySystem::makeCacheRecorder(uint8_t *uncompressed_trace,
Sequencer* sequencer_ptr = NULL;
for (int cntrl = 0; cntrl < m_abs_cntrl_vec.size(); cntrl++) {
- sequencer_map.push_back(m_abs_cntrl_vec[cntrl]->getSequencer());
+ sequencer_map.push_back(m_abs_cntrl_vec[cntrl]->getCPUSequencer());
if (sequencer_ptr == NULL) {
sequencer_ptr = sequencer_map[cntrl];
}
diff --git a/src/mem/ruby/system/SConscript b/src/mem/ruby/system/SConscript
index 8c5077362..b67311bca 100644
--- a/src/mem/ruby/system/SConscript
+++ b/src/mem/ruby/system/SConscript
@@ -33,12 +33,22 @@ Import('*')
if env['PROTOCOL'] == 'None':
Return()
+if env['BUILD_GPU']:
+ SimObject('GPUCoalescer.py')
SimObject('RubySystem.py')
SimObject('Sequencer.py')
+SimObject('WeightedLRUReplacementPolicy.py')
+if env['BUILD_GPU']:
+ SimObject('VIPERCoalescer.py')
Source('CacheRecorder.cc')
Source('DMASequencer.cc')
+if env['BUILD_GPU']:
+ Source('GPUCoalescer.cc')
Source('RubyPort.cc')
Source('RubyPortProxy.cc')
Source('RubySystem.cc')
Source('Sequencer.cc')
+if env['BUILD_GPU']:
+ Source('VIPERCoalescer.cc')
+Source('WeightedLRUPolicy.cc')
diff --git a/src/mem/ruby/system/Sequencer.cc b/src/mem/ruby/system/Sequencer.cc
index 50418c700..c2727b41d 100644
--- a/src/mem/ruby/system/Sequencer.cc
+++ b/src/mem/ruby/system/Sequencer.cc
@@ -63,6 +63,7 @@ Sequencer::Sequencer(const Params *p)
m_max_outstanding_requests = p->max_outstanding_requests;
m_deadlock_threshold = p->deadlock_threshold;
+ m_coreId = p->coreid; // for tracking the two CorePair sequencers
assert(m_max_outstanding_requests > 0);
assert(m_deadlock_threshold > 0);
assert(m_instCache_ptr != NULL);
@@ -593,6 +594,8 @@ Sequencer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type)
ContextID proc_id = pkt->req->hasContextId() ?
pkt->req->contextId() : InvalidContextID;
+ ContextID core_id = coreId();
+
// If valid, copy the pc to the ruby request
Addr pc = 0;
if (pkt->req->hasPC()) {
@@ -607,7 +610,7 @@ Sequencer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type)
nullptr : pkt->getPtr<uint8_t>(),
pkt->getSize(), pc, secondary_type,
RubyAccessMode_Supervisor, pkt,
- PrefetchBit_No, proc_id);
+ PrefetchBit_No, proc_id, core_id);
DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %#x %s\n",
curTick(), m_version, "Seq", "Begin", "", "",
diff --git a/src/mem/ruby/system/Sequencer.hh b/src/mem/ruby/system/Sequencer.hh
index 47af7ea1e..2a2f49587 100644
--- a/src/mem/ruby/system/Sequencer.hh
+++ b/src/mem/ruby/system/Sequencer.hh
@@ -99,6 +99,7 @@ class Sequencer : public RubyPort
void markRemoved();
void evictionCallback(Addr address);
void invalidateSC(Addr address);
+ int coreId() const { return m_coreId; }
void recordRequestType(SequencerRequestType requestType);
Stats::Histogram& getOutstandReqHist() { return m_outstandReqHist; }
@@ -198,6 +199,8 @@ class Sequencer : public RubyPort
Stats::Scalar m_load_waiting_on_store;
Stats::Scalar m_load_waiting_on_load;
+ int m_coreId;
+
bool m_usingNetworkTester;
//! Histogram for number of outstanding requests per cycle.
diff --git a/src/mem/ruby/system/Sequencer.py b/src/mem/ruby/system/Sequencer.py
index 7c90eb29c..d6ee0aa2f 100644
--- a/src/mem/ruby/system/Sequencer.py
+++ b/src/mem/ruby/system/Sequencer.py
@@ -32,54 +32,58 @@ from m5.proxy import *
from MemObject import MemObject
class RubyPort(MemObject):
- type = 'RubyPort'
- abstract = True
- cxx_header = "mem/ruby/system/RubyPort.hh"
- version = Param.Int(0, "")
+ type = 'RubyPort'
+ abstract = True
+ cxx_header = "mem/ruby/system/RubyPort.hh"
+ version = Param.Int(0, "")
- slave = VectorSlavePort("CPU slave port")
- master = VectorMasterPort("CPU master port")
- pio_master_port = MasterPort("Ruby mem master port")
- mem_master_port = MasterPort("Ruby mem master port")
- pio_slave_port = SlavePort("Ruby pio slave port")
- mem_slave_port = SlavePort("Ruby memory port")
+ slave = VectorSlavePort("CPU slave port")
+ master = VectorMasterPort("CPU master port")
+ pio_master_port = MasterPort("Ruby mem master port")
+ mem_master_port = MasterPort("Ruby mem master port")
+ pio_slave_port = SlavePort("Ruby pio slave port")
+ mem_slave_port = SlavePort("Ruby memory port")
- using_ruby_tester = Param.Bool(False, "")
- no_retry_on_stall = Param.Bool(False, "")
- ruby_system = Param.RubySystem(Parent.any, "")
- system = Param.System(Parent.any, "system object")
- support_data_reqs = Param.Bool(True, "data cache requests supported")
- support_inst_reqs = Param.Bool(True, "inst cache requests supported")
+ using_ruby_tester = Param.Bool(False, "")
+ no_retry_on_stall = Param.Bool(False, "")
+ ruby_system = Param.RubySystem(Parent.any, "")
+ system = Param.System(Parent.any, "system object")
+ support_data_reqs = Param.Bool(True, "data cache requests supported")
+ support_inst_reqs = Param.Bool(True, "inst cache requests supported")
+ is_cpu_sequencer = Param.Bool(True, "connected to a cpu")
class RubyPortProxy(RubyPort):
- type = 'RubyPortProxy'
- cxx_header = "mem/ruby/system/RubyPortProxy.hh"
+ type = 'RubyPortProxy'
+ cxx_header = "mem/ruby/system/RubyPortProxy.hh"
class RubySequencer(RubyPort):
- type = 'RubySequencer'
- cxx_class = 'Sequencer'
- cxx_header = "mem/ruby/system/Sequencer.hh"
+ type = 'RubySequencer'
+ cxx_class = 'Sequencer'
+ cxx_header = "mem/ruby/system/Sequencer.hh"
- icache = Param.RubyCache("")
- dcache = Param.RubyCache("")
- # Cache latencies currently assessed at the beginning of each access
- # NOTE: Setting these values to a value greater than one will result in
- # O3 CPU pipeline bubbles and negatively impact performance
- # TODO: Latencies should be migrated into each top-level cache controller
- icache_hit_latency = Param.Cycles(1, "Inst cache hit latency")
- dcache_hit_latency = Param.Cycles(1, "Data cache hit latency")
- max_outstanding_requests = Param.Int(16,
- "max requests (incl. prefetches) outstanding")
- deadlock_threshold = Param.Cycles(500000,
- "max outstanding cycles for a request before deadlock/livelock declared")
- using_network_tester = Param.Bool(False, "")
+ icache = Param.RubyCache("")
+ dcache = Param.RubyCache("")
+ # Cache latencies currently assessed at the beginning of each access
+ # NOTE: Setting these values to a value greater than one will result in
+ # O3 CPU pipeline bubbles and negatively impact performance
+ # TODO: Latencies should be migrated into each top-level cache controller
+ icache_hit_latency = Param.Cycles(1, "Inst cache hit latency")
+ dcache_hit_latency = Param.Cycles(1, "Data cache hit latency")
+ max_outstanding_requests = Param.Int(16,
+ "max requests (incl. prefetches) outstanding")
+ deadlock_threshold = Param.Cycles(500000,
+ "max outstanding cycles for a request before deadlock/livelock declared")
+ using_network_tester = Param.Bool(False, "")
+ # id used by protocols that support multiple sequencers per controller
+ # 99 is the dummy default value
+ coreid = Param.Int(99, "CorePair core id")
class DMASequencer(MemObject):
- type = 'DMASequencer'
- cxx_header = "mem/ruby/system/DMASequencer.hh"
+ type = 'DMASequencer'
+ cxx_header = "mem/ruby/system/DMASequencer.hh"
- version = Param.Int(0, "")
- slave = SlavePort("Device slave port")
- using_ruby_tester = Param.Bool(False, "")
- ruby_system = Param.RubySystem(Parent.any, "")
- system = Param.System(Parent.any, "system object")
+ version = Param.Int(0, "")
+ slave = SlavePort("Device slave port")
+ using_ruby_tester = Param.Bool(False, "")
+ ruby_system = Param.RubySystem(Parent.any, "")
+ system = Param.System(Parent.any, "system object")
diff --git a/src/mem/ruby/system/VIPERCoalescer.cc b/src/mem/ruby/system/VIPERCoalescer.cc
new file mode 100644
index 000000000..ca91f2723
--- /dev/null
+++ b/src/mem/ruby/system/VIPERCoalescer.cc
@@ -0,0 +1,287 @@
+/*
+ * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#include "base/misc.hh"
+#include "base/str.hh"
+#include "config/the_isa.hh"
+
+#if THE_ISA == X86_ISA
+#include "arch/x86/insts/microldstop.hh"
+
+#endif // X86_ISA
+#include "mem/ruby/system/VIPERCoalescer.hh"
+
+#include "cpu/testers/rubytest/RubyTester.hh"
+#include "debug/GPUCoalescer.hh"
+#include "debug/MemoryAccess.hh"
+#include "mem/packet.hh"
+#include "mem/ruby/common/SubBlock.hh"
+#include "mem/ruby/network/MessageBuffer.hh"
+#include "mem/ruby/profiler/Profiler.hh"
+#include "mem/ruby/slicc_interface/AbstractController.hh"
+#include "mem/ruby/slicc_interface/RubyRequest.hh"
+#include "mem/ruby/structures/CacheMemory.hh"
+#include "mem/ruby/system/GPUCoalescer.hh"
+#include "mem/ruby/system/RubySystem.hh"
+#include "params/VIPERCoalescer.hh"
+
+using namespace std;
+
+VIPERCoalescer *
+VIPERCoalescerParams::create()
+{
+ return new VIPERCoalescer(this);
+}
+
+VIPERCoalescer::VIPERCoalescer(const Params *p)
+ : GPUCoalescer(p)
+{
+ m_max_wb_per_cycle=p->max_wb_per_cycle;
+ m_max_inv_per_cycle=p->max_inv_per_cycle;
+ m_outstanding_inv = 0;
+ m_outstanding_wb = 0;
+}
+
+VIPERCoalescer::~VIPERCoalescer()
+{
+}
+
+// Analyzes the packet to see if this request can be coalesced.
+// If request can be coalesced, this request is added to the reqCoalescer table
+// and makeRequest returns RequestStatus_Issued;
+// If this is the first request to a cacheline, request is added to both
+// newRequests queue and to the reqCoalescer table; makeRequest
+// returns RequestStatus_Issued.
+// If there is a pending request to this cacheline and this request
+// can't be coalesced, RequestStatus_Aliased is returned and
+// the packet needs to be reissued.
+RequestStatus
+VIPERCoalescer::makeRequest(PacketPtr pkt)
+{
+    if (m_outstanding_wb || m_outstanding_inv) {
+        DPRINTF(GPUCoalescer,
+                "There are %d Writebacks and %d Invalidations\n",
+                m_outstanding_wb, m_outstanding_inv);
+    }
+    // Are we in the middle of a release
+    if ((m_outstanding_wb) > 0) {
+        if (pkt->req->isKernel()) {
+            // Everything is fine
+            // Barriers and Kernel End can coalesce
+            // If it is a Kernel Begin flush the cache
+            if (pkt->req->isAcquire() && (m_outstanding_inv == 0)) {
+                invL1();
+            }
+
+            if (pkt->req->isRelease()) {
+                insertKernel(pkt->req->contextId(), pkt);
+            }
+
+            return RequestStatus_Issued;
+        }
+//        return RequestStatus_Aliased;
+    } else if (pkt->req->isKernel() && pkt->req->isRelease()) {
+        // Flush Dirty Data on Kernel End
+        // isKernel + isRelease
+        insertKernel(pkt->req->contextId(), pkt);
+        wbL1();
+        if (m_outstanding_wb == 0) {
+            for (auto it = kernelEndList.begin(); it != kernelEndList.end(); it++) {
+                newKernelEnds.push_back(it->first);
+            }
+            completeIssue();
+        }
+        return RequestStatus_Issued;
+    }
+    RequestStatus requestStatus = GPUCoalescer::makeRequest(pkt);
+    if (requestStatus != RequestStatus_Issued) {
+        // Request not issued
+        // enqueue Retry
+        DPRINTF(GPUCoalescer, "Request not issued by GPUCoalescer\n");
+        return requestStatus;
+    } else if (pkt->req->isKernel() && pkt->req->isAcquire()) {
+        // Invalidate clean Data on Kernel Begin
+        // isKernel + isAcquire
+        invL1();
+    } else if (pkt->req->isAcquire() && pkt->req->isRelease()) {
+        // Deschedule the AtomicAcqRel and
+        // Flush and Invalidate the L1 cache
+        invwbL1();
+        if (m_outstanding_wb > 0 && issueEvent.scheduled()) {
+            DPRINTF(GPUCoalescer, "issueEvent Descheduled\n");
+            deschedule(issueEvent);
+        }
+    } else if (pkt->req->isRelease()) {
+        // Deschedule the StoreRel and
+        // Flush the L1 cache
+        wbL1();
+        if (m_outstanding_wb > 0 && issueEvent.scheduled()) {
+            DPRINTF(GPUCoalescer, "issueEvent Descheduled\n");
+            deschedule(issueEvent);
+        }
+    } else if (pkt->req->isAcquire()) {
+        // LoadAcq or AtomicAcq
+        // Invalidate the L1 cache
+        invL1();
+    }
+    // Request was successful
+    if (m_outstanding_wb == 0) {
+        if (!issueEvent.scheduled()) {
+            DPRINTF(GPUCoalescer, "issueEvent Rescheduled\n");
+            schedule(issueEvent, curTick());
+        }
+    }
+    return RequestStatus_Issued;
+}
+
+void
+VIPERCoalescer::wbCallback(Addr addr)
+{
+    m_outstanding_wb--;
+    // if L1 Flush Complete
+    // attempt to schedule issueEvent
+    assert(((int) m_outstanding_wb) >= 0);
+    if (m_outstanding_wb == 0) {
+        for (auto it = kernelEndList.begin(); it != kernelEndList.end(); it++) {
+            newKernelEnds.push_back(it->first);
+        }
+        completeIssue();
+    }
+    trySendRetries();
+}
+
+void
+VIPERCoalescer::invCallback(Addr addr)
+{
+    m_outstanding_inv--;
+    // if L1 Flush Complete
+    // attempt to schedule issueEvent
+    // This probably won't happen, since
+    // we don't wait on cache invalidations
+    if (m_outstanding_wb == 0) {
+        for (auto it = kernelEndList.begin(); it != kernelEndList.end(); it++) {
+            newKernelEnds.push_back(it->first);
+        }
+        completeIssue();
+    }
+    trySendRetries();
+}
+
+/**
+ * Invalidate L1 cache (Acquire)
+ */
+void
+VIPERCoalescer::invL1()
+{
+    int size = m_dataCache_ptr->getNumBlocks();
+    DPRINTF(GPUCoalescer,
+            "There are %d Invalidations outstanding before Cache Walk\n",
+            m_outstanding_inv);
+    // Walk the cache
+    for (int i = 0; i < size; i++) {
+        Addr addr = m_dataCache_ptr->getAddressAtIdx(i);
+        // Evict Read-only data
+        std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>(
+            clockEdge(), addr, (uint8_t*) 0, 0, 0,
+            RubyRequestType_REPLACEMENT, RubyAccessMode_Supervisor,
+            nullptr);
+        assert(m_mandatory_q_ptr != NULL);
+        m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency);
+        m_outstanding_inv++;
+    }
+    DPRINTF(GPUCoalescer,
+            "There are %d Invalidations outstanding after Cache Walk\n",
+            m_outstanding_inv);
+}
+
+/**
+ * Writeback L1 cache (Release)
+ */
+void
+VIPERCoalescer::wbL1()
+{
+ int size = m_dataCache_ptr->getNumBlocks();
+ DPRINTF(GPUCoalescer,
+ "There are %d Writebacks outstanding before Cache Walk\n",
+ m_outstanding_wb);
+ // Walk the cache
+ for (int i = 0; i < size; i++) {
+ Addr addr = m_dataCache_ptr->getAddressAtIdx(i);
+ // Write dirty data back
+ std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>(
+ clockEdge(), addr, (uint8_t*) 0, 0, 0,
+ RubyRequestType_FLUSH, RubyAccessMode_Supervisor,
+ nullptr);
+ assert(m_mandatory_q_ptr != NULL);
+ m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency);
+ m_outstanding_wb++;
+ }
+ DPRINTF(GPUCoalescer,
+ "There are %d Writebacks outstanding after Cache Walk\n",
+ m_outstanding_wb);
+}
+
+/**
+ * Invalidate and Writeback L1 cache (Acquire&Release)
+ */
+void
+VIPERCoalescer::invwbL1()
+{
+    int size = m_dataCache_ptr->getNumBlocks();
+    // Walk the cache
+    for (int i = 0; i < size; i++) {
+        Addr addr = m_dataCache_ptr->getAddressAtIdx(i);
+        // Evict Read-only data
+        std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>(
+            clockEdge(), addr, (uint8_t*) 0, 0, 0,
+            RubyRequestType_REPLACEMENT, RubyAccessMode_Supervisor,
+            nullptr);
+        assert(m_mandatory_q_ptr != NULL);
+        m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency);
+        m_outstanding_inv++;
+    }
+    // Walk the cache
+    for (int i = 0; i < size; i++) {
+        Addr addr = m_dataCache_ptr->getAddressAtIdx(i);
+        // Write dirty data back
+        std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>(
+            clockEdge(), addr, (uint8_t*) 0, 0, 0,
+            RubyRequestType_FLUSH, RubyAccessMode_Supervisor,
+            nullptr);
+        assert(m_mandatory_q_ptr != NULL);
+        m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency);
+        m_outstanding_wb++;
+    }
+}
diff --git a/src/mem/ruby/system/VIPERCoalescer.hh b/src/mem/ruby/system/VIPERCoalescer.hh
new file mode 100644
index 000000000..af6e44e7f
--- /dev/null
+++ b/src/mem/ruby/system/VIPERCoalescer.hh
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#ifndef __MEM_RUBY_SYSTEM_VI_COALESCER_HH__
+#define __MEM_RUBY_SYSTEM_VI_COALESCER_HH__
+
+#include <iostream>
+
+#include "mem/protocol/PrefetchBit.hh"
+#include "mem/protocol/RubyAccessMode.hh"
+#include "mem/protocol/RubyRequestType.hh"
+#include "mem/ruby/common/Address.hh"
+#include "mem/ruby/common/Consumer.hh"
+#include "mem/ruby/system/GPUCoalescer.hh"
+#include "mem/ruby/system/RubyPort.hh"
+
+class DataBlock;
+class CacheMsg;
+class MachineID;
+class CacheMemory;
+
+class VIPERCoalescerParams;
+
+class VIPERCoalescer : public GPUCoalescer
+{
+ public:
+ typedef VIPERCoalescerParams Params;
+ VIPERCoalescer(const Params *);
+ ~VIPERCoalescer();
+ void wbCallback(Addr address);
+ void invCallback(Addr address);
+ RequestStatus makeRequest(PacketPtr pkt);
+ private:
+ void invL1();
+ void wbL1();
+ void invwbL1();
+ uint64_t m_outstanding_inv;
+ uint64_t m_outstanding_wb;
+ uint64_t m_max_inv_per_cycle;
+ uint64_t m_max_wb_per_cycle;
+};
+#endif // __MEM_RUBY_SYSTEM_VI_COALESCER_HH__
+
diff --git a/src/mem/ruby/system/VIPERCoalescer.py b/src/mem/ruby/system/VIPERCoalescer.py
new file mode 100644
index 000000000..05c74386f
--- /dev/null
+++ b/src/mem/ruby/system/VIPERCoalescer.py
@@ -0,0 +1,45 @@
+# Copyright (c) 2015 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Authors: Steve Reinhardt
+# Brad Beckmann
+
+from m5.params import *
+from m5.proxy import *
+from GPUCoalescer import *
+
+class VIPERCoalescer(RubyGPUCoalescer):
+ type = 'VIPERCoalescer'
+ cxx_class = 'VIPERCoalescer'
+ cxx_header = "mem/ruby/system/VIPERCoalescer.hh"
+ max_inv_per_cycle = Param.Int(32, "max invalidations per cycle")
+ max_wb_per_cycle = Param.Int(32, "max writebacks per cycle")
+ assume_rfo = False
diff --git a/src/mem/ruby/system/WeightedLRUPolicy.cc b/src/mem/ruby/system/WeightedLRUPolicy.cc
new file mode 100644
index 000000000..5baa4d9a5
--- /dev/null
+++ b/src/mem/ruby/system/WeightedLRUPolicy.cc
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Derek Hower
+ */
+
+#include "mem/ruby/system/WeightedLRUPolicy.hh"
+
+WeightedLRUPolicy::WeightedLRUPolicy(const Params* p)
+    : AbstractReplacementPolicy(p), m_cache(p->cache)
+{
+    m_last_occ_ptr = new int*[m_num_sets];
+    for (unsigned i = 0; i < m_num_sets; i++) {
+        m_last_occ_ptr[i] = new int[m_assoc];
+        for (unsigned j = 0; j < m_assoc; j++) {
+            m_last_occ_ptr[i][j] = 0;
+        }
+    }
+}
+
+WeightedLRUPolicy *
+WeightedLRUReplacementPolicyParams::create()
+{
+ return new WeightedLRUPolicy(this);
+}
+
+WeightedLRUPolicy::~WeightedLRUPolicy()
+{
+ if (m_last_occ_ptr != NULL){
+ for (unsigned i = 0; i < m_num_sets; i++){
+ if (m_last_occ_ptr[i] != NULL){
+ delete[] m_last_occ_ptr[i];
+ }
+ }
+ delete[] m_last_occ_ptr;
+ }
+}
+
+void
+WeightedLRUPolicy::touch(int64_t set, int64_t index, Tick time)
+{
+ assert(index >= 0 && index < m_assoc);
+ assert(set >= 0 && set < m_num_sets);
+
+ m_last_ref_ptr[set][index] = time;
+}
+
+void
+WeightedLRUPolicy::touch(int64_t set, int64_t index, Tick time, int occupancy)
+{
+ assert(index >= 0 && index < m_assoc);
+ assert(set >= 0 && set < m_num_sets);
+
+ m_last_ref_ptr[set][index] = time;
+ m_last_occ_ptr[set][index] = occupancy;
+}
+
+int64_t
+WeightedLRUPolicy::getVictim(int64_t set) const
+{
+    Tick time, smallest_time;
+    int64_t smallest_index;
+
+    smallest_index = 0;
+    smallest_time = m_last_ref_ptr[set][0];
+    int smallest_weight = m_last_occ_ptr[set][0]; // weight, not timestamp
+
+    for (unsigned i = 1; i < m_assoc; i++) {
+
+        int weight = m_last_occ_ptr[set][i];
+        if (weight < smallest_weight) {
+            smallest_weight = weight;
+            smallest_index = i;
+            smallest_time = m_last_ref_ptr[set][i];
+        } else if (weight == smallest_weight) {
+            time = m_last_ref_ptr[set][i];
+            if (time < smallest_time) {
+                smallest_index = i;
+                smallest_time = time;
+            }
+        }
+    }
+    return smallest_index;
+}
diff --git a/src/mem/ruby/system/WeightedLRUPolicy.hh b/src/mem/ruby/system/WeightedLRUPolicy.hh
new file mode 100644
index 000000000..3150779b2
--- /dev/null
+++ b/src/mem/ruby/system/WeightedLRUPolicy.hh
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#ifndef __MEM_RUBY_SYSTEM_WEIGHTEDLRUPOLICY_HH__
+#define __MEM_RUBY_SYSTEM_WEIGHTEDLRUPOLICY_HH__
+
+#include "mem/ruby/structures/AbstractReplacementPolicy.hh"
+#include "mem/ruby/structures/CacheMemory.hh"
+#include "params/WeightedLRUReplacementPolicy.hh"
+
+/* Simple true LRU replacement policy */
+
+class WeightedLRUPolicy : public AbstractReplacementPolicy
+{
+ public:
+ typedef WeightedLRUReplacementPolicyParams Params;
+ WeightedLRUPolicy(const Params* p);
+ ~WeightedLRUPolicy();
+
+ void touch(int64_t set, int64_t way, Tick time);
+ void touch(int64_t set, int64_t way, Tick time, int occupancy);
+ int64_t getVictim(int64_t set) const override;
+
+ bool useOccupancy() const { return true; }
+
+ CacheMemory * m_cache;
+ int **m_last_occ_ptr;
+};
+
+#endif // __MEM_RUBY_SYSTEM_WEIGHTEDLRUPOLICY_HH__
diff --git a/src/mem/ruby/system/WeightedLRUReplacementPolicy.py b/src/mem/ruby/system/WeightedLRUReplacementPolicy.py
new file mode 100644
index 000000000..e7de33496
--- /dev/null
+++ b/src/mem/ruby/system/WeightedLRUReplacementPolicy.py
@@ -0,0 +1,45 @@
+#
+# Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Author: Derek Hower
+#
+
+from m5.params import *
+from m5.proxy import *
+from MemObject import MemObject
+from ReplacementPolicy import ReplacementPolicy
+
+class WeightedLRUReplacementPolicy(ReplacementPolicy):
+ type = "WeightedLRUReplacementPolicy"
+ cxx_class = "WeightedLRUPolicy"
+ cxx_header = "mem/ruby/system/WeightedLRUPolicy.hh"
+ cache = Param.RubyCache("")
diff --git a/src/mem/slicc/symbols/StateMachine.py b/src/mem/slicc/symbols/StateMachine.py
index a530307ee..fc3f32c3d 100644
--- a/src/mem/slicc/symbols/StateMachine.py
+++ b/src/mem/slicc/symbols/StateMachine.py
@@ -35,13 +35,17 @@ import re
python_class_map = {
"int": "Int",
+ "NodeID": "Int",
"uint32_t" : "UInt32",
"std::string": "String",
"bool": "Bool",
"CacheMemory": "RubyCache",
"WireBuffer": "RubyWireBuffer",
"Sequencer": "RubySequencer",
+ "GPUCoalescer" : "RubyGPUCoalescer",
+ "VIPERCoalescer" : "VIPERCoalescer",
"DirectoryMemory": "RubyDirectoryMemory",
+ "PerfectCacheMemory": "RubyPerfectCacheMemory",
"MemoryControl": "MemoryControl",
"MessageBuffer": "MessageBuffer",
"DMASequencer": "DMASequencer",
@@ -305,7 +309,7 @@ class $c_ident : public AbstractController
void collateStats();
void recordCacheTrace(int cntrl, CacheRecorder* tr);
- Sequencer* getSequencer() const;
+ Sequencer* getCPUSequencer() const;
int functionalWriteBuffers(PacketPtr&);
@@ -527,8 +531,14 @@ $c_ident::$c_ident(const Params *p)
else:
code('m_${{param.ident}} = p->${{param.ident}};')
- if re.compile("sequencer").search(param.ident):
- code('m_${{param.ident}}_ptr->setController(this);')
+ if re.compile("sequencer").search(param.ident) or \
+ param.type_ast.type.c_ident == "GPUCoalescer" or \
+ param.type_ast.type.c_ident == "VIPERCoalescer":
+ code('''
+if (m_${{param.ident}}_ptr != NULL) {
+ m_${{param.ident}}_ptr->setController(this);
+}
+''')
code('''
@@ -670,6 +680,28 @@ $c_ident::init()
assert(param.pointer)
seq_ident = "m_%s_ptr" % param.ident
+ if seq_ident != "NULL":
+ code('''
+Sequencer*
+$c_ident::getCPUSequencer() const
+{
+ if (NULL != $seq_ident && $seq_ident->isCPUSequencer()) {
+ return $seq_ident;
+ } else {
+ return NULL;
+ }
+}
+''')
+ else:
+ code('''
+
+Sequencer*
+$c_ident::getCPUSequencer() const
+{
+ return NULL;
+}
+''')
+
code('''
void
@@ -796,12 +828,6 @@ $c_ident::getMemoryQueue() const
return $memq_ident;
}
-Sequencer*
-$c_ident::getSequencer() const
-{
- return $seq_ident;
-}
-
void
$c_ident::print(ostream& out) const
{
diff --git a/tests/SConscript b/tests/SConscript
index e9c9a6432..886b7fe59 100644
--- a/tests/SConscript
+++ b/tests/SConscript
@@ -348,20 +348,26 @@ if env['TARGET_ISA'] == 'arm':
'realview64-switcheroo-timing',
'realview64-switcheroo-o3',
'realview64-switcheroo-full']
-if env['TARGET_ISA'] == 'x86':
+if env['TARGET_ISA'] == 'x86' and not env['BUILD_GPU']:
configs += ['pc-simple-atomic',
'pc-simple-timing',
'pc-o3-timing',
'pc-switcheroo-full']
-configs += ['simple-atomic', 'simple-atomic-mp',
- 'simple-timing', 'simple-timing-mp',
- 'minor-timing', 'minor-timing-mp',
- 'o3-timing', 'o3-timing-mt', 'o3-timing-mp',
- 'rubytest', 'memtest', 'memtest-filter',
- 'tgen-simple-mem', 'tgen-dram-ctrl']
-
-configs += ['learning-gem5-p1-simple', 'learning-gem5-p1-two-level']
+if env['TARGET_ISA'] == 'x86' and env['BUILD_GPU'] and \
+ env['TARGET_GPU_ISA'] == 'hsail':
+ configs += ['gpu']
+ if env['PROTOCOL'] == 'GPU_RfO':
+ configs += ['gpu-randomtest']
+else:
+ configs += ['simple-atomic', 'simple-atomic-mp',
+ 'simple-timing', 'simple-timing-mp',
+ 'minor-timing', 'minor-timing-mp',
+ 'o3-timing', 'o3-timing-mt', 'o3-timing-mp',
+ 'rubytest', 'memtest', 'memtest-filter',
+ 'tgen-simple-mem', 'tgen-dram-ctrl']
+
+ configs += ['learning-gem5-p1-simple', 'learning-gem5-p1-two-level']
if env['PROTOCOL'] != 'None':
if env['PROTOCOL'] == 'MI_example':
diff --git a/tests/configs/gpu-randomtest-ruby.py b/tests/configs/gpu-randomtest-ruby.py
new file mode 100644
index 000000000..92e300394
--- /dev/null
+++ b/tests/configs/gpu-randomtest-ruby.py
@@ -0,0 +1,151 @@
+#
+# Copyright (c) 2010-2015 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Author: Brad Beckmann
+#
+
+import m5
+from m5.objects import *
+from m5.defines import buildEnv
+from m5.util import addToPath
+import os, optparse, sys
+
+# Get paths we might need. It's expected this file is in m5/configs/example.
+config_path = os.path.dirname(os.path.abspath(__file__))
+config_root = os.path.dirname(config_path)
+m5_root = os.path.dirname(config_root)
+addToPath(config_root+'/configs/common')
+addToPath(config_root+'/configs/ruby')
+addToPath(config_root+'/configs/topologies')
+
+import Ruby
+import Options
+
+parser = optparse.OptionParser()
+Options.addCommonOptions(parser)
+
+# add the gpu specific options expected by the gpu and gpu_RfO
+parser.add_option("-u", "--num-compute-units", type="int", default=8,
+ help="number of compute units in the GPU")
+parser.add_option("--numCPs", type="int", default=0,
+ help="Number of GPU Command Processors (CP)")
+parser.add_option("--simds-per-cu", type="int", default=4, help="SIMD units" \
+ "per CU")
+parser.add_option("--wf-size", type="int", default=64,
+ help="Wavefront size(in workitems)")
+parser.add_option("--wfs-per-simd", type="int", default=10, help="Number of " \
+ "WF slots per SIMD")
+
+# Add the ruby specific and protocol specific options
+Ruby.define_options(parser)
+
+(options, args) = parser.parse_args()
+
+#
+# Set the default cache size and associativity to be very small to encourage
+# races between requests and writebacks.
+#
+options.l1d_size="256B"
+options.l1i_size="256B"
+options.l2_size="512B"
+options.l3_size="1kB"
+options.l1d_assoc=2
+options.l1i_assoc=2
+options.l2_assoc=2
+options.l3_assoc=2
+options.num_compute_units=8
+options.num_sqc=2
+
+# Check for the GPU_RfO protocol. Other GPU protocols are non-SC and will
+# not work with the Ruby random tester.
+assert(buildEnv['PROTOCOL'] == 'GPU_RfO')
+
+#
+# create the tester and system, including ruby
+#
+tester = RubyTester(check_flush = False, checks_to_complete = 100,
+ wakeup_frequency = 10, num_cpus = options.num_cpus)
+
+# We set the testers as cpu for ruby to find the correct clock domains
+# for the L1 Objects.
+system = System(cpu = tester)
+
+# Dummy voltage domain for all our clock domains
+system.voltage_domain = VoltageDomain(voltage = options.sys_voltage)
+system.clk_domain = SrcClockDomain(clock = '1GHz',
+ voltage_domain = system.voltage_domain)
+
+system.mem_ranges = AddrRange('256MB')
+
+Ruby.create_system(options, False, system)
+
+# Create a separate clock domain for Ruby
+system.ruby.clk_domain = SrcClockDomain(clock = '1GHz',
+ voltage_domain = system.voltage_domain)
+
+tester.num_cpus = len(system.ruby._cpu_ports)
+
+#
+# The tester is most effective when randomization is turned on and
+# artificial delay is randomly inserted on messages
+#
+system.ruby.randomization = True
+
+for ruby_port in system.ruby._cpu_ports:
+ #
+ # Tie the ruby tester ports to the ruby cpu read and write ports
+ #
+ if ruby_port.support_data_reqs and ruby_port.support_inst_reqs:
+ tester.cpuInstDataPort = ruby_port.slave
+ elif ruby_port.support_data_reqs:
+ tester.cpuDataPort = ruby_port.slave
+ elif ruby_port.support_inst_reqs:
+ tester.cpuInstPort = ruby_port.slave
+
+ # Do not automatically retry stalled Ruby requests
+ ruby_port.no_retry_on_stall = True
+
+ #
+ # Tell the sequencer this is the ruby tester so that it
+ # copies the subblock back to the checker
+ #
+ ruby_port.using_ruby_tester = True
+
+# -----------------------
+# run simulation
+# -----------------------
+
+root = Root(full_system = False, system = system )
+root.system.mem_mode = 'timing'
+
+# Not much point in this being higher than the L1 latency
+m5.ticks.setGlobalFrequency('1ns')
diff --git a/tests/configs/gpu-ruby.py b/tests/configs/gpu-ruby.py
new file mode 100644
index 000000000..632b4dec0
--- /dev/null
+++ b/tests/configs/gpu-ruby.py
@@ -0,0 +1,353 @@
+#
+# Copyright (c) 2015 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Author: Brad Beckmann
+#
+
+import m5
+from m5.objects import *
+from m5.defines import buildEnv
+from m5.util import addToPath
+import os, optparse, sys, math, glob
+
+# Get paths we might need
+config_path = os.path.dirname(os.path.abspath(__file__))
+config_root = os.path.dirname(config_path)
+addToPath(config_root+'/configs/common')
+addToPath(config_root+'/configs/ruby')
+addToPath(config_root+'/configs/topologies')
+
+import Ruby
+import Options
+import GPUTLBOptions, GPUTLBConfig
+
+########################## Script Options ########################
+def setOption(parser, opt_str, value = 1):
+ # check to make sure the option actually exists
+ if not parser.has_option(opt_str):
+ raise Exception("cannot find %s in list of possible options" % opt_str)
+
+ opt = parser.get_option(opt_str)
+ # set the value
+ exec("parser.values.%s = %s" % (opt.dest, value))
+
+def getOption(parser, opt_str):
+ # check to make sure the option actually exists
+ if not parser.has_option(opt_str):
+ raise Exception("cannot find %s in list of possible options" % opt_str)
+
+ opt = parser.get_option(opt_str)
+ # get the value
+ exec("return_value = parser.values.%s" % opt.dest)
+ return return_value
+
+def run_test(root):
+ """gpu test requires a specialized run_test implementation to set up the
+ mmio space."""
+
+ # instantiate configuration
+ m5.instantiate()
+
+ # Now that the system has been constructed, setup the mmio space
+ root.system.cpu[0].workload[0].map(0x10000000, 0x200000000, 4096)
+
+ # simulate until program terminates
+ exit_event = m5.simulate(maxtick)
+ print 'Exiting @ tick', m5.curTick(), 'because', exit_event.getCause()
+
+parser = optparse.OptionParser()
+Options.addCommonOptions(parser)
+Options.addSEOptions(parser)
+
+parser.add_option("-k", "--kernel-files",
+ help="file(s) containing GPU kernel code (colon separated)")
+parser.add_option("-u", "--num-compute-units", type="int", default=2,
+ help="number of GPU compute units"),
+parser.add_option("--numCPs", type="int", default=0,
+ help="Number of GPU Command Processors (CP)")
+parser.add_option("--simds-per-cu", type="int", default=4, help="SIMD units" \
+ "per CU")
+parser.add_option("--cu-per-sqc", type="int", default=4, help="number of CUs" \
+ "sharing an SQC (icache, and thus icache TLB)")
+parser.add_option("--wf-size", type="int", default=64,
+ help="Wavefront size(in workitems)")
+parser.add_option("--wfs-per-simd", type="int", default=8, help="Number of " \
+ "WF slots per SIMD")
+parser.add_option("--sp-bypass-path-length", type="int", default=4, \
+ help="Number of stages of bypass path in vector ALU for Single "\
+ "Precision ops")
+parser.add_option("--dp-bypass-path-length", type="int", default=4, \
+ help="Number of stages of bypass path in vector ALU for Double "\
+ "Precision ops")
+parser.add_option("--issue-period", type="int", default=4, \
+ help="Number of cycles per vector instruction issue period")
+parser.add_option("--glbmem-wr-bus-width", type="int", default=32, \
+ help="VGPR to Coalescer (Global Memory) data bus width in bytes")
+parser.add_option("--glbmem-rd-bus-width", type="int", default=32, \
+ help="Coalescer to VGPR (Global Memory) data bus width in bytes")
+parser.add_option("--shr-mem-pipes-per-cu", type="int", default=1, \
+ help="Number of Shared Memory pipelines per CU")
+parser.add_option("--glb-mem-pipes-per-cu", type="int", default=1, \
+ help="Number of Global Memory pipelines per CU")
+parser.add_option("--vreg-file-size", type="int", default=2048,
+ help="number of physical vector registers per SIMD")
+parser.add_option("--bw-scalor", type="int", default=0,
+ help="bandwidth scalor for scalability analysis")
+parser.add_option("--CPUClock", type="string", default="2GHz",
+ help="CPU clock")
+parser.add_option("--GPUClock", type="string", default="1GHz",
+ help="GPU clock")
+parser.add_option("--cpu-voltage", action="store", type="string",
+ default='1.0V',
+ help = """CPU voltage domain""")
+parser.add_option("--gpu-voltage", action="store", type="string",
+ default='1.0V',
+ help = """CPU voltage domain""")
+parser.add_option("--CUExecPolicy", type="string", default="OLDEST-FIRST",
+ help="WF exec policy (OLDEST-FIRST, ROUND-ROBIN)")
+parser.add_option("--xact-cas-mode", action="store_true",
+ help="enable load_compare mode (transactional CAS)")
+parser.add_option("--SegFaultDebug",action="store_true",
+ help="checks for GPU seg fault before TLB access")
+parser.add_option("--LocalMemBarrier",action="store_true",
+ help="Barrier does not wait for writethroughs to complete")
+parser.add_option("--countPages", action="store_true",
+ help="Count Page Accesses and output in per-CU output files")
+parser.add_option("--TLB-prefetch", type="int", help = "prefetch depth for"\
+ "TLBs")
+parser.add_option("--pf-type", type="string", help="type of prefetch: "\
+ "PF_CU, PF_WF, PF_PHASE, PF_STRIDE")
+parser.add_option("--pf-stride", type="int", help="set prefetch stride")
+parser.add_option("--numLdsBanks", type="int", default=32,
+ help="number of physical banks per LDS module")
+parser.add_option("--ldsBankConflictPenalty", type="int", default=1,
+ help="number of cycles per LDS bank conflict")
+
+# Add the ruby specific and protocol specific options
+Ruby.define_options(parser)
+
+GPUTLBOptions.tlb_options(parser)
+
+(options, args) = parser.parse_args()
+
+# The GPU cache coherence protocols only work with the backing store
+setOption(parser, "--access-backing-store")
+
+# Currently, the sqc (I-Cache of GPU) is shared by
+# multiple compute units(CUs). The protocol works just fine
+# even if sqc is not shared. Overriding this option here
+# so that the user need not explicitly set this (assuming
+# sharing sqc is the common usage)
+n_cu = options.num_compute_units
+num_sqc = int(math.ceil(float(n_cu) / options.cu_per_sqc))
+options.num_sqc = num_sqc # pass this to Ruby
+
+########################## Creating the GPU system ########################
+# shader is the GPU
+shader = Shader(n_wf = options.wfs_per_simd,
+ clk_domain = SrcClockDomain(
+ clock = options.GPUClock,
+ voltage_domain = VoltageDomain(
+ voltage = options.gpu_voltage)),
+ timing = True)
+
+# GPU_RfO(Read For Ownership) implements SC/TSO memory model.
+# Other GPU protocols implement release consistency at GPU side.
+# So, all GPU protocols other than GPU_RfO should make their writes
+# visible to the global memory and should read from global memory
+# during kernel boundary. The pipeline initiates (or does not initiate)
+# the acquire/release operation depending on this impl_kern_boundary_sync
+# flag. This flag=true means the pipeline initiates an acquire/release operation
+# at kernel boundary.
+if buildEnv['PROTOCOL'] == 'GPU_RfO':
+ shader.impl_kern_boundary_sync = False
+else:
+ shader.impl_kern_boundary_sync = True
+
+# Switching off per-lane TLB by default
+per_lane = False
+if options.TLB_config == "perLane":
+ per_lane = True
+
+# List of compute units; one GPU can have multiple compute units
+compute_units = []
+for i in xrange(n_cu):
+ compute_units.append(ComputeUnit(cu_id = i, perLaneTLB = per_lane,
+ num_SIMDs = options.simds_per_cu,
+ wfSize = options.wf_size,
+ spbypass_pipe_length = \
+ options.sp_bypass_path_length,
+ dpbypass_pipe_length = \
+ options.dp_bypass_path_length,
+ issue_period = options.issue_period,
+ coalescer_to_vrf_bus_width = \
+ options.glbmem_rd_bus_width,
+ vrf_to_coalescer_bus_width = \
+ options.glbmem_wr_bus_width,
+ num_global_mem_pipes = \
+ options.glb_mem_pipes_per_cu,
+ num_shared_mem_pipes = \
+ options.shr_mem_pipes_per_cu,
+ n_wf = options.wfs_per_simd,
+ execPolicy = options.CUExecPolicy,
+ xactCasMode = options.xact_cas_mode,
+ debugSegFault = options.SegFaultDebug,
+ functionalTLB = True,
+ localMemBarrier = options.LocalMemBarrier,
+ countPages = options.countPages,
+ localDataStore = \
+ LdsState(banks = options.numLdsBanks,
+ bankConflictPenalty = \
+ options.ldsBankConflictPenalty)))
+ wavefronts = []
+ vrfs = []
+ for j in xrange(options.simds_per_cu):
+ for k in xrange(shader.n_wf):
+ wavefronts.append(Wavefront(simdId = j, wf_slot_id = k))
+ vrfs.append(VectorRegisterFile(simd_id=j,
+ num_regs_per_simd=options.vreg_file_size))
+ compute_units[-1].wavefronts = wavefronts
+ compute_units[-1].vector_register_file = vrfs
+ if options.TLB_prefetch:
+ compute_units[-1].prefetch_depth = options.TLB_prefetch
+ compute_units[-1].prefetch_prev_type = options.pf_type
+
+ # attach the LDS and the CU to the bus (actually a Bridge)
+ compute_units[-1].ldsPort = compute_units[-1].ldsBus.slave
+ compute_units[-1].ldsBus.master = compute_units[-1].localDataStore.cuPort
+
+# Attach compute units to GPU
+shader.CUs = compute_units
+
+# this is a uniprocessor only test, thus the shader is the second index in the
+# list of "system.cpus"
+options.num_cpus = 1
+shader_idx = 1
+cpu = TimingSimpleCPU(cpu_id=0)
+
+########################## Creating the GPU dispatcher ########################
+# Dispatcher dispatches work from host CPU to GPU
+host_cpu = cpu
+dispatcher = GpuDispatcher()
+
+# Currently does not test for command processors
+cpu_list = [cpu] + [shader] + [dispatcher]
+
+system = System(cpu = cpu_list,
+ mem_ranges = [AddrRange(options.mem_size)],
+ mem_mode = 'timing')
+
+# Dummy voltage domain for all our clock domains
+system.voltage_domain = VoltageDomain(voltage = options.sys_voltage)
+system.clk_domain = SrcClockDomain(clock = '1GHz',
+ voltage_domain = system.voltage_domain)
+
+# Create a separate clock domain for components that should run at
+# the CPU's frequency
+system.cpu[0].clk_domain = SrcClockDomain(clock = '2GHz',
+ voltage_domain = \
+ system.voltage_domain)
+
+# configure the TLB hierarchy
+GPUTLBConfig.config_tlb_hierarchy(options, system, shader_idx)
+
+# create Ruby system
+system.piobus = IOXBar(width=32, response_latency=0,
+ frontend_latency=0, forward_latency=0)
+Ruby.create_system(options, None, system)
+
+# Create a separate clock for Ruby
+system.ruby.clk_domain = SrcClockDomain(clock = options.ruby_clock,
+ voltage_domain = system.voltage_domain)
+
+# create the interrupt controller
+cpu.createInterruptController()
+
+#
+# Tie the cpu cache ports to the ruby cpu ports and
+# physmem, respectively
+#
+cpu.connectAllPorts(system.ruby._cpu_ports[0])
+system.ruby._cpu_ports[0].mem_master_port = system.piobus.slave
+
+# attach CU ports to Ruby
+# Because of the peculiarities of the CP core, you may have 1 CPU but 2
+# sequencers and thus 2 _cpu_ports created. Your GPUs shouldn't be
+# hooked up until after the CP. To make this script generic, figure out
+# the index as below, but note that this assumes there is one sequencer
+# per compute unit and one sequencer per SQC for the math to work out
+# correctly.
+gpu_port_idx = len(system.ruby._cpu_ports) \
+ - options.num_compute_units - options.num_sqc
+gpu_port_idx = gpu_port_idx - options.numCPs * 2
+
+wavefront_size = options.wf_size
+for i in xrange(n_cu):
+ # The pipeline issues wavefront_size number of uncoalesced requests
+ # in one GPU issue cycle. Hence wavefront_size mem ports.
+ for j in xrange(wavefront_size):
+ system.cpu[shader_idx].CUs[i].memory_port[j] = \
+ system.ruby._cpu_ports[gpu_port_idx].slave[j]
+ gpu_port_idx += 1
+
+for i in xrange(n_cu):
+ if i > 0 and not i % options.cu_per_sqc:
+ gpu_port_idx += 1
+ system.cpu[shader_idx].CUs[i].sqc_port = \
+ system.ruby._cpu_ports[gpu_port_idx].slave
+gpu_port_idx = gpu_port_idx + 1
+
+assert(options.numCPs == 0)
+
+# connect dispatcher to the system.piobus
+dispatcher.pio = system.piobus.master
+dispatcher.dma = system.piobus.slave
+
+################# Connect the CPU and GPU via GPU Dispatcher ###################
+# CPU rings the GPU doorbell to notify a pending task
+# using this interface.
+# And GPU uses this interface to notify the CPU of task completion
+# The communication happens through the emulated driver.
+
+# Note this implicit setting of the cpu_pointer, shader_pointer and tlb array
+# parameters must be after the explicit setting of the System cpu list
+shader.cpu_pointer = host_cpu
+dispatcher.cpu = host_cpu
+dispatcher.shader_pointer = shader
+
+# -----------------------
+# run simulation
+# -----------------------
+
+root = Root(full_system = False, system = system)
+m5.ticks.setGlobalFrequency('1THz')
+root.system.mem_mode = 'timing'
diff --git a/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_RfO/config.ini b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_RfO/config.ini
new file mode 100644
index 000000000..5486af826
--- /dev/null
+++ b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_RfO/config.ini
@@ -0,0 +1,4423 @@
+[root]
+type=Root
+children=system
+eventq_index=0
+full_system=false
+sim_quantum=0
+time_sync_enable=false
+time_sync_period=100000000000
+time_sync_spin_threshold=100000000
+
+[system]
+type=System
+children=clk_domain cp_cntrl0 cpu0 cpu1 cpu2 dir_cntrl0 dispatcher_coalescer dispatcher_tlb dvfs_handler l1_coalescer0 l1_coalescer1 l1_tlb0 l1_tlb1 l2_coalescer l2_tlb l3_coalescer l3_tlb mem_ctrls piobus ruby sqc_cntrl0 sqc_coalescer sqc_tlb sys_port_proxy tcc_cntrl0 tccdir_cntrl0 tcp_cntrl0 tcp_cntrl1 voltage_domain
+boot_osflags=a
+cache_line_size=64
+clk_domain=system.clk_domain
+eventq_index=0
+exit_on_work_items=false
+init_param=0
+kernel=
+kernel_addr_check=true
+load_addr_mask=1099511627775
+load_offset=0
+mem_mode=timing
+mem_ranges=0:536870911
+memories=system.mem_ctrls system.ruby.phys_mem
+mmap_using_noreserve=false
+multi_thread=false
+num_work_ids=16
+readfile=
+symbolfile=
+work_begin_ckpt_count=0
+work_begin_cpu_id_exit=-1
+work_begin_exit_count=0
+work_cpus_ckpt_count=0
+work_end_ckpt_count=0
+work_end_exit_count=0
+work_item_id=-1
+system_port=system.sys_port_proxy.slave[0]
+
+[system.clk_domain]
+type=SrcClockDomain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.voltage_domain
+
+[system.cp_cntrl0]
+type=CorePair_Controller
+children=L1D0cache L1D1cache L1Icache L2cache mandatoryQueue probeToCore requestFromCore responseFromCore responseToCore sequencer sequencer1 triggerQueue unblockFromCore
+L1D0cache=system.cp_cntrl0.L1D0cache
+L1D1cache=system.cp_cntrl0.L1D1cache
+L1Icache=system.cp_cntrl0.L1Icache
+L2cache=system.cp_cntrl0.L2cache
+buffer_size=0
+clk_domain=system.clk_domain
+cluster_id=0
+eventq_index=0
+issue_latency=15
+l2_hit_latency=18
+mandatoryQueue=system.cp_cntrl0.mandatoryQueue
+number_of_TBEs=256
+probeToCore=system.cp_cntrl0.probeToCore
+recycle_latency=10
+requestFromCore=system.cp_cntrl0.requestFromCore
+responseFromCore=system.cp_cntrl0.responseFromCore
+responseToCore=system.cp_cntrl0.responseToCore
+ruby_system=system.ruby
+send_evictions=true
+sequencer=system.cp_cntrl0.sequencer
+sequencer1=system.cp_cntrl0.sequencer1
+system=system
+transitions_per_cycle=32
+triggerQueue=system.cp_cntrl0.triggerQueue
+unblockFromCore=system.cp_cntrl0.unblockFromCore
+version=0
+
+[system.cp_cntrl0.L1D0cache]
+type=RubyCache
+children=replacement_policy
+assoc=2
+block_size=0
+dataAccessLatency=1
+dataArrayBanks=1
+eventq_index=0
+is_icache=false
+replacement_policy=system.cp_cntrl0.L1D0cache.replacement_policy
+resourceStalls=false
+ruby_system=system.ruby
+size=65536
+start_index_bit=6
+tagAccessLatency=1
+tagArrayBanks=1
+
+[system.cp_cntrl0.L1D0cache.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=2
+block_size=64
+eventq_index=0
+size=65536
+
+[system.cp_cntrl0.L1D1cache]
+type=RubyCache
+children=replacement_policy
+assoc=2
+block_size=0
+dataAccessLatency=1
+dataArrayBanks=1
+eventq_index=0
+is_icache=false
+replacement_policy=system.cp_cntrl0.L1D1cache.replacement_policy
+resourceStalls=false
+ruby_system=system.ruby
+size=65536
+start_index_bit=6
+tagAccessLatency=1
+tagArrayBanks=1
+
+[system.cp_cntrl0.L1D1cache.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=2
+block_size=64
+eventq_index=0
+size=65536
+
+[system.cp_cntrl0.L1Icache]
+type=RubyCache
+children=replacement_policy
+assoc=2
+block_size=0
+dataAccessLatency=1
+dataArrayBanks=1
+eventq_index=0
+is_icache=false
+replacement_policy=system.cp_cntrl0.L1Icache.replacement_policy
+resourceStalls=false
+ruby_system=system.ruby
+size=32768
+start_index_bit=6
+tagAccessLatency=1
+tagArrayBanks=1
+
+[system.cp_cntrl0.L1Icache.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=2
+block_size=64
+eventq_index=0
+size=32768
+
+[system.cp_cntrl0.L2cache]
+type=RubyCache
+children=replacement_policy
+assoc=8
+block_size=0
+dataAccessLatency=1
+dataArrayBanks=1
+eventq_index=0
+is_icache=false
+replacement_policy=system.cp_cntrl0.L2cache.replacement_policy
+resourceStalls=false
+ruby_system=system.ruby
+size=2097152
+start_index_bit=6
+tagAccessLatency=1
+tagArrayBanks=1
+
+[system.cp_cntrl0.L2cache.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=8
+block_size=64
+eventq_index=0
+size=2097152
+
+[system.cp_cntrl0.mandatoryQueue]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+
+[system.cp_cntrl0.probeToCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+slave=system.ruby.network.master[3]
+
+[system.cp_cntrl0.requestFromCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+master=system.ruby.network.slave[2]
+
+[system.cp_cntrl0.responseFromCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+master=system.ruby.network.slave[3]
+
+[system.cp_cntrl0.responseToCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+slave=system.ruby.network.master[4]
+
+[system.cp_cntrl0.sequencer]
+type=RubySequencer
+clk_domain=system.clk_domain
+coreid=0
+dcache=system.cp_cntrl0.L1D0cache
+dcache_hit_latency=2
+deadlock_threshold=500000
+eventq_index=0
+icache=system.cp_cntrl0.L1Icache
+icache_hit_latency=2
+is_cpu_sequencer=true
+max_outstanding_requests=16
+no_retry_on_stall=false
+ruby_system=system.ruby
+support_data_reqs=true
+support_inst_reqs=true
+system=system
+using_network_tester=false
+using_ruby_tester=false
+version=0
+master=system.cpu0.interrupts.pio system.cpu0.interrupts.int_slave
+mem_master_port=system.piobus.slave[0]
+slave=system.cpu0.icache_port system.cpu0.dcache_port system.cpu0.itb.walker.port system.cpu0.dtb.walker.port system.cpu0.interrupts.int_master
+
+[system.cp_cntrl0.sequencer1]
+type=RubySequencer
+clk_domain=system.clk_domain
+coreid=1
+dcache=system.cp_cntrl0.L1D1cache
+dcache_hit_latency=2
+deadlock_threshold=500000
+eventq_index=0
+icache=system.cp_cntrl0.L1Icache
+icache_hit_latency=2
+is_cpu_sequencer=true
+max_outstanding_requests=16
+no_retry_on_stall=false
+ruby_system=system.ruby
+support_data_reqs=true
+support_inst_reqs=true
+system=system
+using_network_tester=false
+using_ruby_tester=false
+version=1
+
+[system.cp_cntrl0.triggerQueue]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.cp_cntrl0.unblockFromCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+master=system.ruby.network.slave[4]
+
+[system.cpu0]
+type=TimingSimpleCPU
+children=apic_clk_domain clk_domain dtb interrupts isa itb tracer workload
+branchPred=Null
+checker=Null
+clk_domain=system.cpu0.clk_domain
+cpu_id=0
+do_checkpoint_insts=true
+do_quiesce=true
+do_statistics_insts=true
+dtb=system.cpu0.dtb
+eventq_index=0
+function_trace=false
+function_trace_start=0
+interrupts=system.cpu0.interrupts
+isa=system.cpu0.isa
+itb=system.cpu0.itb
+max_insts_all_threads=0
+max_insts_any_thread=0
+max_loads_all_threads=0
+max_loads_any_thread=0
+numThreads=1
+profile=0
+progress_interval=0
+simpoint_start_insts=
+socket_id=0
+switched_out=false
+system=system
+tracer=system.cpu0.tracer
+workload=system.cpu0.workload
+dcache_port=system.cp_cntrl0.sequencer.slave[1]
+icache_port=system.cp_cntrl0.sequencer.slave[0]
+
+[system.cpu0.apic_clk_domain]
+type=DerivedClockDomain
+clk_divider=16
+clk_domain=system.cpu0.clk_domain
+eventq_index=0
+
+[system.cpu0.clk_domain]
+type=SrcClockDomain
+clock=500
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.voltage_domain
+
+[system.cpu0.dtb]
+type=X86TLB
+children=walker
+eventq_index=0
+size=64
+walker=system.cpu0.dtb.walker
+
+[system.cpu0.dtb.walker]
+type=X86PagetableWalker
+clk_domain=system.cpu0.clk_domain
+eventq_index=0
+num_squash_per_cycle=4
+system=system
+port=system.cp_cntrl0.sequencer.slave[3]
+
+[system.cpu0.interrupts]
+type=X86LocalApic
+clk_domain=system.cpu0.apic_clk_domain
+eventq_index=0
+int_latency=1000
+pio_addr=2305843009213693952
+pio_latency=100000
+system=system
+int_master=system.cp_cntrl0.sequencer.slave[4]
+int_slave=system.cp_cntrl0.sequencer.master[1]
+pio=system.cp_cntrl0.sequencer.master[0]
+
+[system.cpu0.isa]
+type=X86ISA
+eventq_index=0
+
+[system.cpu0.itb]
+type=X86TLB
+children=walker
+eventq_index=0
+size=64
+walker=system.cpu0.itb.walker
+
+[system.cpu0.itb.walker]
+type=X86PagetableWalker
+clk_domain=system.cpu0.clk_domain
+eventq_index=0
+num_squash_per_cycle=4
+system=system
+port=system.cp_cntrl0.sequencer.slave[2]
+
+[system.cpu0.tracer]
+type=ExeTracer
+eventq_index=0
+
+[system.cpu0.workload]
+type=LiveProcess
+cmd=gpu-hello
+cwd=
+drivers=system.cpu2.cl_driver
+egid=100
+env=
+errout=cerr
+euid=100
+eventq_index=0
+executable=/dist/m5/regression/test-progs/gpu-hello/bin/x86/linux/gpu-hello
+gid=100
+input=cin
+kvmInSE=false
+max_stack_size=67108864
+output=cout
+pid=100
+ppid=99
+simpoint=0
+system=system
+uid=100
+useArchPT=false
+
+[system.cpu1]
+type=Shader
+children=CUs0 CUs1 clk_domain
+CUs=system.cpu1.CUs0 system.cpu1.CUs1
+clk_domain=system.cpu1.clk_domain
+cpu_pointer=system.cpu0
+eventq_index=0
+globalmem=65536
+impl_kern_boundary_sync=false
+n_wf=8
+separate_acquire_release=false
+timing=true
+translation=false
+
+[system.cpu1.CUs0]
+type=ComputeUnit
+children=ldsBus localDataStore vector_register_file0 vector_register_file1 vector_register_file2 vector_register_file3 wavefronts00 wavefronts01 wavefronts02 wavefronts03 wavefronts04 wavefronts05 wavefronts06 wavefronts07 wavefronts08 wavefronts09 wavefronts10 wavefronts11 wavefronts12 wavefronts13 wavefronts14 wavefronts15 wavefronts16 wavefronts17 wavefronts18 wavefronts19 wavefronts20 wavefronts21 wavefronts22 wavefronts23 wavefronts24 wavefronts25 wavefronts26 wavefronts27 wavefronts28 wavefronts29 wavefronts30 wavefronts31
+clk_domain=system.cpu1.clk_domain
+coalescer_to_vrf_bus_width=32
+countPages=false
+cu_id=0
+debugSegFault=false
+dpbypass_pipe_length=4
+eventq_index=0
+execPolicy=OLDEST-FIRST
+functionalTLB=true
+global_mem_queue_size=256
+issue_period=4
+localDataStore=system.cpu1.CUs0.localDataStore
+localMemBarrier=false
+local_mem_queue_size=256
+mem_req_latency=9
+mem_resp_latency=9
+n_wf=8
+num_SIMDs=4
+num_global_mem_pipes=1
+num_shared_mem_pipes=1
+perLaneTLB=false
+prefetch_depth=0
+prefetch_prev_type=PF_PHASE
+prefetch_stride=1
+spbypass_pipe_length=4
+system=system
+vector_register_file=system.cpu1.CUs0.vector_register_file0 system.cpu1.CUs0.vector_register_file1 system.cpu1.CUs0.vector_register_file2 system.cpu1.CUs0.vector_register_file3
+vrf_to_coalescer_bus_width=32
+wavefronts=system.cpu1.CUs0.wavefronts00 system.cpu1.CUs0.wavefronts01 system.cpu1.CUs0.wavefronts02 system.cpu1.CUs0.wavefronts03 system.cpu1.CUs0.wavefronts04 system.cpu1.CUs0.wavefronts05 system.cpu1.CUs0.wavefronts06 system.cpu1.CUs0.wavefronts07 system.cpu1.CUs0.wavefronts08 system.cpu1.CUs0.wavefronts09 system.cpu1.CUs0.wavefronts10 system.cpu1.CUs0.wavefronts11 system.cpu1.CUs0.wavefronts12 system.cpu1.CUs0.wavefronts13 system.cpu1.CUs0.wavefronts14 system.cpu1.CUs0.wavefronts15 system.cpu1.CUs0.wavefronts16 system.cpu1.CUs0.wavefronts17 system.cpu1.CUs0.wavefronts18 system.cpu1.CUs0.wavefronts19 system.cpu1.CUs0.wavefronts20 system.cpu1.CUs0.wavefronts21 system.cpu1.CUs0.wavefronts22 system.cpu1.CUs0.wavefronts23 system.cpu1.CUs0.wavefronts24 system.cpu1.CUs0.wavefronts25 system.cpu1.CUs0.wavefronts26 system.cpu1.CUs0.wavefronts27 system.cpu1.CUs0.wavefronts28 system.cpu1.CUs0.wavefronts29 system.cpu1.CUs0.wavefronts30 system.cpu1.CUs0.wavefronts31
+wfSize=64
+xactCasMode=false
+ldsPort=system.cpu1.CUs0.ldsBus.slave
+memory_port=system.tcp_cntrl0.coalescer.slave[0] system.tcp_cntrl0.coalescer.slave[1] system.tcp_cntrl0.coalescer.slave[2] system.tcp_cntrl0.coalescer.slave[3] system.tcp_cntrl0.coalescer.slave[4] system.tcp_cntrl0.coalescer.slave[5] system.tcp_cntrl0.coalescer.slave[6] system.tcp_cntrl0.coalescer.slave[7] system.tcp_cntrl0.coalescer.slave[8] system.tcp_cntrl0.coalescer.slave[9] system.tcp_cntrl0.coalescer.slave[10] system.tcp_cntrl0.coalescer.slave[11] system.tcp_cntrl0.coalescer.slave[12] system.tcp_cntrl0.coalescer.slave[13] system.tcp_cntrl0.coalescer.slave[14] system.tcp_cntrl0.coalescer.slave[15] system.tcp_cntrl0.coalescer.slave[16] system.tcp_cntrl0.coalescer.slave[17] system.tcp_cntrl0.coalescer.slave[18] system.tcp_cntrl0.coalescer.slave[19] system.tcp_cntrl0.coalescer.slave[20] system.tcp_cntrl0.coalescer.slave[21] system.tcp_cntrl0.coalescer.slave[22] system.tcp_cntrl0.coalescer.slave[23] system.tcp_cntrl0.coalescer.slave[24] system.tcp_cntrl0.coalescer.slave[25] system.tcp_cntrl0.coalescer.slave[26] system.tcp_cntrl0.coalescer.slave[27] system.tcp_cntrl0.coalescer.slave[28] system.tcp_cntrl0.coalescer.slave[29] system.tcp_cntrl0.coalescer.slave[30] system.tcp_cntrl0.coalescer.slave[31] system.tcp_cntrl0.coalescer.slave[32] system.tcp_cntrl0.coalescer.slave[33] system.tcp_cntrl0.coalescer.slave[34] system.tcp_cntrl0.coalescer.slave[35] system.tcp_cntrl0.coalescer.slave[36] system.tcp_cntrl0.coalescer.slave[37] system.tcp_cntrl0.coalescer.slave[38] system.tcp_cntrl0.coalescer.slave[39] system.tcp_cntrl0.coalescer.slave[40] system.tcp_cntrl0.coalescer.slave[41] system.tcp_cntrl0.coalescer.slave[42] system.tcp_cntrl0.coalescer.slave[43] system.tcp_cntrl0.coalescer.slave[44] system.tcp_cntrl0.coalescer.slave[45] system.tcp_cntrl0.coalescer.slave[46] system.tcp_cntrl0.coalescer.slave[47] system.tcp_cntrl0.coalescer.slave[48] system.tcp_cntrl0.coalescer.slave[49] system.tcp_cntrl0.coalescer.slave[50] system.tcp_cntrl0.coalescer.slave[51] 
system.tcp_cntrl0.coalescer.slave[52] system.tcp_cntrl0.coalescer.slave[53] system.tcp_cntrl0.coalescer.slave[54] system.tcp_cntrl0.coalescer.slave[55] system.tcp_cntrl0.coalescer.slave[56] system.tcp_cntrl0.coalescer.slave[57] system.tcp_cntrl0.coalescer.slave[58] system.tcp_cntrl0.coalescer.slave[59] system.tcp_cntrl0.coalescer.slave[60] system.tcp_cntrl0.coalescer.slave[61] system.tcp_cntrl0.coalescer.slave[62] system.tcp_cntrl0.coalescer.slave[63]
+sqc_port=system.sqc_cntrl0.sequencer.slave[0]
+sqc_tlb_port=system.sqc_coalescer.slave[0]
+translation_port=system.l1_coalescer0.slave[0]
+
+[system.cpu1.CUs0.ldsBus]
+type=Bridge
+clk_domain=system.cpu1.clk_domain
+delay=0
+eventq_index=0
+ranges=0:18446744073709551615
+req_size=16
+resp_size=16
+master=system.cpu1.CUs0.localDataStore.cuPort
+slave=system.cpu1.CUs0.ldsPort
+
+[system.cpu1.CUs0.localDataStore]
+type=LdsState
+bankConflictPenalty=1
+banks=32
+clk_domain=system.cpu1.clk_domain
+eventq_index=0
+range=0:65535
+size=65536
+cuPort=system.cpu1.CUs0.ldsBus.master
+
+[system.cpu1.CUs0.vector_register_file0]
+type=VectorRegisterFile
+eventq_index=0
+min_alloc=4
+num_regs_per_simd=2048
+simd_id=0
+
+[system.cpu1.CUs0.vector_register_file1]
+type=VectorRegisterFile
+eventq_index=0
+min_alloc=4
+num_regs_per_simd=2048
+simd_id=1
+
+[system.cpu1.CUs0.vector_register_file2]
+type=VectorRegisterFile
+eventq_index=0
+min_alloc=4
+num_regs_per_simd=2048
+simd_id=2
+
+[system.cpu1.CUs0.vector_register_file3]
+type=VectorRegisterFile
+eventq_index=0
+min_alloc=4
+num_regs_per_simd=2048
+simd_id=3
+
+[system.cpu1.CUs0.wavefronts00]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=0
+
+[system.cpu1.CUs0.wavefronts01]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=1
+
+[system.cpu1.CUs0.wavefronts02]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=2
+
+[system.cpu1.CUs0.wavefronts03]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=3
+
+[system.cpu1.CUs0.wavefronts04]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=4
+
+[system.cpu1.CUs0.wavefronts05]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=5
+
+[system.cpu1.CUs0.wavefronts06]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=6
+
+[system.cpu1.CUs0.wavefronts07]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=7
+
+[system.cpu1.CUs0.wavefronts08]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=0
+
+[system.cpu1.CUs0.wavefronts09]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=1
+
+[system.cpu1.CUs0.wavefronts10]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=2
+
+[system.cpu1.CUs0.wavefronts11]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=3
+
+[system.cpu1.CUs0.wavefronts12]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=4
+
+[system.cpu1.CUs0.wavefronts13]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=5
+
+[system.cpu1.CUs0.wavefronts14]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=6
+
+[system.cpu1.CUs0.wavefronts15]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=7
+
+[system.cpu1.CUs0.wavefronts16]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=0
+
+[system.cpu1.CUs0.wavefronts17]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=1
+
+[system.cpu1.CUs0.wavefronts18]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=2
+
+[system.cpu1.CUs0.wavefronts19]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=3
+
+[system.cpu1.CUs0.wavefronts20]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=4
+
+[system.cpu1.CUs0.wavefronts21]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=5
+
+[system.cpu1.CUs0.wavefronts22]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=6
+
+[system.cpu1.CUs0.wavefronts23]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=7
+
+[system.cpu1.CUs0.wavefronts24]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=0
+
+[system.cpu1.CUs0.wavefronts25]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=1
+
+[system.cpu1.CUs0.wavefronts26]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=2
+
+[system.cpu1.CUs0.wavefronts27]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=3
+
+[system.cpu1.CUs0.wavefronts28]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=4
+
+[system.cpu1.CUs0.wavefronts29]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=5
+
+[system.cpu1.CUs0.wavefronts30]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=6
+
+[system.cpu1.CUs0.wavefronts31]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=7
+
+[system.cpu1.CUs1]
+type=ComputeUnit
+children=ldsBus localDataStore vector_register_file0 vector_register_file1 vector_register_file2 vector_register_file3 wavefronts00 wavefronts01 wavefronts02 wavefronts03 wavefronts04 wavefronts05 wavefronts06 wavefronts07 wavefronts08 wavefronts09 wavefronts10 wavefronts11 wavefronts12 wavefronts13 wavefronts14 wavefronts15 wavefronts16 wavefronts17 wavefronts18 wavefronts19 wavefronts20 wavefronts21 wavefronts22 wavefronts23 wavefronts24 wavefronts25 wavefronts26 wavefronts27 wavefronts28 wavefronts29 wavefronts30 wavefronts31
+clk_domain=system.cpu1.clk_domain
+coalescer_to_vrf_bus_width=32
+countPages=false
+cu_id=1
+debugSegFault=false
+dpbypass_pipe_length=4
+eventq_index=0
+execPolicy=OLDEST-FIRST
+functionalTLB=true
+global_mem_queue_size=256
+issue_period=4
+localDataStore=system.cpu1.CUs1.localDataStore
+localMemBarrier=false
+local_mem_queue_size=256
+mem_req_latency=9
+mem_resp_latency=9
+n_wf=8
+num_SIMDs=4
+num_global_mem_pipes=1
+num_shared_mem_pipes=1
+perLaneTLB=false
+prefetch_depth=0
+prefetch_prev_type=PF_PHASE
+prefetch_stride=1
+spbypass_pipe_length=4
+system=system
+vector_register_file=system.cpu1.CUs1.vector_register_file0 system.cpu1.CUs1.vector_register_file1 system.cpu1.CUs1.vector_register_file2 system.cpu1.CUs1.vector_register_file3
+vrf_to_coalescer_bus_width=32
+wavefronts=system.cpu1.CUs1.wavefronts00 system.cpu1.CUs1.wavefronts01 system.cpu1.CUs1.wavefronts02 system.cpu1.CUs1.wavefronts03 system.cpu1.CUs1.wavefronts04 system.cpu1.CUs1.wavefronts05 system.cpu1.CUs1.wavefronts06 system.cpu1.CUs1.wavefronts07 system.cpu1.CUs1.wavefronts08 system.cpu1.CUs1.wavefronts09 system.cpu1.CUs1.wavefronts10 system.cpu1.CUs1.wavefronts11 system.cpu1.CUs1.wavefronts12 system.cpu1.CUs1.wavefronts13 system.cpu1.CUs1.wavefronts14 system.cpu1.CUs1.wavefronts15 system.cpu1.CUs1.wavefronts16 system.cpu1.CUs1.wavefronts17 system.cpu1.CUs1.wavefronts18 system.cpu1.CUs1.wavefronts19 system.cpu1.CUs1.wavefronts20 system.cpu1.CUs1.wavefronts21 system.cpu1.CUs1.wavefronts22 system.cpu1.CUs1.wavefronts23 system.cpu1.CUs1.wavefronts24 system.cpu1.CUs1.wavefronts25 system.cpu1.CUs1.wavefronts26 system.cpu1.CUs1.wavefronts27 system.cpu1.CUs1.wavefronts28 system.cpu1.CUs1.wavefronts29 system.cpu1.CUs1.wavefronts30 system.cpu1.CUs1.wavefronts31
+wfSize=64
+xactCasMode=false
+ldsPort=system.cpu1.CUs1.ldsBus.slave
+memory_port=system.tcp_cntrl1.coalescer.slave[0] system.tcp_cntrl1.coalescer.slave[1] system.tcp_cntrl1.coalescer.slave[2] system.tcp_cntrl1.coalescer.slave[3] system.tcp_cntrl1.coalescer.slave[4] system.tcp_cntrl1.coalescer.slave[5] system.tcp_cntrl1.coalescer.slave[6] system.tcp_cntrl1.coalescer.slave[7] system.tcp_cntrl1.coalescer.slave[8] system.tcp_cntrl1.coalescer.slave[9] system.tcp_cntrl1.coalescer.slave[10] system.tcp_cntrl1.coalescer.slave[11] system.tcp_cntrl1.coalescer.slave[12] system.tcp_cntrl1.coalescer.slave[13] system.tcp_cntrl1.coalescer.slave[14] system.tcp_cntrl1.coalescer.slave[15] system.tcp_cntrl1.coalescer.slave[16] system.tcp_cntrl1.coalescer.slave[17] system.tcp_cntrl1.coalescer.slave[18] system.tcp_cntrl1.coalescer.slave[19] system.tcp_cntrl1.coalescer.slave[20] system.tcp_cntrl1.coalescer.slave[21] system.tcp_cntrl1.coalescer.slave[22] system.tcp_cntrl1.coalescer.slave[23] system.tcp_cntrl1.coalescer.slave[24] system.tcp_cntrl1.coalescer.slave[25] system.tcp_cntrl1.coalescer.slave[26] system.tcp_cntrl1.coalescer.slave[27] system.tcp_cntrl1.coalescer.slave[28] system.tcp_cntrl1.coalescer.slave[29] system.tcp_cntrl1.coalescer.slave[30] system.tcp_cntrl1.coalescer.slave[31] system.tcp_cntrl1.coalescer.slave[32] system.tcp_cntrl1.coalescer.slave[33] system.tcp_cntrl1.coalescer.slave[34] system.tcp_cntrl1.coalescer.slave[35] system.tcp_cntrl1.coalescer.slave[36] system.tcp_cntrl1.coalescer.slave[37] system.tcp_cntrl1.coalescer.slave[38] system.tcp_cntrl1.coalescer.slave[39] system.tcp_cntrl1.coalescer.slave[40] system.tcp_cntrl1.coalescer.slave[41] system.tcp_cntrl1.coalescer.slave[42] system.tcp_cntrl1.coalescer.slave[43] system.tcp_cntrl1.coalescer.slave[44] system.tcp_cntrl1.coalescer.slave[45] system.tcp_cntrl1.coalescer.slave[46] system.tcp_cntrl1.coalescer.slave[47] system.tcp_cntrl1.coalescer.slave[48] system.tcp_cntrl1.coalescer.slave[49] system.tcp_cntrl1.coalescer.slave[50] system.tcp_cntrl1.coalescer.slave[51] 
system.tcp_cntrl1.coalescer.slave[52] system.tcp_cntrl1.coalescer.slave[53] system.tcp_cntrl1.coalescer.slave[54] system.tcp_cntrl1.coalescer.slave[55] system.tcp_cntrl1.coalescer.slave[56] system.tcp_cntrl1.coalescer.slave[57] system.tcp_cntrl1.coalescer.slave[58] system.tcp_cntrl1.coalescer.slave[59] system.tcp_cntrl1.coalescer.slave[60] system.tcp_cntrl1.coalescer.slave[61] system.tcp_cntrl1.coalescer.slave[62] system.tcp_cntrl1.coalescer.slave[63]
+sqc_port=system.sqc_cntrl0.sequencer.slave[1]
+sqc_tlb_port=system.sqc_coalescer.slave[1]
+translation_port=system.l1_coalescer1.slave[0]
+
+[system.cpu1.CUs1.ldsBus]
+type=Bridge
+clk_domain=system.cpu1.clk_domain
+delay=0
+eventq_index=0
+ranges=0:18446744073709551615
+req_size=16
+resp_size=16
+master=system.cpu1.CUs1.localDataStore.cuPort
+slave=system.cpu1.CUs1.ldsPort
+
+[system.cpu1.CUs1.localDataStore]
+type=LdsState
+bankConflictPenalty=1
+banks=32
+clk_domain=system.cpu1.clk_domain
+eventq_index=0
+range=0:65535
+size=65536
+cuPort=system.cpu1.CUs1.ldsBus.master
+
+[system.cpu1.CUs1.vector_register_file0]
+type=VectorRegisterFile
+eventq_index=0
+min_alloc=4
+num_regs_per_simd=2048
+simd_id=0
+
+[system.cpu1.CUs1.vector_register_file1]
+type=VectorRegisterFile
+eventq_index=0
+min_alloc=4
+num_regs_per_simd=2048
+simd_id=1
+
+[system.cpu1.CUs1.vector_register_file2]
+type=VectorRegisterFile
+eventq_index=0
+min_alloc=4
+num_regs_per_simd=2048
+simd_id=2
+
+[system.cpu1.CUs1.vector_register_file3]
+type=VectorRegisterFile
+eventq_index=0
+min_alloc=4
+num_regs_per_simd=2048
+simd_id=3
+
+[system.cpu1.CUs1.wavefronts00]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=0
+
+[system.cpu1.CUs1.wavefronts01]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=1
+
+[system.cpu1.CUs1.wavefronts02]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=2
+
+[system.cpu1.CUs1.wavefronts03]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=3
+
+[system.cpu1.CUs1.wavefronts04]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=4
+
+[system.cpu1.CUs1.wavefronts05]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=5
+
+[system.cpu1.CUs1.wavefronts06]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=6
+
+[system.cpu1.CUs1.wavefronts07]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=7
+
+[system.cpu1.CUs1.wavefronts08]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=0
+
+[system.cpu1.CUs1.wavefronts09]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=1
+
+[system.cpu1.CUs1.wavefronts10]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=2
+
+[system.cpu1.CUs1.wavefronts11]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=3
+
+[system.cpu1.CUs1.wavefronts12]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=4
+
+[system.cpu1.CUs1.wavefronts13]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=5
+
+[system.cpu1.CUs1.wavefronts14]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=6
+
+[system.cpu1.CUs1.wavefronts15]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=7
+
+[system.cpu1.CUs1.wavefronts16]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=0
+
+[system.cpu1.CUs1.wavefronts17]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=1
+
+[system.cpu1.CUs1.wavefronts18]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=2
+
+[system.cpu1.CUs1.wavefronts19]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=3
+
+[system.cpu1.CUs1.wavefronts20]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=4
+
+[system.cpu1.CUs1.wavefronts21]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=5
+
+[system.cpu1.CUs1.wavefronts22]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=6
+
+[system.cpu1.CUs1.wavefronts23]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=7
+
+[system.cpu1.CUs1.wavefronts24]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=0
+
+[system.cpu1.CUs1.wavefronts25]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=1
+
+[system.cpu1.CUs1.wavefronts26]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=2
+
+[system.cpu1.CUs1.wavefronts27]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=3
+
+[system.cpu1.CUs1.wavefronts28]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=4
+
+[system.cpu1.CUs1.wavefronts29]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=5
+
+[system.cpu1.CUs1.wavefronts30]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=6
+
+[system.cpu1.CUs1.wavefronts31]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=7
+
+[system.cpu1.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.cpu1.clk_domain.voltage_domain
+
+[system.cpu1.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.cpu2]
+type=GpuDispatcher
+children=cl_driver
+cl_driver=system.cpu2.cl_driver
+clk_domain=system.clk_domain
+cpu=system.cpu0
+eventq_index=0
+pio_addr=8589934592
+pio_latency=1000
+shader_pointer=system.cpu1
+system=system
+dma=system.piobus.slave[1]
+pio=system.piobus.master[0]
+translation_port=system.dispatcher_coalescer.slave[0]
+
+[system.cpu2.cl_driver]
+type=ClDriver
+codefile=/dist/m5/regression/test-progs/gpu-hello/bin/x86/linux/gpu-hello-kernel.asm
+eventq_index=0
+filename=hsa
+
+[system.dir_cntrl0]
+type=Directory_Controller
+children=L3CacheMemory L3triggerQueue directory probeToCore requestFromCores responseFromCores responseFromMemory responseToCore triggerQueue unblockFromCores
+CPUonly=false
+L3CacheMemory=system.dir_cntrl0.L3CacheMemory
+L3triggerQueue=system.dir_cntrl0.L3triggerQueue
+TCC_select_num_bits=0
+buffer_size=0
+clk_domain=system.clk_domain
+cluster_id=0
+directory=system.dir_cntrl0.directory
+eventq_index=0
+l3_hit_latency=15
+noTCCdir=false
+number_of_TBEs=5120
+probeToCore=system.dir_cntrl0.probeToCore
+recycle_latency=10
+requestFromCores=system.dir_cntrl0.requestFromCores
+responseFromCores=system.dir_cntrl0.responseFromCores
+responseFromMemory=system.dir_cntrl0.responseFromMemory
+responseToCore=system.dir_cntrl0.responseToCore
+response_latency=30
+ruby_system=system.ruby
+system=system
+to_memory_controller_latency=1
+transitions_per_cycle=32
+triggerQueue=system.dir_cntrl0.triggerQueue
+unblockFromCores=system.dir_cntrl0.unblockFromCores
+useL3OnWT=false
+version=0
+memory=system.mem_ctrls.port
+
+[system.dir_cntrl0.L3CacheMemory]
+type=RubyCache
+children=replacement_policy
+assoc=8
+block_size=0
+dataAccessLatency=20
+dataArrayBanks=256.0
+eventq_index=0
+is_icache=false
+replacement_policy=system.dir_cntrl0.L3CacheMemory.replacement_policy
+resourceStalls=true
+ruby_system=system.ruby
+size=16777216
+start_index_bit=6
+tagAccessLatency=15
+tagArrayBanks=256.0
+
+[system.dir_cntrl0.L3CacheMemory.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=8
+block_size=64
+eventq_index=0
+size=16777216
+
+[system.dir_cntrl0.L3triggerQueue]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.dir_cntrl0.directory]
+type=RubyDirectoryMemory
+eventq_index=0
+numa_high_bit=5
+size=536870912
+version=0
+
+[system.dir_cntrl0.probeToCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+master=system.ruby.network.slave[0]
+
+[system.dir_cntrl0.requestFromCores]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[0]
+
+[system.dir_cntrl0.responseFromCores]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+slave=system.ruby.network.master[1]
+
+[system.dir_cntrl0.responseFromMemory]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+
+[system.dir_cntrl0.responseToCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+master=system.ruby.network.slave[1]
+
+[system.dir_cntrl0.triggerQueue]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.dir_cntrl0.unblockFromCores]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+slave=system.ruby.network.master[2]
+
+[system.dispatcher_coalescer]
+type=TLBCoalescer
+children=clk_domain
+clk_domain=system.dispatcher_coalescer.clk_domain
+coalescingWindow=1
+disableCoalescing=false
+eventq_index=0
+probesPerCycle=2
+master=system.dispatcher_tlb.slave[0]
+slave=system.cpu2.translation_port
+
+[system.dispatcher_coalescer.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.dispatcher_coalescer.clk_domain.voltage_domain
+
+[system.dispatcher_coalescer.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.dispatcher_tlb]
+type=X86GPUTLB
+children=clk_domain
+accessDistance=false
+allocationPolicy=true
+assoc=32
+clk_domain=system.dispatcher_tlb.clk_domain
+eventq_index=0
+hitLatency=1
+maxOutstandingReqs=64
+missLatency1=5
+missLatency2=750
+size=32
+master=system.l2_coalescer.slave[1]
+slave=system.dispatcher_coalescer.master[0]
+
+[system.dispatcher_tlb.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.dispatcher_tlb.clk_domain.voltage_domain
+
+[system.dispatcher_tlb.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.dvfs_handler]
+type=DVFSHandler
+domains=
+enable=false
+eventq_index=0
+sys_clk_domain=system.clk_domain
+transition_latency=100000000
+
+[system.l1_coalescer0]
+type=TLBCoalescer
+children=clk_domain
+clk_domain=system.l1_coalescer0.clk_domain
+coalescingWindow=1
+disableCoalescing=false
+eventq_index=0
+probesPerCycle=2
+master=system.l1_tlb0.slave[0]
+slave=system.cpu1.CUs0.translation_port[0]
+
+[system.l1_coalescer0.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.l1_coalescer0.clk_domain.voltage_domain
+
+[system.l1_coalescer0.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.l1_coalescer1]
+type=TLBCoalescer
+children=clk_domain
+clk_domain=system.l1_coalescer1.clk_domain
+coalescingWindow=1
+disableCoalescing=false
+eventq_index=0
+probesPerCycle=2
+master=system.l1_tlb1.slave[0]
+slave=system.cpu1.CUs1.translation_port[0]
+
+[system.l1_coalescer1.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.l1_coalescer1.clk_domain.voltage_domain
+
+[system.l1_coalescer1.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.l1_tlb0]
+type=X86GPUTLB
+children=clk_domain
+accessDistance=false
+allocationPolicy=true
+assoc=32
+clk_domain=system.l1_tlb0.clk_domain
+eventq_index=0
+hitLatency=1
+maxOutstandingReqs=64
+missLatency1=5
+missLatency2=750
+size=32
+master=system.l2_coalescer.slave[2]
+slave=system.l1_coalescer0.master[0]
+
+[system.l1_tlb0.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.l1_tlb0.clk_domain.voltage_domain
+
+[system.l1_tlb0.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.l1_tlb1]
+type=X86GPUTLB
+children=clk_domain
+accessDistance=false
+allocationPolicy=true
+assoc=32
+clk_domain=system.l1_tlb1.clk_domain
+eventq_index=0
+hitLatency=1
+maxOutstandingReqs=64
+missLatency1=5
+missLatency2=750
+size=32
+master=system.l2_coalescer.slave[3]
+slave=system.l1_coalescer1.master[0]
+
+[system.l1_tlb1.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.l1_tlb1.clk_domain.voltage_domain
+
+[system.l1_tlb1.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.l2_coalescer]
+type=TLBCoalescer
+children=clk_domain
+clk_domain=system.l2_coalescer.clk_domain
+coalescingWindow=1
+disableCoalescing=false
+eventq_index=0
+probesPerCycle=2
+master=system.l2_tlb.slave[0]
+slave=system.sqc_tlb.master[0] system.dispatcher_tlb.master[0] system.l1_tlb0.master[0] system.l1_tlb1.master[0]
+
+[system.l2_coalescer.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.l2_coalescer.clk_domain.voltage_domain
+
+[system.l2_coalescer.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.l2_tlb]
+type=X86GPUTLB
+children=clk_domain
+accessDistance=false
+allocationPolicy=true
+assoc=32
+clk_domain=system.l2_tlb.clk_domain
+eventq_index=0
+hitLatency=69
+maxOutstandingReqs=64
+missLatency1=5
+missLatency2=750
+size=4096
+master=system.l3_coalescer.slave[0]
+slave=system.l2_coalescer.master[0]
+
+[system.l2_tlb.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.l2_tlb.clk_domain.voltage_domain
+
+[system.l2_tlb.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.l3_coalescer]
+type=TLBCoalescer
+children=clk_domain
+clk_domain=system.l3_coalescer.clk_domain
+coalescingWindow=1
+disableCoalescing=false
+eventq_index=0
+probesPerCycle=2
+master=system.l3_tlb.slave[0]
+slave=system.l2_tlb.master[0]
+
+[system.l3_coalescer.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.l3_coalescer.clk_domain.voltage_domain
+
+[system.l3_coalescer.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.l3_tlb]
+type=X86GPUTLB
+children=clk_domain
+accessDistance=false
+allocationPolicy=true
+assoc=32
+clk_domain=system.l3_tlb.clk_domain
+eventq_index=0
+hitLatency=150
+maxOutstandingReqs=64
+missLatency1=5
+missLatency2=750
+size=8192
+slave=system.l3_coalescer.master[0]
+
+[system.l3_tlb.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.l3_tlb.clk_domain.voltage_domain
+
+[system.l3_tlb.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.mem_ctrls]
+type=DRAMCtrl
+IDD0=0.075000
+IDD02=0.000000
+IDD2N=0.050000
+IDD2N2=0.000000
+IDD2P0=0.000000
+IDD2P02=0.000000
+IDD2P1=0.000000
+IDD2P12=0.000000
+IDD3N=0.057000
+IDD3N2=0.000000
+IDD3P0=0.000000
+IDD3P02=0.000000
+IDD3P1=0.000000
+IDD3P12=0.000000
+IDD4R=0.187000
+IDD4R2=0.000000
+IDD4W=0.165000
+IDD4W2=0.000000
+IDD5=0.220000
+IDD52=0.000000
+IDD6=0.000000
+IDD62=0.000000
+VDD=1.500000
+VDD2=0.000000
+activation_limit=4
+addr_mapping=RoRaBaCoCh
+bank_groups_per_rank=0
+banks_per_rank=8
+burst_length=8
+channels=1
+clk_domain=system.clk_domain
+conf_table_reported=true
+device_bus_width=8
+device_rowbuffer_size=1024
+device_size=536870912
+devices_per_rank=8
+dll=true
+eventq_index=0
+in_addr_map=true
+max_accesses_per_row=16
+mem_sched_policy=frfcfs
+min_writes_per_switch=16
+null=false
+page_policy=open_adaptive
+range=0:536870911
+ranks_per_channel=2
+read_buffer_size=32
+static_backend_latency=10000
+static_frontend_latency=10000
+tBURST=5000
+tCCD_L=0
+tCK=1250
+tCL=13750
+tCS=2500
+tRAS=35000
+tRCD=13750
+tREFI=7800000
+tRFC=260000
+tRP=13750
+tRRD=6000
+tRRD_L=0
+tRTP=7500
+tRTW=2500
+tWR=15000
+tWTR=7500
+tXAW=30000
+tXP=0
+tXPDLL=0
+tXS=0
+tXSDLL=0
+write_buffer_size=64
+write_high_thresh_perc=85
+write_low_thresh_perc=50
+port=system.dir_cntrl0.memory
+
+[system.piobus]
+type=NoncoherentXBar
+clk_domain=system.clk_domain
+eventq_index=0
+forward_latency=0
+frontend_latency=0
+response_latency=0
+use_default_range=false
+width=32
+master=system.cpu2.pio
+slave=system.cp_cntrl0.sequencer.mem_master_port system.cpu2.dma
+
+[system.ruby]
+type=RubySystem
+children=clk_domain network phys_mem
+access_backing_store=true
+all_instructions=false
+block_size_bytes=64
+clk_domain=system.ruby.clk_domain
+eventq_index=0
+hot_lines=false
+memory_size_bits=48
+num_of_sequencers=5
+number_of_virtual_networks=10
+phys_mem=system.ruby.phys_mem
+randomization=false
+
+[system.ruby.clk_domain]
+type=SrcClockDomain
+clock=500
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.voltage_domain
+
+[system.ruby.network]
+type=SimpleNetwork
+children=ext_links0 ext_links1 ext_links2 ext_links3 ext_links4 ext_links5 ext_links6 int_link_buffers00 int_link_buffers01 int_link_buffers02 int_link_buffers03 int_link_buffers04 int_link_buffers05 int_link_buffers06 int_link_buffers07 int_link_buffers08 int_link_buffers09 int_link_buffers10 int_link_buffers11 int_link_buffers12 int_link_buffers13 int_link_buffers14 int_link_buffers15 int_link_buffers16 int_link_buffers17 int_link_buffers18 int_link_buffers19 int_link_buffers20 int_link_buffers21 int_link_buffers22 int_link_buffers23 int_link_buffers24 int_link_buffers25 int_link_buffers26 int_link_buffers27 int_link_buffers28 int_link_buffers29 int_link_buffers30 int_link_buffers31 int_link_buffers32 int_link_buffers33 int_link_buffers34 int_link_buffers35 int_link_buffers36 int_link_buffers37 int_link_buffers38 int_link_buffers39 int_links0 int_links1
+adaptive_routing=false
+buffer_size=0
+clk_domain=system.ruby.clk_domain
+control_msg_size=8
+endpoint_bandwidth=1000
+eventq_index=0
+ext_links=system.ruby.network.ext_links0 system.ruby.network.ext_links1 system.ruby.network.ext_links2 system.ruby.network.ext_links3 system.ruby.network.ext_links4 system.ruby.network.ext_links5 system.ruby.network.ext_links6
+int_link_buffers=system.ruby.network.int_link_buffers00 system.ruby.network.int_link_buffers01 system.ruby.network.int_link_buffers02 system.ruby.network.int_link_buffers03 system.ruby.network.int_link_buffers04 system.ruby.network.int_link_buffers05 system.ruby.network.int_link_buffers06 system.ruby.network.int_link_buffers07 system.ruby.network.int_link_buffers08 system.ruby.network.int_link_buffers09 system.ruby.network.int_link_buffers10 system.ruby.network.int_link_buffers11 system.ruby.network.int_link_buffers12 system.ruby.network.int_link_buffers13 system.ruby.network.int_link_buffers14 system.ruby.network.int_link_buffers15 system.ruby.network.int_link_buffers16 system.ruby.network.int_link_buffers17 system.ruby.network.int_link_buffers18 system.ruby.network.int_link_buffers19 system.ruby.network.int_link_buffers20 system.ruby.network.int_link_buffers21 system.ruby.network.int_link_buffers22 system.ruby.network.int_link_buffers23 system.ruby.network.int_link_buffers24 system.ruby.network.int_link_buffers25 system.ruby.network.int_link_buffers26 system.ruby.network.int_link_buffers27 system.ruby.network.int_link_buffers28 system.ruby.network.int_link_buffers29 system.ruby.network.int_link_buffers30 system.ruby.network.int_link_buffers31 system.ruby.network.int_link_buffers32 system.ruby.network.int_link_buffers33 system.ruby.network.int_link_buffers34 system.ruby.network.int_link_buffers35 system.ruby.network.int_link_buffers36 system.ruby.network.int_link_buffers37 system.ruby.network.int_link_buffers38 system.ruby.network.int_link_buffers39
+int_links=system.ruby.network.int_links0 system.ruby.network.int_links1
+netifs=
+number_of_virtual_networks=10
+routers=system.ruby.network.ext_links0.int_node system.ruby.network.ext_links1.int_node system.ruby.network.ext_links2.int_node
+ruby_system=system.ruby
+topology=Crossbar
+master=system.dir_cntrl0.requestFromCores.slave system.dir_cntrl0.responseFromCores.slave system.dir_cntrl0.unblockFromCores.slave system.cp_cntrl0.probeToCore.slave system.cp_cntrl0.responseToCore.slave system.tcp_cntrl0.probeToTCP.slave system.tcp_cntrl0.responseToTCP.slave system.tcp_cntrl1.probeToTCP.slave system.tcp_cntrl1.responseToTCP.slave system.sqc_cntrl0.probeToSQC.slave system.sqc_cntrl0.responseToSQC.slave system.tcc_cntrl0.responseToTCC.slave system.tccdir_cntrl0.requestFromTCP.slave system.tccdir_cntrl0.responseFromTCP.slave system.tccdir_cntrl0.unblockFromTCP.slave system.tccdir_cntrl0.probeFromNB.slave system.tccdir_cntrl0.responseFromNB.slave
+slave=system.dir_cntrl0.probeToCore.master system.dir_cntrl0.responseToCore.master system.cp_cntrl0.requestFromCore.master system.cp_cntrl0.responseFromCore.master system.cp_cntrl0.unblockFromCore.master system.tcp_cntrl0.requestFromTCP.master system.tcp_cntrl0.responseFromTCP.master system.tcp_cntrl0.unblockFromCore.master system.tcp_cntrl1.requestFromTCP.master system.tcp_cntrl1.responseFromTCP.master system.tcp_cntrl1.unblockFromCore.master system.sqc_cntrl0.requestFromSQC.master system.sqc_cntrl0.responseFromSQC.master system.sqc_cntrl0.unblockFromCore.master system.tcc_cntrl0.responseFromTCC.master system.tccdir_cntrl0.probeToCore.master system.tccdir_cntrl0.responseToCore.master system.tccdir_cntrl0.requestToNB.master system.tccdir_cntrl0.responseToNB.master system.tccdir_cntrl0.unblockToNB.master
+
+[system.ruby.network.ext_links0]
+type=SimpleExtLink
+children=int_node
+bandwidth_factor=512
+eventq_index=0
+ext_node=system.dir_cntrl0
+int_node=system.ruby.network.ext_links0.int_node
+latency=1
+link_id=0
+weight=1
+
+[system.ruby.network.ext_links0.int_node]
+type=Switch
+children=port_buffers00 port_buffers01 port_buffers02 port_buffers03 port_buffers04 port_buffers05 port_buffers06 port_buffers07 port_buffers08 port_buffers09 port_buffers10 port_buffers11 port_buffers12 port_buffers13 port_buffers14 port_buffers15 port_buffers16 port_buffers17 port_buffers18 port_buffers19 port_buffers20 port_buffers21 port_buffers22 port_buffers23 port_buffers24 port_buffers25 port_buffers26 port_buffers27 port_buffers28 port_buffers29 port_buffers30 port_buffers31 port_buffers32 port_buffers33 port_buffers34 port_buffers35 port_buffers36 port_buffers37 port_buffers38 port_buffers39 port_buffers40 port_buffers41 port_buffers42 port_buffers43 port_buffers44 port_buffers45 port_buffers46 port_buffers47 port_buffers48 port_buffers49 port_buffers50 port_buffers51 port_buffers52 port_buffers53 port_buffers54 port_buffers55 port_buffers56 port_buffers57 port_buffers58 port_buffers59 port_buffers60 port_buffers61 port_buffers62 port_buffers63 port_buffers64 port_buffers65 port_buffers66 port_buffers67 port_buffers68 port_buffers69 port_buffers70 port_buffers71 port_buffers72 port_buffers73 port_buffers74 port_buffers75 port_buffers76 port_buffers77 port_buffers78 port_buffers79 port_buffers80 port_buffers81 port_buffers82 port_buffers83 port_buffers84 port_buffers85 port_buffers86 port_buffers87 port_buffers88 port_buffers89
+clk_domain=system.ruby.clk_domain
+eventq_index=0
+port_buffers=system.ruby.network.ext_links0.int_node.port_buffers00 system.ruby.network.ext_links0.int_node.port_buffers01 system.ruby.network.ext_links0.int_node.port_buffers02 system.ruby.network.ext_links0.int_node.port_buffers03 system.ruby.network.ext_links0.int_node.port_buffers04 system.ruby.network.ext_links0.int_node.port_buffers05 system.ruby.network.ext_links0.int_node.port_buffers06 system.ruby.network.ext_links0.int_node.port_buffers07 system.ruby.network.ext_links0.int_node.port_buffers08 system.ruby.network.ext_links0.int_node.port_buffers09 system.ruby.network.ext_links0.int_node.port_buffers10 system.ruby.network.ext_links0.int_node.port_buffers11 system.ruby.network.ext_links0.int_node.port_buffers12 system.ruby.network.ext_links0.int_node.port_buffers13 system.ruby.network.ext_links0.int_node.port_buffers14 system.ruby.network.ext_links0.int_node.port_buffers15 system.ruby.network.ext_links0.int_node.port_buffers16 system.ruby.network.ext_links0.int_node.port_buffers17 system.ruby.network.ext_links0.int_node.port_buffers18 system.ruby.network.ext_links0.int_node.port_buffers19 system.ruby.network.ext_links0.int_node.port_buffers20 system.ruby.network.ext_links0.int_node.port_buffers21 system.ruby.network.ext_links0.int_node.port_buffers22 system.ruby.network.ext_links0.int_node.port_buffers23 system.ruby.network.ext_links0.int_node.port_buffers24 system.ruby.network.ext_links0.int_node.port_buffers25 system.ruby.network.ext_links0.int_node.port_buffers26 system.ruby.network.ext_links0.int_node.port_buffers27 system.ruby.network.ext_links0.int_node.port_buffers28 system.ruby.network.ext_links0.int_node.port_buffers29 system.ruby.network.ext_links0.int_node.port_buffers30 system.ruby.network.ext_links0.int_node.port_buffers31 system.ruby.network.ext_links0.int_node.port_buffers32 system.ruby.network.ext_links0.int_node.port_buffers33 system.ruby.network.ext_links0.int_node.port_buffers34 system.ruby.network.ext_links0.int_node.port_buffers35 
system.ruby.network.ext_links0.int_node.port_buffers36 system.ruby.network.ext_links0.int_node.port_buffers37 system.ruby.network.ext_links0.int_node.port_buffers38 system.ruby.network.ext_links0.int_node.port_buffers39 system.ruby.network.ext_links0.int_node.port_buffers40 system.ruby.network.ext_links0.int_node.port_buffers41 system.ruby.network.ext_links0.int_node.port_buffers42 system.ruby.network.ext_links0.int_node.port_buffers43 system.ruby.network.ext_links0.int_node.port_buffers44 system.ruby.network.ext_links0.int_node.port_buffers45 system.ruby.network.ext_links0.int_node.port_buffers46 system.ruby.network.ext_links0.int_node.port_buffers47 system.ruby.network.ext_links0.int_node.port_buffers48 system.ruby.network.ext_links0.int_node.port_buffers49 system.ruby.network.ext_links0.int_node.port_buffers50 system.ruby.network.ext_links0.int_node.port_buffers51 system.ruby.network.ext_links0.int_node.port_buffers52 system.ruby.network.ext_links0.int_node.port_buffers53 system.ruby.network.ext_links0.int_node.port_buffers54 system.ruby.network.ext_links0.int_node.port_buffers55 system.ruby.network.ext_links0.int_node.port_buffers56 system.ruby.network.ext_links0.int_node.port_buffers57 system.ruby.network.ext_links0.int_node.port_buffers58 system.ruby.network.ext_links0.int_node.port_buffers59 system.ruby.network.ext_links0.int_node.port_buffers60 system.ruby.network.ext_links0.int_node.port_buffers61 system.ruby.network.ext_links0.int_node.port_buffers62 system.ruby.network.ext_links0.int_node.port_buffers63 system.ruby.network.ext_links0.int_node.port_buffers64 system.ruby.network.ext_links0.int_node.port_buffers65 system.ruby.network.ext_links0.int_node.port_buffers66 system.ruby.network.ext_links0.int_node.port_buffers67 system.ruby.network.ext_links0.int_node.port_buffers68 system.ruby.network.ext_links0.int_node.port_buffers69 system.ruby.network.ext_links0.int_node.port_buffers70 system.ruby.network.ext_links0.int_node.port_buffers71 
system.ruby.network.ext_links0.int_node.port_buffers72 system.ruby.network.ext_links0.int_node.port_buffers73 system.ruby.network.ext_links0.int_node.port_buffers74 system.ruby.network.ext_links0.int_node.port_buffers75 system.ruby.network.ext_links0.int_node.port_buffers76 system.ruby.network.ext_links0.int_node.port_buffers77 system.ruby.network.ext_links0.int_node.port_buffers78 system.ruby.network.ext_links0.int_node.port_buffers79 system.ruby.network.ext_links0.int_node.port_buffers80 system.ruby.network.ext_links0.int_node.port_buffers81 system.ruby.network.ext_links0.int_node.port_buffers82 system.ruby.network.ext_links0.int_node.port_buffers83 system.ruby.network.ext_links0.int_node.port_buffers84 system.ruby.network.ext_links0.int_node.port_buffers85 system.ruby.network.ext_links0.int_node.port_buffers86 system.ruby.network.ext_links0.int_node.port_buffers87 system.ruby.network.ext_links0.int_node.port_buffers88 system.ruby.network.ext_links0.int_node.port_buffers89
+router_id=0
+virt_nets=10
+
+[system.ruby.network.ext_links0.int_node.port_buffers00]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers01]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers02]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers03]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers04]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers05]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers06]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers07]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers08]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers09]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers10]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers11]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers12]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers13]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers14]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers15]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers16]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers17]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers18]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers19]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers20]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers21]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers22]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers23]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers24]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers25]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers26]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers27]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers28]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers29]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers30]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers31]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers32]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers33]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers34]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers35]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers36]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers37]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers38]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers39]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers40]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers41]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers42]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers43]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers44]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers45]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers46]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers47]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers48]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers49]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers50]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers51]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers52]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers53]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers54]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers55]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers56]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers57]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers58]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers59]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers60]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers61]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers62]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers63]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers64]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers65]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers66]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers67]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers68]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers69]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers70]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers71]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers72]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers73]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers74]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers75]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers76]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers77]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers78]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers79]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers80]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers81]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers82]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers83]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers84]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers85]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers86]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers87]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers88]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers89]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1]
+type=SimpleExtLink
+children=int_node
+bandwidth_factor=512
+eventq_index=0
+ext_node=system.cp_cntrl0
+int_node=system.ruby.network.ext_links1.int_node
+latency=1
+link_id=1
+weight=1
+
+[system.ruby.network.ext_links1.int_node]
+type=Switch
+children=port_buffers00 port_buffers01 port_buffers02 port_buffers03 port_buffers04 port_buffers05 port_buffers06 port_buffers07 port_buffers08 port_buffers09 port_buffers10 port_buffers11 port_buffers12 port_buffers13 port_buffers14 port_buffers15 port_buffers16 port_buffers17 port_buffers18 port_buffers19 port_buffers20 port_buffers21 port_buffers22 port_buffers23 port_buffers24 port_buffers25 port_buffers26 port_buffers27 port_buffers28 port_buffers29 port_buffers30 port_buffers31 port_buffers32 port_buffers33 port_buffers34 port_buffers35 port_buffers36 port_buffers37 port_buffers38 port_buffers39 port_buffers40 port_buffers41 port_buffers42 port_buffers43 port_buffers44 port_buffers45 port_buffers46 port_buffers47 port_buffers48 port_buffers49 port_buffers50 port_buffers51 port_buffers52 port_buffers53 port_buffers54 port_buffers55 port_buffers56 port_buffers57 port_buffers58 port_buffers59 port_buffers60 port_buffers61 port_buffers62 port_buffers63 port_buffers64 port_buffers65 port_buffers66 port_buffers67 port_buffers68 port_buffers69 port_buffers70 port_buffers71 port_buffers72 port_buffers73 port_buffers74 port_buffers75 port_buffers76 port_buffers77 port_buffers78 port_buffers79
+clk_domain=system.ruby.clk_domain
+eventq_index=0
+port_buffers=system.ruby.network.ext_links1.int_node.port_buffers00 system.ruby.network.ext_links1.int_node.port_buffers01 system.ruby.network.ext_links1.int_node.port_buffers02 system.ruby.network.ext_links1.int_node.port_buffers03 system.ruby.network.ext_links1.int_node.port_buffers04 system.ruby.network.ext_links1.int_node.port_buffers05 system.ruby.network.ext_links1.int_node.port_buffers06 system.ruby.network.ext_links1.int_node.port_buffers07 system.ruby.network.ext_links1.int_node.port_buffers08 system.ruby.network.ext_links1.int_node.port_buffers09 system.ruby.network.ext_links1.int_node.port_buffers10 system.ruby.network.ext_links1.int_node.port_buffers11 system.ruby.network.ext_links1.int_node.port_buffers12 system.ruby.network.ext_links1.int_node.port_buffers13 system.ruby.network.ext_links1.int_node.port_buffers14 system.ruby.network.ext_links1.int_node.port_buffers15 system.ruby.network.ext_links1.int_node.port_buffers16 system.ruby.network.ext_links1.int_node.port_buffers17 system.ruby.network.ext_links1.int_node.port_buffers18 system.ruby.network.ext_links1.int_node.port_buffers19 system.ruby.network.ext_links1.int_node.port_buffers20 system.ruby.network.ext_links1.int_node.port_buffers21 system.ruby.network.ext_links1.int_node.port_buffers22 system.ruby.network.ext_links1.int_node.port_buffers23 system.ruby.network.ext_links1.int_node.port_buffers24 system.ruby.network.ext_links1.int_node.port_buffers25 system.ruby.network.ext_links1.int_node.port_buffers26 system.ruby.network.ext_links1.int_node.port_buffers27 system.ruby.network.ext_links1.int_node.port_buffers28 system.ruby.network.ext_links1.int_node.port_buffers29 system.ruby.network.ext_links1.int_node.port_buffers30 system.ruby.network.ext_links1.int_node.port_buffers31 system.ruby.network.ext_links1.int_node.port_buffers32 system.ruby.network.ext_links1.int_node.port_buffers33 system.ruby.network.ext_links1.int_node.port_buffers34 system.ruby.network.ext_links1.int_node.port_buffers35 
system.ruby.network.ext_links1.int_node.port_buffers36 system.ruby.network.ext_links1.int_node.port_buffers37 system.ruby.network.ext_links1.int_node.port_buffers38 system.ruby.network.ext_links1.int_node.port_buffers39 system.ruby.network.ext_links1.int_node.port_buffers40 system.ruby.network.ext_links1.int_node.port_buffers41 system.ruby.network.ext_links1.int_node.port_buffers42 system.ruby.network.ext_links1.int_node.port_buffers43 system.ruby.network.ext_links1.int_node.port_buffers44 system.ruby.network.ext_links1.int_node.port_buffers45 system.ruby.network.ext_links1.int_node.port_buffers46 system.ruby.network.ext_links1.int_node.port_buffers47 system.ruby.network.ext_links1.int_node.port_buffers48 system.ruby.network.ext_links1.int_node.port_buffers49 system.ruby.network.ext_links1.int_node.port_buffers50 system.ruby.network.ext_links1.int_node.port_buffers51 system.ruby.network.ext_links1.int_node.port_buffers52 system.ruby.network.ext_links1.int_node.port_buffers53 system.ruby.network.ext_links1.int_node.port_buffers54 system.ruby.network.ext_links1.int_node.port_buffers55 system.ruby.network.ext_links1.int_node.port_buffers56 system.ruby.network.ext_links1.int_node.port_buffers57 system.ruby.network.ext_links1.int_node.port_buffers58 system.ruby.network.ext_links1.int_node.port_buffers59 system.ruby.network.ext_links1.int_node.port_buffers60 system.ruby.network.ext_links1.int_node.port_buffers61 system.ruby.network.ext_links1.int_node.port_buffers62 system.ruby.network.ext_links1.int_node.port_buffers63 system.ruby.network.ext_links1.int_node.port_buffers64 system.ruby.network.ext_links1.int_node.port_buffers65 system.ruby.network.ext_links1.int_node.port_buffers66 system.ruby.network.ext_links1.int_node.port_buffers67 system.ruby.network.ext_links1.int_node.port_buffers68 system.ruby.network.ext_links1.int_node.port_buffers69 system.ruby.network.ext_links1.int_node.port_buffers70 system.ruby.network.ext_links1.int_node.port_buffers71 
system.ruby.network.ext_links1.int_node.port_buffers72 system.ruby.network.ext_links1.int_node.port_buffers73 system.ruby.network.ext_links1.int_node.port_buffers74 system.ruby.network.ext_links1.int_node.port_buffers75 system.ruby.network.ext_links1.int_node.port_buffers76 system.ruby.network.ext_links1.int_node.port_buffers77 system.ruby.network.ext_links1.int_node.port_buffers78 system.ruby.network.ext_links1.int_node.port_buffers79
+router_id=1
+virt_nets=10
+
+[system.ruby.network.ext_links1.int_node.port_buffers00]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers01]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers02]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers03]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers04]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers05]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers06]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers07]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers08]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers09]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers10]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers11]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers12]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers13]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers14]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers15]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers16]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers17]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers18]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers19]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers20]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers21]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers22]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers23]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers24]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers25]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers26]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers27]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers28]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers29]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers30]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers31]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers32]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers33]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers34]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers35]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers36]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers37]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers38]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers39]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers40]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers41]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers42]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers43]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers44]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers45]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers46]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers47]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers48]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers49]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers50]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers51]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers52]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers53]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers54]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers55]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers56]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers57]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers58]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers59]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers60]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers61]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers62]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers63]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers64]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers65]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers66]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers67]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers68]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers69]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers70]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers71]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers72]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers73]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers74]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers75]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers76]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers77]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers78]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers79]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2]
+type=SimpleExtLink
+children=int_node
+bandwidth_factor=512
+eventq_index=0
+ext_node=system.tcp_cntrl0
+int_node=system.ruby.network.ext_links2.int_node
+latency=1
+link_id=2
+weight=1
+
+[system.ruby.network.ext_links2.int_node]
+type=Switch
+children=port_buffers00 port_buffers01 port_buffers02 port_buffers03 port_buffers04 port_buffers05 port_buffers06 port_buffers07 port_buffers08 port_buffers09 port_buffers10 port_buffers11 port_buffers12 port_buffers13 port_buffers14 port_buffers15 port_buffers16 port_buffers17 port_buffers18 port_buffers19 port_buffers20 port_buffers21 port_buffers22 port_buffers23 port_buffers24 port_buffers25 port_buffers26 port_buffers27 port_buffers28 port_buffers29 port_buffers30 port_buffers31 port_buffers32 port_buffers33 port_buffers34 port_buffers35 port_buffers36 port_buffers37 port_buffers38 port_buffers39 port_buffers40 port_buffers41 port_buffers42 port_buffers43 port_buffers44 port_buffers45 port_buffers46 port_buffers47 port_buffers48 port_buffers49 port_buffers50 port_buffers51 port_buffers52 port_buffers53 port_buffers54 port_buffers55 port_buffers56 port_buffers57 port_buffers58 port_buffers59 port_buffers60 port_buffers61 port_buffers62 port_buffers63 port_buffers64 port_buffers65 port_buffers66 port_buffers67 port_buffers68 port_buffers69 port_buffers70 port_buffers71 port_buffers72 port_buffers73 port_buffers74 port_buffers75 port_buffers76 port_buffers77 port_buffers78 port_buffers79
+clk_domain=system.ruby.clk_domain
+eventq_index=0
+port_buffers=system.ruby.network.ext_links2.int_node.port_buffers00 system.ruby.network.ext_links2.int_node.port_buffers01 system.ruby.network.ext_links2.int_node.port_buffers02 system.ruby.network.ext_links2.int_node.port_buffers03 system.ruby.network.ext_links2.int_node.port_buffers04 system.ruby.network.ext_links2.int_node.port_buffers05 system.ruby.network.ext_links2.int_node.port_buffers06 system.ruby.network.ext_links2.int_node.port_buffers07 system.ruby.network.ext_links2.int_node.port_buffers08 system.ruby.network.ext_links2.int_node.port_buffers09 system.ruby.network.ext_links2.int_node.port_buffers10 system.ruby.network.ext_links2.int_node.port_buffers11 system.ruby.network.ext_links2.int_node.port_buffers12 system.ruby.network.ext_links2.int_node.port_buffers13 system.ruby.network.ext_links2.int_node.port_buffers14 system.ruby.network.ext_links2.int_node.port_buffers15 system.ruby.network.ext_links2.int_node.port_buffers16 system.ruby.network.ext_links2.int_node.port_buffers17 system.ruby.network.ext_links2.int_node.port_buffers18 system.ruby.network.ext_links2.int_node.port_buffers19 system.ruby.network.ext_links2.int_node.port_buffers20 system.ruby.network.ext_links2.int_node.port_buffers21 system.ruby.network.ext_links2.int_node.port_buffers22 system.ruby.network.ext_links2.int_node.port_buffers23 system.ruby.network.ext_links2.int_node.port_buffers24 system.ruby.network.ext_links2.int_node.port_buffers25 system.ruby.network.ext_links2.int_node.port_buffers26 system.ruby.network.ext_links2.int_node.port_buffers27 system.ruby.network.ext_links2.int_node.port_buffers28 system.ruby.network.ext_links2.int_node.port_buffers29 system.ruby.network.ext_links2.int_node.port_buffers30 system.ruby.network.ext_links2.int_node.port_buffers31 system.ruby.network.ext_links2.int_node.port_buffers32 system.ruby.network.ext_links2.int_node.port_buffers33 system.ruby.network.ext_links2.int_node.port_buffers34 system.ruby.network.ext_links2.int_node.port_buffers35 
system.ruby.network.ext_links2.int_node.port_buffers36 system.ruby.network.ext_links2.int_node.port_buffers37 system.ruby.network.ext_links2.int_node.port_buffers38 system.ruby.network.ext_links2.int_node.port_buffers39 system.ruby.network.ext_links2.int_node.port_buffers40 system.ruby.network.ext_links2.int_node.port_buffers41 system.ruby.network.ext_links2.int_node.port_buffers42 system.ruby.network.ext_links2.int_node.port_buffers43 system.ruby.network.ext_links2.int_node.port_buffers44 system.ruby.network.ext_links2.int_node.port_buffers45 system.ruby.network.ext_links2.int_node.port_buffers46 system.ruby.network.ext_links2.int_node.port_buffers47 system.ruby.network.ext_links2.int_node.port_buffers48 system.ruby.network.ext_links2.int_node.port_buffers49 system.ruby.network.ext_links2.int_node.port_buffers50 system.ruby.network.ext_links2.int_node.port_buffers51 system.ruby.network.ext_links2.int_node.port_buffers52 system.ruby.network.ext_links2.int_node.port_buffers53 system.ruby.network.ext_links2.int_node.port_buffers54 system.ruby.network.ext_links2.int_node.port_buffers55 system.ruby.network.ext_links2.int_node.port_buffers56 system.ruby.network.ext_links2.int_node.port_buffers57 system.ruby.network.ext_links2.int_node.port_buffers58 system.ruby.network.ext_links2.int_node.port_buffers59 system.ruby.network.ext_links2.int_node.port_buffers60 system.ruby.network.ext_links2.int_node.port_buffers61 system.ruby.network.ext_links2.int_node.port_buffers62 system.ruby.network.ext_links2.int_node.port_buffers63 system.ruby.network.ext_links2.int_node.port_buffers64 system.ruby.network.ext_links2.int_node.port_buffers65 system.ruby.network.ext_links2.int_node.port_buffers66 system.ruby.network.ext_links2.int_node.port_buffers67 system.ruby.network.ext_links2.int_node.port_buffers68 system.ruby.network.ext_links2.int_node.port_buffers69 system.ruby.network.ext_links2.int_node.port_buffers70 system.ruby.network.ext_links2.int_node.port_buffers71 
system.ruby.network.ext_links2.int_node.port_buffers72 system.ruby.network.ext_links2.int_node.port_buffers73 system.ruby.network.ext_links2.int_node.port_buffers74 system.ruby.network.ext_links2.int_node.port_buffers75 system.ruby.network.ext_links2.int_node.port_buffers76 system.ruby.network.ext_links2.int_node.port_buffers77 system.ruby.network.ext_links2.int_node.port_buffers78 system.ruby.network.ext_links2.int_node.port_buffers79
+router_id=2
+virt_nets=10
+
+[system.ruby.network.ext_links2.int_node.port_buffers00]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers01]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers02]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers03]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers04]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers05]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers06]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers07]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers08]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers09]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers10]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers11]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers12]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers13]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers14]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers15]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers16]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers17]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers18]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers19]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers20]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers21]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers22]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers23]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers24]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers25]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers26]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers27]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers28]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers29]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers30]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers31]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers32]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers33]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers34]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers35]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers36]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers37]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers38]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers39]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers40]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers41]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers42]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers43]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers44]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers45]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers46]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers47]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers48]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers49]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers50]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers51]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers52]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers53]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers54]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers55]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers56]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers57]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers58]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers59]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers60]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers61]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers62]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers63]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers64]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers65]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers66]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers67]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers68]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers69]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers70]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers71]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers72]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers73]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers74]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers75]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers76]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers77]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers78]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers79]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links3]
+type=SimpleExtLink
+bandwidth_factor=512
+eventq_index=0
+ext_node=system.tcp_cntrl1
+int_node=system.ruby.network.ext_links2.int_node
+latency=1
+link_id=3
+weight=1
+
+[system.ruby.network.ext_links4]
+type=SimpleExtLink
+bandwidth_factor=512
+eventq_index=0
+ext_node=system.sqc_cntrl0
+int_node=system.ruby.network.ext_links2.int_node
+latency=1
+link_id=4
+weight=1
+
+[system.ruby.network.ext_links5]
+type=SimpleExtLink
+bandwidth_factor=512
+eventq_index=0
+ext_node=system.tcc_cntrl0
+int_node=system.ruby.network.ext_links2.int_node
+latency=1
+link_id=5
+weight=1
+
+[system.ruby.network.ext_links6]
+type=SimpleExtLink
+bandwidth_factor=512
+eventq_index=0
+ext_node=system.tccdir_cntrl0
+int_node=system.ruby.network.ext_links2.int_node
+latency=1
+link_id=6
+weight=1
+
+[system.ruby.network.int_link_buffers00]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers01]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers02]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers03]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers04]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers05]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers06]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers07]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers08]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers09]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers10]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers11]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers12]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers13]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers14]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers15]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers16]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers17]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers18]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers19]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers20]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers21]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers22]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers23]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers24]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers25]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers26]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers27]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers28]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers29]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers30]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers31]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers32]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers33]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers34]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers35]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers36]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers37]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers38]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers39]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_links0]
+type=SimpleIntLink
+bandwidth_factor=512
+eventq_index=0
+latency=1
+link_id=0
+node_a=system.ruby.network.ext_links0.int_node
+node_b=system.ruby.network.ext_links1.int_node
+weight=1
+
+[system.ruby.network.int_links1]
+type=SimpleIntLink
+bandwidth_factor=512
+eventq_index=0
+latency=1
+link_id=1
+node_a=system.ruby.network.ext_links0.int_node
+node_b=system.ruby.network.ext_links2.int_node
+weight=1
+
+[system.ruby.phys_mem]
+type=SimpleMemory
+bandwidth=73.000000
+clk_domain=system.ruby.clk_domain
+conf_table_reported=true
+eventq_index=0
+in_addr_map=false
+latency=30000
+latency_var=0
+null=false
+range=0:536870911
+
+[system.sqc_cntrl0]
+type=SQC_Controller
+children=L1cache mandatoryQueue probeToSQC requestFromSQC responseFromSQC responseToSQC sequencer unblockFromCore
+L1cache=system.sqc_cntrl0.L1cache
+TCC_select_num_bits=0
+buffer_size=0
+clk_domain=system.clk_domain
+cluster_id=0
+eventq_index=0
+issue_latency=80
+l2_hit_latency=18
+mandatoryQueue=system.sqc_cntrl0.mandatoryQueue
+number_of_TBEs=256
+probeToSQC=system.sqc_cntrl0.probeToSQC
+recycle_latency=10
+requestFromSQC=system.sqc_cntrl0.requestFromSQC
+responseFromSQC=system.sqc_cntrl0.responseFromSQC
+responseToSQC=system.sqc_cntrl0.responseToSQC
+ruby_system=system.ruby
+sequencer=system.sqc_cntrl0.sequencer
+system=system
+transitions_per_cycle=32
+unblockFromCore=system.sqc_cntrl0.unblockFromCore
+version=0
+
+[system.sqc_cntrl0.L1cache]
+type=RubyCache
+children=replacement_policy
+assoc=8
+block_size=0
+dataAccessLatency=4
+dataArrayBanks=16
+eventq_index=0
+is_icache=false
+replacement_policy=system.sqc_cntrl0.L1cache.replacement_policy
+resourceStalls=true
+ruby_system=system.ruby
+size=32768
+start_index_bit=6
+tagAccessLatency=1
+tagArrayBanks=4
+
+[system.sqc_cntrl0.L1cache.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=8
+block_size=64
+eventq_index=0
+size=32768
+
+[system.sqc_cntrl0.mandatoryQueue]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+
+[system.sqc_cntrl0.probeToSQC]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[9]
+
+[system.sqc_cntrl0.requestFromSQC]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[11]
+
+[system.sqc_cntrl0.responseFromSQC]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[12]
+
+[system.sqc_cntrl0.responseToSQC]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[10]
+
+[system.sqc_cntrl0.sequencer]
+type=RubySequencer
+clk_domain=system.clk_domain
+coreid=99
+dcache=system.sqc_cntrl0.L1cache
+dcache_hit_latency=1
+deadlock_threshold=500000
+eventq_index=0
+icache=system.sqc_cntrl0.L1cache
+icache_hit_latency=1
+is_cpu_sequencer=false
+max_outstanding_requests=16
+no_retry_on_stall=false
+ruby_system=system.ruby
+support_data_reqs=false
+support_inst_reqs=true
+system=system
+using_network_tester=false
+using_ruby_tester=false
+version=6
+slave=system.cpu1.CUs0.sqc_port system.cpu1.CUs1.sqc_port
+
+[system.sqc_cntrl0.unblockFromCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[13]
+
+[system.sqc_coalescer]
+type=TLBCoalescer
+children=clk_domain
+clk_domain=system.sqc_coalescer.clk_domain
+coalescingWindow=1
+disableCoalescing=false
+eventq_index=0
+probesPerCycle=2
+master=system.sqc_tlb.slave[0]
+slave=system.cpu1.CUs0.sqc_tlb_port system.cpu1.CUs1.sqc_tlb_port
+
+[system.sqc_coalescer.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.sqc_coalescer.clk_domain.voltage_domain
+
+[system.sqc_coalescer.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.sqc_tlb]
+type=X86GPUTLB
+children=clk_domain
+accessDistance=false
+allocationPolicy=true
+assoc=32
+clk_domain=system.sqc_tlb.clk_domain
+eventq_index=0
+hitLatency=1
+maxOutstandingReqs=64
+missLatency1=5
+missLatency2=750
+size=32
+master=system.l2_coalescer.slave[0]
+slave=system.sqc_coalescer.master[0]
+
+[system.sqc_tlb.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.sqc_tlb.clk_domain.voltage_domain
+
+[system.sqc_tlb.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.sys_port_proxy]
+type=RubyPortProxy
+clk_domain=system.clk_domain
+eventq_index=0
+is_cpu_sequencer=true
+no_retry_on_stall=false
+ruby_system=system.ruby
+support_data_reqs=true
+support_inst_reqs=true
+system=system
+using_ruby_tester=false
+version=0
+slave=system.system_port
+
+[system.tcc_cntrl0]
+type=TCC_Controller
+children=L2cache responseFromTCC responseToTCC w_TCCUnblockToTCCDir w_probeToTCC w_reqToTCC w_reqToTCCDir w_respToTCC w_respToTCCDir
+L2cache=system.tcc_cntrl0.L2cache
+TCC_select_num_bits=0
+buffer_size=0
+clk_domain=system.clk_domain
+cluster_id=0
+eventq_index=0
+l2_request_latency=1
+l2_response_latency=16
+number_of_TBEs=2048
+recycle_latency=10
+responseFromTCC=system.tcc_cntrl0.responseFromTCC
+responseToTCC=system.tcc_cntrl0.responseToTCC
+ruby_system=system.ruby
+system=system
+transitions_per_cycle=32
+version=0
+w_TCCUnblockToTCCDir=system.tcc_cntrl0.w_TCCUnblockToTCCDir
+w_probeToTCC=system.tcc_cntrl0.w_probeToTCC
+w_reqToTCC=system.tcc_cntrl0.w_reqToTCC
+w_reqToTCCDir=system.tcc_cntrl0.w_reqToTCCDir
+w_respToTCC=system.tcc_cntrl0.w_respToTCC
+w_respToTCCDir=system.tcc_cntrl0.w_respToTCCDir
+
+[system.tcc_cntrl0.L2cache]
+type=RubyCache
+children=replacement_policy
+assoc=16
+block_size=0
+dataAccessLatency=8
+dataArrayBanks=256
+eventq_index=0
+is_icache=false
+replacement_policy=system.tcc_cntrl0.L2cache.replacement_policy
+resourceStalls=true
+ruby_system=system.ruby
+size=262144.0
+start_index_bit=6
+tagAccessLatency=2
+tagArrayBanks=256
+
+[system.tcc_cntrl0.L2cache.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=16
+block_size=64
+eventq_index=0
+size=262144.0
+
+[system.tcc_cntrl0.responseFromTCC]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[14]
+
+[system.tcc_cntrl0.responseToTCC]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[11]
+
+[system.tcc_cntrl0.w_TCCUnblockToTCCDir]
+type=RubyWireBuffer
+eventq_index=0
+ruby_system=system.ruby
+
+[system.tcc_cntrl0.w_probeToTCC]
+type=RubyWireBuffer
+eventq_index=0
+ruby_system=system.ruby
+
+[system.tcc_cntrl0.w_reqToTCC]
+type=RubyWireBuffer
+eventq_index=0
+ruby_system=system.ruby
+
+[system.tcc_cntrl0.w_reqToTCCDir]
+type=RubyWireBuffer
+eventq_index=0
+ruby_system=system.ruby
+
+[system.tcc_cntrl0.w_respToTCC]
+type=RubyWireBuffer
+eventq_index=0
+ruby_system=system.ruby
+
+[system.tcc_cntrl0.w_respToTCCDir]
+type=RubyWireBuffer
+eventq_index=0
+ruby_system=system.ruby
+
+[system.tccdir_cntrl0]
+type=TCCdir_Controller
+children=directory probeFromNB probeToCore requestFromTCP requestToNB responseFromNB responseFromTCP responseToCore responseToNB triggerQueue unblockFromTCP unblockToNB
+TCC_select_num_bits=0
+buffer_size=0
+clk_domain=system.clk_domain
+cluster_id=0
+directory=system.tccdir_cntrl0.directory
+directory_latency=6
+eventq_index=0
+issue_latency=120
+number_of_TBEs=1024
+probeFromNB=system.tccdir_cntrl0.probeFromNB
+probeToCore=system.tccdir_cntrl0.probeToCore
+recycle_latency=10
+requestFromTCP=system.tccdir_cntrl0.requestFromTCP
+requestToNB=system.tccdir_cntrl0.requestToNB
+responseFromNB=system.tccdir_cntrl0.responseFromNB
+responseFromTCP=system.tccdir_cntrl0.responseFromTCP
+responseToCore=system.tccdir_cntrl0.responseToCore
+responseToNB=system.tccdir_cntrl0.responseToNB
+response_latency=5
+ruby_system=system.ruby
+system=system
+transitions_per_cycle=32
+triggerQueue=system.tccdir_cntrl0.triggerQueue
+unblockFromTCP=system.tccdir_cntrl0.unblockFromTCP
+unblockToNB=system.tccdir_cntrl0.unblockToNB
+version=0
+w_TCCUnblockToTCCDir=system.tcc_cntrl0.w_TCCUnblockToTCCDir
+w_probeToTCC=system.tcc_cntrl0.w_probeToTCC
+w_reqToTCC=system.tcc_cntrl0.w_reqToTCC
+w_reqToTCCDir=system.tcc_cntrl0.w_reqToTCCDir
+w_respToTCC=system.tcc_cntrl0.w_respToTCC
+w_respToTCCDir=system.tcc_cntrl0.w_respToTCCDir
+
+[system.tccdir_cntrl0.directory]
+type=RubyCache
+children=replacement_policy
+assoc=16
+block_size=0
+dataAccessLatency=1
+dataArrayBanks=1
+eventq_index=0
+is_icache=false
+replacement_policy=system.tccdir_cntrl0.directory.replacement_policy
+resourceStalls=false
+ruby_system=system.ruby
+size=393216
+start_index_bit=6
+tagAccessLatency=1
+tagArrayBanks=1
+
+[system.tccdir_cntrl0.directory.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=16
+block_size=64
+eventq_index=0
+size=393216
+
+[system.tccdir_cntrl0.probeFromNB]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+slave=system.ruby.network.master[15]
+
+[system.tccdir_cntrl0.probeToCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[15]
+
+[system.tccdir_cntrl0.requestFromTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[12]
+
+[system.tccdir_cntrl0.requestToNB]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+master=system.ruby.network.slave[17]
+
+[system.tccdir_cntrl0.responseFromNB]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+slave=system.ruby.network.master[16]
+
+[system.tccdir_cntrl0.responseFromTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[13]
+
+[system.tccdir_cntrl0.responseToCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[16]
+
+[system.tccdir_cntrl0.responseToNB]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+master=system.ruby.network.slave[18]
+
+[system.tccdir_cntrl0.triggerQueue]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.tccdir_cntrl0.unblockFromTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[14]
+
+[system.tccdir_cntrl0.unblockToNB]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+master=system.ruby.network.slave[19]
+
+[system.tcp_cntrl0]
+type=TCP_Controller
+children=L1cache coalescer mandatoryQueue probeToTCP requestFromTCP responseFromTCP responseToTCP sequencer unblockFromCore
+L1cache=system.tcp_cntrl0.L1cache
+TCC_select_num_bits=0
+buffer_size=0
+clk_domain=system.clk_domain
+cluster_id=0
+coalescer=system.tcp_cntrl0.coalescer
+eventq_index=0
+issue_latency=40
+l2_hit_latency=18
+mandatoryQueue=system.tcp_cntrl0.mandatoryQueue
+number_of_TBEs=2560
+probeToTCP=system.tcp_cntrl0.probeToTCP
+recycle_latency=10
+requestFromTCP=system.tcp_cntrl0.requestFromTCP
+responseFromTCP=system.tcp_cntrl0.responseFromTCP
+responseToTCP=system.tcp_cntrl0.responseToTCP
+ruby_system=system.ruby
+sequencer=system.tcp_cntrl0.sequencer
+system=system
+transitions_per_cycle=32
+unblockFromCore=system.tcp_cntrl0.unblockFromCore
+use_seq_not_coal=false
+version=0
+
+[system.tcp_cntrl0.L1cache]
+type=RubyCache
+children=replacement_policy
+assoc=8
+block_size=0
+dataAccessLatency=4
+dataArrayBanks=16
+eventq_index=0
+is_icache=false
+replacement_policy=system.tcp_cntrl0.L1cache.replacement_policy
+resourceStalls=true
+ruby_system=system.ruby
+size=16384
+start_index_bit=6
+tagAccessLatency=4
+tagArrayBanks=4
+
+[system.tcp_cntrl0.L1cache.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=8
+block_size=64
+eventq_index=0
+size=16384
+
+[system.tcp_cntrl0.coalescer]
+type=RubyGPUCoalescer
+assume_rfo=true
+clk_domain=system.clk_domain
+coreid=99
+dcache=system.tcp_cntrl0.L1cache
+dcache_hit_latency=1
+deadlock_threshold=500000
+eventq_index=0
+icache=system.tcp_cntrl0.L1cache
+icache_hit_latency=1
+is_cpu_sequencer=false
+max_outstanding_requests=2048
+no_retry_on_stall=false
+ruby_system=system.ruby
+support_data_reqs=true
+support_inst_reqs=false
+system=system
+using_network_tester=false
+using_ruby_tester=false
+version=2
+slave=system.cpu1.CUs0.memory_port[0] system.cpu1.CUs0.memory_port[1] system.cpu1.CUs0.memory_port[2] system.cpu1.CUs0.memory_port[3] system.cpu1.CUs0.memory_port[4] system.cpu1.CUs0.memory_port[5] system.cpu1.CUs0.memory_port[6] system.cpu1.CUs0.memory_port[7] system.cpu1.CUs0.memory_port[8] system.cpu1.CUs0.memory_port[9] system.cpu1.CUs0.memory_port[10] system.cpu1.CUs0.memory_port[11] system.cpu1.CUs0.memory_port[12] system.cpu1.CUs0.memory_port[13] system.cpu1.CUs0.memory_port[14] system.cpu1.CUs0.memory_port[15] system.cpu1.CUs0.memory_port[16] system.cpu1.CUs0.memory_port[17] system.cpu1.CUs0.memory_port[18] system.cpu1.CUs0.memory_port[19] system.cpu1.CUs0.memory_port[20] system.cpu1.CUs0.memory_port[21] system.cpu1.CUs0.memory_port[22] system.cpu1.CUs0.memory_port[23] system.cpu1.CUs0.memory_port[24] system.cpu1.CUs0.memory_port[25] system.cpu1.CUs0.memory_port[26] system.cpu1.CUs0.memory_port[27] system.cpu1.CUs0.memory_port[28] system.cpu1.CUs0.memory_port[29] system.cpu1.CUs0.memory_port[30] system.cpu1.CUs0.memory_port[31] system.cpu1.CUs0.memory_port[32] system.cpu1.CUs0.memory_port[33] system.cpu1.CUs0.memory_port[34] system.cpu1.CUs0.memory_port[35] system.cpu1.CUs0.memory_port[36] system.cpu1.CUs0.memory_port[37] system.cpu1.CUs0.memory_port[38] system.cpu1.CUs0.memory_port[39] system.cpu1.CUs0.memory_port[40] system.cpu1.CUs0.memory_port[41] system.cpu1.CUs0.memory_port[42] system.cpu1.CUs0.memory_port[43] system.cpu1.CUs0.memory_port[44] system.cpu1.CUs0.memory_port[45] system.cpu1.CUs0.memory_port[46] system.cpu1.CUs0.memory_port[47] system.cpu1.CUs0.memory_port[48] system.cpu1.CUs0.memory_port[49] system.cpu1.CUs0.memory_port[50] system.cpu1.CUs0.memory_port[51] system.cpu1.CUs0.memory_port[52] system.cpu1.CUs0.memory_port[53] system.cpu1.CUs0.memory_port[54] system.cpu1.CUs0.memory_port[55] system.cpu1.CUs0.memory_port[56] system.cpu1.CUs0.memory_port[57] system.cpu1.CUs0.memory_port[58] system.cpu1.CUs0.memory_port[59] 
system.cpu1.CUs0.memory_port[60] system.cpu1.CUs0.memory_port[61] system.cpu1.CUs0.memory_port[62] system.cpu1.CUs0.memory_port[63]
+
+[system.tcp_cntrl0.mandatoryQueue]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+
+[system.tcp_cntrl0.probeToTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[5]
+
+[system.tcp_cntrl0.requestFromTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[5]
+
+[system.tcp_cntrl0.responseFromTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[6]
+
+[system.tcp_cntrl0.responseToTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[6]
+
+[system.tcp_cntrl0.sequencer]
+type=RubySequencer
+clk_domain=system.clk_domain
+coreid=99
+dcache=system.tcp_cntrl0.L1cache
+dcache_hit_latency=1
+deadlock_threshold=500000
+eventq_index=0
+icache=system.tcp_cntrl0.L1cache
+icache_hit_latency=1
+is_cpu_sequencer=true
+max_outstanding_requests=16
+no_retry_on_stall=false
+ruby_system=system.ruby
+support_data_reqs=true
+support_inst_reqs=true
+system=system
+using_network_tester=false
+using_ruby_tester=false
+version=3
+
+[system.tcp_cntrl0.unblockFromCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[7]
+
+[system.tcp_cntrl1]
+type=TCP_Controller
+children=L1cache coalescer mandatoryQueue probeToTCP requestFromTCP responseFromTCP responseToTCP sequencer unblockFromCore
+L1cache=system.tcp_cntrl1.L1cache
+TCC_select_num_bits=0
+buffer_size=0
+clk_domain=system.clk_domain
+cluster_id=0
+coalescer=system.tcp_cntrl1.coalescer
+eventq_index=0
+issue_latency=40
+l2_hit_latency=18
+mandatoryQueue=system.tcp_cntrl1.mandatoryQueue
+number_of_TBEs=2560
+probeToTCP=system.tcp_cntrl1.probeToTCP
+recycle_latency=10
+requestFromTCP=system.tcp_cntrl1.requestFromTCP
+responseFromTCP=system.tcp_cntrl1.responseFromTCP
+responseToTCP=system.tcp_cntrl1.responseToTCP
+ruby_system=system.ruby
+sequencer=system.tcp_cntrl1.sequencer
+system=system
+transitions_per_cycle=32
+unblockFromCore=system.tcp_cntrl1.unblockFromCore
+use_seq_not_coal=false
+version=1
+
+[system.tcp_cntrl1.L1cache]
+type=RubyCache
+children=replacement_policy
+assoc=8
+block_size=0
+dataAccessLatency=4
+dataArrayBanks=16
+eventq_index=0
+is_icache=false
+replacement_policy=system.tcp_cntrl1.L1cache.replacement_policy
+resourceStalls=true
+ruby_system=system.ruby
+size=16384
+start_index_bit=6
+tagAccessLatency=4
+tagArrayBanks=4
+
+[system.tcp_cntrl1.L1cache.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=8
+block_size=64
+eventq_index=0
+size=16384
+
+[system.tcp_cntrl1.coalescer]
+type=RubyGPUCoalescer
+assume_rfo=true
+clk_domain=system.clk_domain
+coreid=99
+dcache=system.tcp_cntrl1.L1cache
+dcache_hit_latency=1
+deadlock_threshold=500000
+eventq_index=0
+icache=system.tcp_cntrl1.L1cache
+icache_hit_latency=1
+is_cpu_sequencer=false
+max_outstanding_requests=2048
+no_retry_on_stall=false
+ruby_system=system.ruby
+support_data_reqs=true
+support_inst_reqs=false
+system=system
+using_network_tester=false
+using_ruby_tester=false
+version=4
+slave=system.cpu1.CUs1.memory_port[0] system.cpu1.CUs1.memory_port[1] system.cpu1.CUs1.memory_port[2] system.cpu1.CUs1.memory_port[3] system.cpu1.CUs1.memory_port[4] system.cpu1.CUs1.memory_port[5] system.cpu1.CUs1.memory_port[6] system.cpu1.CUs1.memory_port[7] system.cpu1.CUs1.memory_port[8] system.cpu1.CUs1.memory_port[9] system.cpu1.CUs1.memory_port[10] system.cpu1.CUs1.memory_port[11] system.cpu1.CUs1.memory_port[12] system.cpu1.CUs1.memory_port[13] system.cpu1.CUs1.memory_port[14] system.cpu1.CUs1.memory_port[15] system.cpu1.CUs1.memory_port[16] system.cpu1.CUs1.memory_port[17] system.cpu1.CUs1.memory_port[18] system.cpu1.CUs1.memory_port[19] system.cpu1.CUs1.memory_port[20] system.cpu1.CUs1.memory_port[21] system.cpu1.CUs1.memory_port[22] system.cpu1.CUs1.memory_port[23] system.cpu1.CUs1.memory_port[24] system.cpu1.CUs1.memory_port[25] system.cpu1.CUs1.memory_port[26] system.cpu1.CUs1.memory_port[27] system.cpu1.CUs1.memory_port[28] system.cpu1.CUs1.memory_port[29] system.cpu1.CUs1.memory_port[30] system.cpu1.CUs1.memory_port[31] system.cpu1.CUs1.memory_port[32] system.cpu1.CUs1.memory_port[33] system.cpu1.CUs1.memory_port[34] system.cpu1.CUs1.memory_port[35] system.cpu1.CUs1.memory_port[36] system.cpu1.CUs1.memory_port[37] system.cpu1.CUs1.memory_port[38] system.cpu1.CUs1.memory_port[39] system.cpu1.CUs1.memory_port[40] system.cpu1.CUs1.memory_port[41] system.cpu1.CUs1.memory_port[42] system.cpu1.CUs1.memory_port[43] system.cpu1.CUs1.memory_port[44] system.cpu1.CUs1.memory_port[45] system.cpu1.CUs1.memory_port[46] system.cpu1.CUs1.memory_port[47] system.cpu1.CUs1.memory_port[48] system.cpu1.CUs1.memory_port[49] system.cpu1.CUs1.memory_port[50] system.cpu1.CUs1.memory_port[51] system.cpu1.CUs1.memory_port[52] system.cpu1.CUs1.memory_port[53] system.cpu1.CUs1.memory_port[54] system.cpu1.CUs1.memory_port[55] system.cpu1.CUs1.memory_port[56] system.cpu1.CUs1.memory_port[57] system.cpu1.CUs1.memory_port[58] system.cpu1.CUs1.memory_port[59] 
system.cpu1.CUs1.memory_port[60] system.cpu1.CUs1.memory_port[61] system.cpu1.CUs1.memory_port[62] system.cpu1.CUs1.memory_port[63]
+
+[system.tcp_cntrl1.mandatoryQueue]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+
+[system.tcp_cntrl1.probeToTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[7]
+
+[system.tcp_cntrl1.requestFromTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[8]
+
+[system.tcp_cntrl1.responseFromTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[9]
+
+[system.tcp_cntrl1.responseToTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[8]
+
+[system.tcp_cntrl1.sequencer]
+type=RubySequencer
+clk_domain=system.clk_domain
+coreid=99
+dcache=system.tcp_cntrl1.L1cache
+dcache_hit_latency=1
+deadlock_threshold=500000
+eventq_index=0
+icache=system.tcp_cntrl1.L1cache
+icache_hit_latency=1
+is_cpu_sequencer=true
+max_outstanding_requests=16
+no_retry_on_stall=false
+ruby_system=system.ruby
+support_data_reqs=true
+support_inst_reqs=true
+system=system
+using_network_tester=false
+using_ruby_tester=false
+version=5
+
+[system.tcp_cntrl1.unblockFromCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[10]
+
+[system.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
diff --git a/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_RfO/simerr b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_RfO/simerr
new file mode 100755
index 000000000..1e2b8911e
--- /dev/null
+++ b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_RfO/simerr
@@ -0,0 +1,5 @@
+warn: system.ruby.network adopting orphan SimObject param 'int_links'
+warn: system.ruby.network adopting orphan SimObject param 'ext_links'
+warn: DRAM device capacity (8192 Mbytes) does not match the address range assigned (512 Mbytes)
+warn: Sockets disabled, not accepting gdb connections
+warn: Replacement policy updates recently became the responsibility of SLICC state machines. Make sure to setMRU() near callbacks in .sm files!
diff --git a/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_RfO/simout b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_RfO/simout
new file mode 100755
index 000000000..98757d4d3
--- /dev/null
+++ b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_RfO/simout
@@ -0,0 +1,21 @@
+gem5 Simulator System. http://gem5.org
+gem5 is copyrighted software; use the --copyright option for details.
+
+gem5 compiled Jan 19 2016 13:28:55
+gem5 started Jan 19 2016 13:29:16
+gem5 executing on zizzer, pid 48854
+command line: build/HSAIL_X86/gem5.opt -d build/HSAIL_X86/tests/opt/quick/se/04.gpu/x86/linux/gpu-ruby-GPU_RfO -re /z/atgutier/gem5/gem5-commit/tests/run.py build/HSAIL_X86/tests/opt/quick/se/04.gpu/x86/linux/gpu-ruby-GPU_RfO
+
+Using GPU kernel code file(s) /dist/m5/regression/test-progs/gpu-hello/bin/x86/linux/gpu-hello-kernel.asm
+Global frequency set at 1000000000000 ticks per second
+Forcing maxCoalescedReqs to 32 (TLB assoc.)
+Forcing maxCoalescedReqs to 32 (TLB assoc.)
+Forcing maxCoalescedReqs to 32 (TLB assoc.)
+Forcing maxCoalescedReqs to 32 (TLB assoc.)
+Forcing maxCoalescedReqs to 32 (TLB assoc.)
+Forcing maxCoalescedReqs to 32 (TLB assoc.)
+info: Entering event queue @ 0. Starting simulation...
+keys = 0x7b2bc0, &keys = 0x798998, keys[0] = 23
+the gpu says:
+elloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloe
+Exiting @ tick 663454500 because target called exit()
diff --git a/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_RfO/stats.txt b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_RfO/stats.txt
new file mode 100644
index 000000000..ac9e12c7a
--- /dev/null
+++ b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_RfO/stats.txt
@@ -0,0 +1,3202 @@
+
+---------- Begin Simulation Statistics ----------
+sim_seconds 0.000663 # Number of seconds simulated
+sim_ticks 663454500 # Number of ticks simulated
+final_tick 663454500 # Number of ticks from beginning of simulation (restored from checkpoints and never reset)
+sim_freq 1000000000000 # Frequency of simulated ticks
+host_inst_rate 63999 # Simulator instruction rate (inst/s)
+host_op_rate 131608 # Simulator op (including micro ops) rate (op/s)
+host_tick_rate 634065338 # Simulator tick rate (ticks/s)
+host_mem_usage 1301448 # Number of bytes of host memory used
+host_seconds 1.05 # Real time elapsed on the host
+sim_insts 66963 # Number of instructions simulated
+sim_ops 137705 # Number of ops (including micro ops) simulated
+system.voltage_domain.voltage 1 # Voltage in Volts
+system.clk_domain.clock 1000 # Clock period in ticks
+system.mem_ctrls.bytes_read::dir_cntrl0 99264 # Number of bytes read from this memory
+system.mem_ctrls.bytes_read::total 99264 # Number of bytes read from this memory
+system.mem_ctrls.num_reads::dir_cntrl0 1551 # Number of read requests responded to by this memory
+system.mem_ctrls.num_reads::total 1551 # Number of read requests responded to by this memory
+system.mem_ctrls.bw_read::dir_cntrl0 149616892 # Total read bandwidth from this memory (bytes/s)
+system.mem_ctrls.bw_read::total 149616892 # Total read bandwidth from this memory (bytes/s)
+system.mem_ctrls.bw_total::dir_cntrl0 149616892 # Total bandwidth to/from this memory (bytes/s)
+system.mem_ctrls.bw_total::total 149616892 # Total bandwidth to/from this memory (bytes/s)
+system.mem_ctrls.readReqs 1551 # Number of read requests accepted
+system.mem_ctrls.writeReqs 0 # Number of write requests accepted
+system.mem_ctrls.readBursts 1551 # Number of DRAM read bursts, including those serviced by the write queue
+system.mem_ctrls.writeBursts 0 # Number of DRAM write bursts, including those merged in the write queue
+system.mem_ctrls.bytesReadDRAM 99264 # Total number of bytes read from DRAM
+system.mem_ctrls.bytesReadWrQ 0 # Total number of bytes read from write queue
+system.mem_ctrls.bytesWritten 0 # Total number of bytes written to DRAM
+system.mem_ctrls.bytesReadSys 99264 # Total read bytes from the system interface side
+system.mem_ctrls.bytesWrittenSys 0 # Total written bytes from the system interface side
+system.mem_ctrls.servicedByWrQ 0 # Number of DRAM read bursts serviced by the write queue
+system.mem_ctrls.mergedWrBursts 0 # Number of DRAM write bursts merged with an existing one
+system.mem_ctrls.neitherReadNorWriteReqs 0 # Number of requests that are neither read nor write
+system.mem_ctrls.perBankRdBursts::0 122 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::1 192 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::2 93 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::3 44 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::4 61 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::5 79 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::6 52 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::7 42 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::8 54 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::9 56 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::10 174 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::11 90 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::12 222 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::13 125 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::14 51 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::15 94 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::0 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::1 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::2 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::3 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::4 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::5 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::6 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::7 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::8 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::9 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::10 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::11 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::12 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::13 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::14 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::15 0 # Per bank write bursts
+system.mem_ctrls.numRdRetry 0 # Number of times read queue was full causing retry
+system.mem_ctrls.numWrRetry 0 # Number of times write queue was full causing retry
+system.mem_ctrls.totGap 663221000 # Total gap between requests
+system.mem_ctrls.readPktSize::0 0 # Read request sizes (log2)
+system.mem_ctrls.readPktSize::1 0 # Read request sizes (log2)
+system.mem_ctrls.readPktSize::2 0 # Read request sizes (log2)
+system.mem_ctrls.readPktSize::3 0 # Read request sizes (log2)
+system.mem_ctrls.readPktSize::4 0 # Read request sizes (log2)
+system.mem_ctrls.readPktSize::5 0 # Read request sizes (log2)
+system.mem_ctrls.readPktSize::6 1551 # Read request sizes (log2)
+system.mem_ctrls.writePktSize::0 0 # Write request sizes (log2)
+system.mem_ctrls.writePktSize::1 0 # Write request sizes (log2)
+system.mem_ctrls.writePktSize::2 0 # Write request sizes (log2)
+system.mem_ctrls.writePktSize::3 0 # Write request sizes (log2)
+system.mem_ctrls.writePktSize::4 0 # Write request sizes (log2)
+system.mem_ctrls.writePktSize::5 0 # Write request sizes (log2)
+system.mem_ctrls.writePktSize::6 0 # Write request sizes (log2)
+system.mem_ctrls.rdQLenPdf::0 1542 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::1 2 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::2 1 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::3 1 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::4 2 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::5 3 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::6 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::7 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::8 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::9 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::10 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::11 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::12 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::13 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::14 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::15 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::16 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::17 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::18 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::19 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::20 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::21 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::22 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::23 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::24 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::25 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::26 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::27 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::28 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::29 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::30 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::31 0 # What read queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::0 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::1 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::2 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::3 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::4 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::5 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::6 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::7 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::8 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::9 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::10 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::11 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::12 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::13 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::14 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::15 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::16 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::17 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::18 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::19 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::20 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::21 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::22 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::23 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::24 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::25 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::26 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::27 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::28 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::29 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::30 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::31 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::32 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::33 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::34 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::35 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::36 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::37 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::38 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::39 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::40 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::41 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::42 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::43 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::44 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::45 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::46 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::47 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::48 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::49 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::50 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::51 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::52 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::53 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::54 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::55 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::56 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::57 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::58 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::59 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::60 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::61 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::62 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::63 0 # What write queue length does an incoming req see
+system.mem_ctrls.bytesPerActivate::samples 485 # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::mean 204.008247 # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::gmean 145.772769 # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::stdev 192.306659 # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::0-127 178 36.70% 36.70% # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::128-255 156 32.16% 68.87% # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::256-383 70 14.43% 83.30% # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::384-511 40 8.25% 91.55% # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::512-639 15 3.09% 94.64% # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::640-767 10 2.06% 96.70% # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::768-895 9 1.86% 98.56% # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::896-1023 2 0.41% 98.97% # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::1024-1151 5 1.03% 100.00% # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::total 485 # Bytes accessed per row activation
+system.mem_ctrls.totQLat 15500500 # Total ticks spent queuing
+system.mem_ctrls.totMemAccLat 44581750 # Total ticks spent from burst creation until serviced by the DRAM
+system.mem_ctrls.totBusLat 7755000 # Total ticks spent in databus transfers
+system.mem_ctrls.avgQLat 9993.87 # Average queueing delay per DRAM burst
+system.mem_ctrls.avgBusLat 5000.00 # Average bus latency per DRAM burst
+system.mem_ctrls.avgMemAccLat 28743.87 # Average memory access latency per DRAM burst
+system.mem_ctrls.avgRdBW 149.62 # Average DRAM read bandwidth in MiByte/s
+system.mem_ctrls.avgWrBW 0.00 # Average achieved write bandwidth in MiByte/s
+system.mem_ctrls.avgRdBWSys 149.62 # Average system read bandwidth in MiByte/s
+system.mem_ctrls.avgWrBWSys 0.00 # Average system write bandwidth in MiByte/s
+system.mem_ctrls.peakBW 12800.00 # Theoretical peak bandwidth in MiByte/s
+system.mem_ctrls.busUtil 1.17 # Data bus utilization in percentage
+system.mem_ctrls.busUtilRead 1.17 # Data bus utilization in percentage for reads
+system.mem_ctrls.busUtilWrite 0.00 # Data bus utilization in percentage for writes
+system.mem_ctrls.avgRdQLen 1.00 # Average read queue length when enqueuing
+system.mem_ctrls.avgWrQLen 0.00 # Average write queue length when enqueuing
+system.mem_ctrls.readRowHits 1062 # Number of row buffer hits during reads
+system.mem_ctrls.writeRowHits 0 # Number of row buffer hits during writes
+system.mem_ctrls.readRowHitRate 68.47 # Row buffer hit rate for reads
+system.mem_ctrls.writeRowHitRate nan # Row buffer hit rate for writes
+system.mem_ctrls.avgGap 427608.64 # Average gap between requests
+system.mem_ctrls.pageHitRate 68.47 # Row buffer hit rate, read and write combined
+system.mem_ctrls_0.actEnergy 1391040 # Energy for activate commands per rank (pJ)
+system.mem_ctrls_0.preEnergy 759000 # Energy for precharge commands per rank (pJ)
+system.mem_ctrls_0.readEnergy 5335200 # Energy for read commands per rank (pJ)
+system.mem_ctrls_0.writeEnergy 0 # Energy for write commands per rank (pJ)
+system.mem_ctrls_0.refreshEnergy 43227600 # Energy for refresh commands per rank (pJ)
+system.mem_ctrls_0.actBackEnergy 335485755 # Energy for active background per rank (pJ)
+system.mem_ctrls_0.preBackEnergy 102969000 # Energy for precharge background per rank (pJ)
+system.mem_ctrls_0.totalEnergy 489167595 # Total energy per rank (pJ)
+system.mem_ctrls_0.averagePower 738.822020 # Core power per rank (mW)
+system.mem_ctrls_0.memoryStateTime::IDLE 170399250 # Time in different power states
+system.mem_ctrls_0.memoryStateTime::REF 22100000 # Time in different power states
+system.mem_ctrls_0.memoryStateTime::PRE_PDN 0 # Time in different power states
+system.mem_ctrls_0.memoryStateTime::ACT 470741750 # Time in different power states
+system.mem_ctrls_0.memoryStateTime::ACT_PDN 0 # Time in different power states
+system.mem_ctrls_1.actEnergy 2275560 # Energy for activate commands per rank (pJ)
+system.mem_ctrls_1.preEnergy 1241625 # Energy for precharge commands per rank (pJ)
+system.mem_ctrls_1.readEnergy 6723600 # Energy for read commands per rank (pJ)
+system.mem_ctrls_1.writeEnergy 0 # Energy for write commands per rank (pJ)
+system.mem_ctrls_1.refreshEnergy 43227600 # Energy for refresh commands per rank (pJ)
+system.mem_ctrls_1.actBackEnergy 371983995 # Energy for active background per rank (pJ)
+system.mem_ctrls_1.preBackEnergy 70953000 # Energy for precharge background per rank (pJ)
+system.mem_ctrls_1.totalEnergy 496405380 # Total energy per rank (pJ)
+system.mem_ctrls_1.averagePower 749.753724 # Core power per rank (mW)
+system.mem_ctrls_1.memoryStateTime::IDLE 115859750 # Time in different power states
+system.mem_ctrls_1.memoryStateTime::REF 22100000 # Time in different power states
+system.mem_ctrls_1.memoryStateTime::PRE_PDN 0 # Time in different power states
+system.mem_ctrls_1.memoryStateTime::ACT 524145250 # Time in different power states
+system.mem_ctrls_1.memoryStateTime::ACT_PDN 0 # Time in different power states
+system.ruby.clk_domain.clock 500 # Clock period in ticks
+system.ruby.phys_mem.bytes_read::cpu0.inst 696760 # Number of bytes read from this memory
+system.ruby.phys_mem.bytes_read::cpu0.data 119832 # Number of bytes read from this memory
+system.ruby.phys_mem.bytes_read::cpu1.CUs0.ComputeUnit 3280 # Number of bytes read from this memory
+system.ruby.phys_mem.bytes_read::cpu1.CUs1.ComputeUnit 3280 # Number of bytes read from this memory
+system.ruby.phys_mem.bytes_read::total 823152 # Number of bytes read from this memory
+system.ruby.phys_mem.bytes_inst_read::cpu0.inst 696760 # Number of instructions bytes read from this memory
+system.ruby.phys_mem.bytes_inst_read::cpu1.CUs0.ComputeUnit 2000 # Number of instructions bytes read from this memory
+system.ruby.phys_mem.bytes_inst_read::cpu1.CUs1.ComputeUnit 2000 # Number of instructions bytes read from this memory
+system.ruby.phys_mem.bytes_inst_read::total 700760 # Number of instructions bytes read from this memory
+system.ruby.phys_mem.bytes_written::cpu0.data 72767 # Number of bytes written to this memory
+system.ruby.phys_mem.bytes_written::cpu1.CUs0.ComputeUnit 256 # Number of bytes written to this memory
+system.ruby.phys_mem.bytes_written::cpu1.CUs1.ComputeUnit 256 # Number of bytes written to this memory
+system.ruby.phys_mem.bytes_written::total 73279 # Number of bytes written to this memory
+system.ruby.phys_mem.num_reads::cpu0.inst 87095 # Number of read requests responded to by this memory
+system.ruby.phys_mem.num_reads::cpu0.data 16686 # Number of read requests responded to by this memory
+system.ruby.phys_mem.num_reads::cpu1.CUs0.ComputeUnit 555 # Number of read requests responded to by this memory
+system.ruby.phys_mem.num_reads::cpu1.CUs1.ComputeUnit 555 # Number of read requests responded to by this memory
+system.ruby.phys_mem.num_reads::total 104891 # Number of read requests responded to by this memory
+system.ruby.phys_mem.num_writes::cpu0.data 10422 # Number of write requests responded to by this memory
+system.ruby.phys_mem.num_writes::cpu1.CUs0.ComputeUnit 256 # Number of write requests responded to by this memory
+system.ruby.phys_mem.num_writes::cpu1.CUs1.ComputeUnit 256 # Number of write requests responded to by this memory
+system.ruby.phys_mem.num_writes::total 10934 # Number of write requests responded to by this memory
+system.ruby.phys_mem.bw_read::cpu0.inst 1050200127 # Total read bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_read::cpu0.data 180618264 # Total read bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_read::cpu1.CUs0.ComputeUnit 4943821 # Total read bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_read::cpu1.CUs1.ComputeUnit 4943821 # Total read bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_read::total 1240706032 # Total read bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_inst_read::cpu0.inst 1050200127 # Instruction read bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_inst_read::cpu1.CUs0.ComputeUnit 3014525 # Instruction read bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_inst_read::cpu1.CUs1.ComputeUnit 3014525 # Instruction read bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_inst_read::total 1056229176 # Instruction read bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_write::cpu0.data 109678961 # Write bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_write::cpu1.CUs0.ComputeUnit 385859 # Write bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_write::cpu1.CUs1.ComputeUnit 385859 # Write bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_write::total 110450679 # Write bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_total::cpu0.inst 1050200127 # Total bandwidth to/from this memory (bytes/s)
+system.ruby.phys_mem.bw_total::cpu0.data 290297225 # Total bandwidth to/from this memory (bytes/s)
+system.ruby.phys_mem.bw_total::cpu1.CUs0.ComputeUnit 5329680 # Total bandwidth to/from this memory (bytes/s)
+system.ruby.phys_mem.bw_total::cpu1.CUs1.ComputeUnit 5329680 # Total bandwidth to/from this memory (bytes/s)
+system.ruby.phys_mem.bw_total::total 1351156711 # Total bandwidth to/from this memory (bytes/s)
+system.ruby.outstanding_req_hist::bucket_size 1
+system.ruby.outstanding_req_hist::max_bucket 9
+system.ruby.outstanding_req_hist::samples 114203
+system.ruby.outstanding_req_hist::mean 1.000035
+system.ruby.outstanding_req_hist::gmean 1.000024
+system.ruby.outstanding_req_hist::stdev 0.005918
+system.ruby.outstanding_req_hist | 0 0.00% 0.00% | 114199 100.00% 100.00% | 4 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.outstanding_req_hist::total 114203
+system.ruby.latency_hist::bucket_size 64
+system.ruby.latency_hist::max_bucket 639
+system.ruby.latency_hist::samples 114203
+system.ruby.latency_hist::mean 4.784183
+system.ruby.latency_hist::gmean 2.131364
+system.ruby.latency_hist::stdev 23.846744
+system.ruby.latency_hist | 112668 98.66% 98.66% | 0 0.00% 98.66% | 0 0.00% 98.66% | 1506 1.32% 99.97% | 19 0.02% 99.99% | 10 0.01% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.latency_hist::total 114203
+system.ruby.hit_latency_hist::bucket_size 64
+system.ruby.hit_latency_hist::max_bucket 639
+system.ruby.hit_latency_hist::samples 1535
+system.ruby.hit_latency_hist::mean 208.449511
+system.ruby.hit_latency_hist::gmean 208.002927
+system.ruby.hit_latency_hist::stdev 15.847049
+system.ruby.hit_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 1506 98.11% 98.11% | 19 1.24% 99.35% | 10 0.65% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.hit_latency_hist::total 1535
+system.ruby.miss_latency_hist::bucket_size 4
+system.ruby.miss_latency_hist::max_bucket 39
+system.ruby.miss_latency_hist::samples 112668
+system.ruby.miss_latency_hist::mean 2.009426
+system.ruby.miss_latency_hist::gmean 2.002413
+system.ruby.miss_latency_hist::stdev 0.411800
+system.ruby.miss_latency_hist | 112609 99.95% 99.95% | 0 0.00% 99.95% | 0 0.00% 99.95% | 0 0.00% 99.95% | 0 0.00% 99.95% | 59 0.05% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.miss_latency_hist::total 112668
+system.ruby.L1Cache.incomplete_times 112609
+system.ruby.L2Cache.incomplete_times 59
+system.cp_cntrl0.L1D0cache.demand_hits 0 # Number of cache demand hits
+system.cp_cntrl0.L1D0cache.demand_misses 506 # Number of cache demand misses
+system.cp_cntrl0.L1D0cache.demand_accesses 506 # Number of cache demand accesses
+system.cp_cntrl0.L1D0cache.num_data_array_reads 16155 # number of data array reads
+system.cp_cntrl0.L1D0cache.num_data_array_writes 11985 # number of data array writes
+system.cp_cntrl0.L1D0cache.num_tag_array_reads 27132 # number of tag array reads
+system.cp_cntrl0.L1D0cache.num_tag_array_writes 1584 # number of tag array writes
+system.cp_cntrl0.L1D1cache.demand_hits 0 # Number of cache demand hits
+system.cp_cntrl0.L1D1cache.demand_misses 0 # Number of cache demand misses
+system.cp_cntrl0.L1D1cache.demand_accesses 0 # Number of cache demand accesses
+system.cp_cntrl0.L1Icache.demand_hits 0 # Number of cache demand hits
+system.cp_cntrl0.L1Icache.demand_misses 1088 # Number of cache demand misses
+system.cp_cntrl0.L1Icache.demand_accesses 1088 # Number of cache demand accesses
+system.cp_cntrl0.L1Icache.num_data_array_reads 86007 # number of data array reads
+system.cp_cntrl0.L1Icache.num_data_array_writes 54 # number of data array writes
+system.cp_cntrl0.L1Icache.num_tag_array_reads 87684 # number of tag array reads
+system.cp_cntrl0.L1Icache.num_tag_array_writes 54 # number of tag array writes
+system.cp_cntrl0.L2cache.demand_hits 0 # Number of cache demand hits
+system.cp_cntrl0.L2cache.demand_misses 1535 # Number of cache demand misses
+system.cp_cntrl0.L2cache.demand_accesses 1535 # Number of cache demand accesses
+system.cp_cntrl0.L2cache.num_data_array_reads 120 # number of data array reads
+system.cp_cntrl0.L2cache.num_data_array_writes 11982 # number of data array writes
+system.cp_cntrl0.L2cache.num_tag_array_reads 12059 # number of tag array reads
+system.cp_cntrl0.L2cache.num_tag_array_writes 1649 # number of tag array writes
+system.cpu0.clk_domain.clock 500 # Clock period in ticks
+system.cpu0.apic_clk_domain.clock 8000 # Clock period in ticks
+system.cpu0.workload.num_syscalls 21 # Number of system calls
+system.cpu0.numCycles 1326909 # number of cpu cycles simulated
+system.cpu0.numWorkItemsStarted 0 # number of work items this cpu started
+system.cpu0.numWorkItemsCompleted 0 # number of work items this cpu completed
+system.cpu0.committedInsts 66963 # Number of instructions committed
+system.cpu0.committedOps 137705 # Number of ops (including micro ops) committed
+system.cpu0.num_int_alu_accesses 136380 # Number of integer alu accesses
+system.cpu0.num_fp_alu_accesses 1279 # Number of float alu accesses
+system.cpu0.num_func_calls 3196 # number of times a function call or return occured
+system.cpu0.num_conditional_control_insts 12151 # number of instructions that are conditional controls
+system.cpu0.num_int_insts 136380 # number of integer instructions
+system.cpu0.num_fp_insts 1279 # number of float instructions
+system.cpu0.num_int_register_reads 257490 # number of times the integer registers were read
+system.cpu0.num_int_register_writes 110039 # number of times the integer registers were written
+system.cpu0.num_fp_register_reads 1981 # number of times the floating registers were read
+system.cpu0.num_fp_register_writes 981 # number of times the floating registers were written
+system.cpu0.num_cc_register_reads 78262 # number of times the CC registers were read
+system.cpu0.num_cc_register_writes 42183 # number of times the CC registers were written
+system.cpu0.num_mem_refs 27198 # number of memory refs
+system.cpu0.num_load_insts 16684 # Number of load instructions
+system.cpu0.num_store_insts 10514 # Number of store instructions
+system.cpu0.num_idle_cycles 5227.003992 # Number of idle cycles
+system.cpu0.num_busy_cycles 1321681.996008 # Number of busy cycles
+system.cpu0.not_idle_fraction 0.996061 # Percentage of non-idle cycles
+system.cpu0.idle_fraction 0.003939 # Percentage of idle cycles
+system.cpu0.Branches 16199 # Number of branches fetched
+system.cpu0.op_class::No_OpClass 615 0.45% 0.45% # Class of executed instruction
+system.cpu0.op_class::IntAlu 108791 79.00% 79.45% # Class of executed instruction
+system.cpu0.op_class::IntMult 13 0.01% 79.46% # Class of executed instruction
+system.cpu0.op_class::IntDiv 138 0.10% 79.56% # Class of executed instruction
+system.cpu0.op_class::FloatAdd 950 0.69% 80.25% # Class of executed instruction
+system.cpu0.op_class::FloatCmp 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::FloatCvt 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::FloatMult 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::FloatDiv 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::FloatSqrt 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdAdd 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdAddAcc 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdAlu 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdCmp 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdCvt 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdMisc 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdMult 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdMultAcc 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdShift 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdShiftAcc 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdSqrt 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdFloatAdd 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdFloatAlu 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdFloatCmp 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdFloatCvt 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdFloatDiv 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdFloatMisc 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdFloatMult 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdFloatMultAcc 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdFloatSqrt 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::MemRead 16684 12.12% 92.36% # Class of executed instruction
+system.cpu0.op_class::MemWrite 10514 7.64% 100.00% # Class of executed instruction
+system.cpu0.op_class::IprAccess 0 0.00% 100.00% # Class of executed instruction
+system.cpu0.op_class::InstPrefetch 0 0.00% 100.00% # Class of executed instruction
+system.cpu0.op_class::total 137705 # Class of executed instruction
+system.cpu1.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.cpu1.clk_domain.clock 1000 # Clock period in ticks
+system.cpu1.CUs0.wavefronts00.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts00.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts00.timesBlockedDueRAWDependencies 297 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::samples 39 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::mean 0.794872 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::stdev 0.863880 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::0-1 28 71.79% 71.79% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::2-3 11 28.21% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::total 39 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::samples 39 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::mean 0.589744 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::stdev 0.498310 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::0-1 39 100.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::total 39 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts01.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts01.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts01.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts02.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts02.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts02.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts03.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts03.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts03.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts04.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts04.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts04.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts05.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts05.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts05.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts06.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts06.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts06.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts07.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts07.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts07.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts08.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts08.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts08.timesBlockedDueRAWDependencies 273 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::samples 34 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::mean 0.852941 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::stdev 0.857493 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::0-1 24 70.59% 70.59% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::2-3 10 29.41% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::total 34 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::samples 34 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::mean 0.617647 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::stdev 0.493270 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::0-1 34 100.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::total 34 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts09.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts09.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts09.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts10.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts10.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts10.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts11.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts11.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts11.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts12.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts12.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts12.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts13.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts13.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts13.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts14.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts14.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts14.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts15.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts15.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts15.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts16.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts16.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts16.timesBlockedDueRAWDependencies 272 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::samples 34 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::mean 0.852941 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::stdev 0.857493 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::0-1 24 70.59% 70.59% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::2-3 10 29.41% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::total 34 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::samples 34 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::mean 0.617647 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::stdev 0.493270 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::0-1 34 100.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::total 34 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts17.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts17.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts17.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts18.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts18.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts18.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts19.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts19.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts19.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts20.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts20.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts20.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts21.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts21.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts21.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts22.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts22.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts22.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts23.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts23.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts23.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts24.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts24.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts24.timesBlockedDueRAWDependencies 256 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::samples 34 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::mean 0.852941 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::stdev 0.857493 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::0-1 24 70.59% 70.59% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::2-3 10 29.41% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::total 34 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::samples 34 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::mean 0.617647 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::stdev 0.493270 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::0-1 34 100.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::total 34 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts25.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts25.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts25.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts26.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts26.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts26.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts27.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts27.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts27.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts28.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts28.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts28.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts29.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts29.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts29.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts30.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts30.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts30.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts31.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts31.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts31.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::samples 43 # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::mean 5.813953 # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::stdev 2.683777 # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::underflows 0 0.00% 0.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::1 0 0.00% 0.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::2 8 18.60% 18.60% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::3 8 18.60% 37.21% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::4 1 2.33% 39.53% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::5 0 0.00% 39.53% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::6 1 2.33% 41.86% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::7 0 0.00% 41.86% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::8 25 58.14% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::9 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::10 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::11 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::12 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::13 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::14 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::15 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::16 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::17 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::18 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::19 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::20 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::21 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::22 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::23 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::24 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::25 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::26 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::27 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::28 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::29 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::30 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::31 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::32 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::overflows 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::min_value 2 # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::max_value 8 # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::total 43 # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.ExecStage.num_cycles_with_no_issue 3230 # number of cycles the CU issues nothing
+system.cpu1.CUs0.ExecStage.num_cycles_with_instr_issued 128 # number of cycles the CU issued at least one instruction
+system.cpu1.CUs0.ExecStage.num_cycles_with_instrtype_issue::ALU0 30 # Number of cycles at least one instruction of specific type issued
+system.cpu1.CUs0.ExecStage.num_cycles_with_instrtype_issue::ALU1 29 # Number of cycles at least one instruction of specific type issued
+system.cpu1.CUs0.ExecStage.num_cycles_with_instrtype_issue::ALU2 29 # Number of cycles at least one instruction of specific type issued
+system.cpu1.CUs0.ExecStage.num_cycles_with_instrtype_issue::ALU3 29 # Number of cycles at least one instruction of specific type issued
+system.cpu1.CUs0.ExecStage.num_cycles_with_instrtype_issue::GM 18 # Number of cycles at least one instruction of specific type issued
+system.cpu1.CUs0.ExecStage.num_cycles_with_instrtype_issue::LM 6 # Number of cycles at least one instruction of specific type issued
+system.cpu1.CUs0.ExecStage.num_cycles_with_instr_type_no_issue::ALU0 780 # Number of cycles no instruction of specific type issued
+system.cpu1.CUs0.ExecStage.num_cycles_with_instr_type_no_issue::ALU1 367 # Number of cycles no instruction of specific type issued
+system.cpu1.CUs0.ExecStage.num_cycles_with_instr_type_no_issue::ALU2 384 # Number of cycles no instruction of specific type issued
+system.cpu1.CUs0.ExecStage.num_cycles_with_instr_type_no_issue::ALU3 327 # Number of cycles no instruction of specific type issued
+system.cpu1.CUs0.ExecStage.num_cycles_with_instr_type_no_issue::GM 414 # Number of cycles no instruction of specific type issued
+system.cpu1.CUs0.ExecStage.num_cycles_with_instr_type_no_issue::LM 30 # Number of cycles no instruction of specific type issued
+system.cpu1.CUs0.ExecStage.spc::samples 3358 # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::mean 0.041989 # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::stdev 0.220406 # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::underflows 0 0.00% 0.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::0 3230 96.19% 96.19% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::1 116 3.45% 99.64% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::2 11 0.33% 99.97% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::3 1 0.03% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::4 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::5 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::6 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::overflows 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::min_value 0 # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::max_value 3 # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::total 3358 # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.num_transitions_active_to_idle 82 # number of CU transitions from active to idle
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::samples 82 # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::mean 39.280488 # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::stdev 158.161058 # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::underflows 0 0.00% 0.00% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::0-4 62 75.61% 75.61% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::5-9 9 10.98% 86.59% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::10-14 1 1.22% 87.80% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::15-19 0 0.00% 87.80% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::20-24 2 2.44% 90.24% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::25-29 1 1.22% 91.46% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::30-34 0 0.00% 91.46% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::35-39 0 0.00% 91.46% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::40-44 0 0.00% 91.46% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::45-49 0 0.00% 91.46% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::50-54 0 0.00% 91.46% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::55-59 0 0.00% 91.46% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::60-64 0 0.00% 91.46% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::65-69 0 0.00% 91.46% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::70-74 0 0.00% 91.46% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::75 0 0.00% 91.46% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::overflows 7 8.54% 100.00% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::min_value 1 # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::max_value 1285 # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::total 82 # duration of idle periods in cycles
+system.cpu1.CUs0.GlobalMemPipeline.load_vrf_bank_conflict_cycles 0 # total number of cycles GM data are delayed before updating the VRF
+system.cpu1.CUs0.LocalMemPipeline.load_vrf_bank_conflict_cycles 0 # total number of cycles LDS data are delayed before updating the VRF
+system.cpu1.CUs0.tlb_requests 769 # number of uncoalesced requests
+system.cpu1.CUs0.tlb_cycles -452460956000 # total number of cycles for all uncoalesced requests
+system.cpu1.CUs0.avg_translation_latency -588375755.526658 # Avg. translation latency for data translations
+system.cpu1.CUs0.TLB_hits_distribution::page_table 769 # TLB hits distribution (0 for page table, x for Lx-TLB
+system.cpu1.CUs0.TLB_hits_distribution::L1_TLB 0 # TLB hits distribution (0 for page table, x for Lx-TLB
+system.cpu1.CUs0.TLB_hits_distribution::L2_TLB 0 # TLB hits distribution (0 for page table, x for Lx-TLB
+system.cpu1.CUs0.TLB_hits_distribution::L3_TLB 0 # TLB hits distribution (0 for page table, x for Lx-TLB
+system.cpu1.CUs0.lds_bank_access_cnt 54 # Total number of LDS bank accesses
+system.cpu1.CUs0.lds_bank_conflicts::samples 6 # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::mean 8 # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::stdev 6.196773 # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::underflows 0 0.00% 0.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::0-1 2 33.33% 33.33% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::2-3 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::4-5 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::6-7 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::8-9 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::10-11 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::12-13 4 66.67% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::14-15 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::16-17 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::18-19 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::20-21 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::22-23 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::24-25 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::26-27 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::28-29 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::30-31 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::32-33 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::34-35 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::36-37 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::38-39 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::40-41 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::42-43 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::44-45 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::46-47 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::48-49 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::50-51 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::52-53 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::54-55 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::56-57 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::58-59 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::60-61 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::62-63 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::64 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::overflows 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::min_value 0 # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::max_value 12 # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::total 6 # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.page_divergence_dist::samples 17 # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::mean 1 # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::stdev 0 # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::underflows 0 0.00% 0.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::1-4 17 100.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::5-8 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::9-12 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::13-16 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::17-20 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::21-24 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::25-28 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::29-32 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::33-36 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::37-40 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::41-44 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::45-48 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::49-52 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::53-56 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::57-60 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::61-64 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::overflows 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::min_value 1 # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::max_value 1 # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::total 17 # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.global_mem_instr_cnt 17 # dynamic global memory instructions count
+system.cpu1.CUs0.local_mem_instr_cnt 6 # dynamic local memory intruction count
+system.cpu1.CUs0.wg_blocked_due_lds_alloc 0 # Workgroup blocked due to LDS capacity
+system.cpu1.CUs0.num_instr_executed 141 # number of instructions executed
+system.cpu1.CUs0.inst_exec_rate::samples 141 # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::mean 86.382979 # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::stdev 229.391669 # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::underflows 0 0.00% 0.00% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::0-1 1 0.71% 0.71% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::2-3 12 8.51% 9.22% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::4-5 51 36.17% 45.39% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::6-7 32 22.70% 68.09% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::8-9 2 1.42% 69.50% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::10 2 1.42% 70.92% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::overflows 41 29.08% 100.00% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::min_value 1 # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::max_value 1291 # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::total 141 # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.num_vec_ops_executed 6769 # number of vec ops executed (e.g. VSZ/inst)
+system.cpu1.CUs0.num_total_cycles 3358 # number of cycles the CU ran for
+system.cpu1.CUs0.vpc 2.015783 # Vector Operations per cycle (this CU only)
+system.cpu1.CUs0.ipc 0.041989 # Instructions per cycle (this CU only)
+system.cpu1.CUs0.warp_execution_dist::samples 141 # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::mean 48.007092 # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::stdev 23.719942 # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::underflows 0 0.00% 0.00% # number of lanes active per instruction (over all instructions)
+system.cpu1.CUs0.warp_execution_dist::1-4 5 3.55% 3.55% # number of lanes active per instruction (over all instructions)
+system.cpu1.CUs0.warp_execution_dist::5-8 0 0.00% 3.55% # number of lanes active per instruction (over all instructions)
+system.cpu1.CUs0.warp_execution_dist::9-12 0 0.00% 3.55% # number of lanes active per instruction (over all instructions)
+system.cpu1.CUs0.warp_execution_dist::13-16 36 25.53% 29.08% # number of lanes active per instruction (over all instructions)
+system.cpu1.CUs0.warp_execution_dist::17-20 0 0.00% 29.08% # number of lanes active per instruction (over all instructions)
+system.cpu1.CUs0.warp_execution_dist::21-24 0 0.00% 29.08% # number of lanes active per instruction (over all instructions)
+system.cpu1.CUs0.warp_execution_dist::25-28 0 0.00% 29.08% # number of lanes active per instruction (over all instructions)
+system.cpu1.CUs0.warp_execution_dist::29-32 0 0.00% 29.08% # number of lanes active per instruction (over all instructions)
+system.cpu1.CUs0.warp_execution_dist::33-36 0 0.00% 29.08% # number of lanes active per instruction (over all instructions)
+system.cpu1.CUs0.warp_execution_dist::37-40 0 0.00% 29.08% # number of lanes active per instruction (over all instructions)
+system.cpu1.CUs0.warp_execution_dist::41-44 0 0.00% 29.08% # number of lanes active per instruction (over all instructions)
+system.cpu1.CUs0.warp_execution_dist::45-48 0 0.00% 29.08% # number of lanes active per instruction (over all instructions)
+system.cpu1.CUs0.warp_execution_dist::49-52 8 5.67% 34.75% # number of lanes active per instruction (over all instructions)
+system.cpu1.CUs0.warp_execution_dist::53-56 0 0.00% 34.75% # number of lanes active per instruction (over all instructions)
+system.cpu1.CUs0.warp_execution_dist::57-60 0 0.00% 34.75% # number of lanes active per instruction (over all instructions)
+system.cpu1.CUs0.warp_execution_dist::61-64 92 65.25% 100.00% # number of lanes active per instruction (over all instructions)
+system.cpu1.CUs0.warp_execution_dist::overflows 0 0.00% 100.00% # number of lanes active per instruction (over all instructions)
+system.cpu1.CUs0.warp_execution_dist::min_value 1 # number of lanes active per instruction (over all instructions)
+system.cpu1.CUs0.warp_execution_dist::max_value 64 # number of lanes active per instruction (over all instructions)
+system.cpu1.CUs0.warp_execution_dist::total 141 # number of lanes active per instruction (over all instructions)
+system.cpu1.CUs0.gmem_lanes_execution_dist::samples 18 # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::mean 37.833333 # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::stdev 27.064737 # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::underflows 0 0.00% 0.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::1-4 1 5.56% 5.56% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::5-8 0 0.00% 5.56% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::9-12 0 0.00% 5.56% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::13-16 8 44.44% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::17-20 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::21-24 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::25-28 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::29-32 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::33-36 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::37-40 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::41-44 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::45-48 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::49-52 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::53-56 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::57-60 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::61-64 9 50.00% 100.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::overflows 0 0.00% 100.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::min_value 1 # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::max_value 64 # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::total 18 # number of active lanes per global memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::samples 6 # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::mean 19.500000 # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::stdev 22.322634 # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::underflows 0 0.00% 0.00% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::1-4 1 16.67% 16.67% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::5-8 0 0.00% 16.67% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::9-12 0 0.00% 16.67% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::13-16 4 66.67% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::17-20 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::21-24 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::25-28 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::29-32 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::33-36 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::37-40 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::41-44 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::45-48 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::49-52 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::53-56 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::57-60 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::61-64 1 16.67% 100.00% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::overflows 0 0.00% 100.00% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::min_value 1 # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::max_value 64 # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::total 6 # number of active lanes per local memory instruction
+system.cpu1.CUs0.num_alu_insts_executed 118 # Number of dynamic non-GM memory insts executed
+system.cpu1.CUs0.times_wg_blocked_due_vgpr_alloc 0 # Number of times WGs are blocked due to VGPR allocation per SIMD
+system.cpu1.CUs0.num_CAS_ops 0 # number of compare and swap operations
+system.cpu1.CUs0.num_failed_CAS_ops 0 # number of compare and swap operations that failed
+system.cpu1.CUs0.num_completed_wfs 4 # number of completed wavefronts
+system.cpu1.CUs1.wavefronts00.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts00.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts00.timesBlockedDueRAWDependencies 381 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::samples 39 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::mean 0.794872 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::stdev 0.863880 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::0-1 28 71.79% 71.79% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::2-3 11 28.21% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::total 39 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::samples 39 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::mean 0.589744 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::stdev 0.498310 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::0-1 39 100.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::total 39 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts01.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts01.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts01.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts02.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts02.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts02.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts03.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts03.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts03.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts04.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts04.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts04.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts05.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts05.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts05.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts06.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts06.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts06.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts07.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts07.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts07.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts08.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts08.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts08.timesBlockedDueRAWDependencies 356 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::samples 34 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::mean 0.852941 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::stdev 0.857493 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::0-1 24 70.59% 70.59% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::2-3 10 29.41% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::total 34 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::samples 34 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::mean 0.617647 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::stdev 0.493270 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::0-1 34 100.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::total 34 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts09.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts09.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts09.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts10.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts10.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts10.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts11.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts11.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts11.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts12.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts12.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts12.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts13.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts13.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts13.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts14.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts14.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts14.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts15.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts15.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts15.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts16.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts16.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts16.timesBlockedDueRAWDependencies 356 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::samples 34 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::mean 0.852941 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::stdev 0.857493 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::0-1 24 70.59% 70.59% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::2-3 10 29.41% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::total 34 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::samples 34 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::mean 0.617647 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::stdev 0.493270 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::0-1 34 100.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::total 34 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts17.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts17.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts17.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts18.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts18.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts18.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts19.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts19.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts19.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts20.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts20.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts20.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts21.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts21.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts21.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts22.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts22.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts22.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts23.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts23.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts23.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts24.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts24.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts24.timesBlockedDueRAWDependencies 339 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::samples 34 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::mean 0.852941 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::stdev 0.857493 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::0-1 24 70.59% 70.59% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::2-3 10 29.41% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::total 34 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::samples 34 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::mean 0.617647 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::stdev 0.493270 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::0-1 34 100.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::total 34 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts25.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts25.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts25.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts26.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts26.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts26.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts27.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts27.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts27.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts28.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts28.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts28.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts29.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts29.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts29.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts30.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts30.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts30.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts31.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts31.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts31.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::samples 43 # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::mean 5.813953 # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::stdev 2.683777 # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::underflows 0 0.00% 0.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::1 0 0.00% 0.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::2 8 18.60% 18.60% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::3 8 18.60% 37.21% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::4 1 2.33% 39.53% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::5 0 0.00% 39.53% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::6 1 2.33% 41.86% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::7 0 0.00% 41.86% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::8 25 58.14% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::9 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::10 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::11 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::12 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::13 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::14 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::15 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::16 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::17 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::18 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::19 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::20 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::21 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::22 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::23 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::24 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::25 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::26 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::27 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::28 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::29 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::30 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::31 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::32 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::overflows 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::min_value 2 # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::max_value 8 # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::total 43 # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.ExecStage.num_cycles_with_no_issue 3228 # number of cycles the CU issues nothing
+system.cpu1.CUs1.ExecStage.num_cycles_with_instr_issued 130 # number of cycles the CU issued at least one instruction
+system.cpu1.CUs1.ExecStage.num_cycles_with_instrtype_issue::ALU0 30 # Number of cycles at least one instruction of specific type issued
+system.cpu1.CUs1.ExecStage.num_cycles_with_instrtype_issue::ALU1 29 # Number of cycles at least one instruction of specific type issued
+system.cpu1.CUs1.ExecStage.num_cycles_with_instrtype_issue::ALU2 29 # Number of cycles at least one instruction of specific type issued
+system.cpu1.CUs1.ExecStage.num_cycles_with_instrtype_issue::ALU3 29 # Number of cycles at least one instruction of specific type issued
+system.cpu1.CUs1.ExecStage.num_cycles_with_instrtype_issue::GM 18 # Number of cycles at least one instruction of specific type issued
+system.cpu1.CUs1.ExecStage.num_cycles_with_instrtype_issue::LM 6 # Number of cycles at least one instruction of specific type issued
+system.cpu1.CUs1.ExecStage.num_cycles_with_instr_type_no_issue::ALU0 778 # Number of cycles no instruction of specific type issued
+system.cpu1.CUs1.ExecStage.num_cycles_with_instr_type_no_issue::ALU1 472 # Number of cycles no instruction of specific type issued
+system.cpu1.CUs1.ExecStage.num_cycles_with_instr_type_no_issue::ALU2 447 # Number of cycles no instruction of specific type issued
+system.cpu1.CUs1.ExecStage.num_cycles_with_instr_type_no_issue::ALU3 411 # Number of cycles no instruction of specific type issued
+system.cpu1.CUs1.ExecStage.num_cycles_with_instr_type_no_issue::GM 417 # Number of cycles no instruction of specific type issued
+system.cpu1.CUs1.ExecStage.num_cycles_with_instr_type_no_issue::LM 26 # Number of cycles no instruction of specific type issued
+system.cpu1.CUs1.ExecStage.spc::samples 3358 # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::mean 0.041989 # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::stdev 0.217686 # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::underflows 0 0.00% 0.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::0 3228 96.13% 96.13% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::1 120 3.57% 99.70% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::2 9 0.27% 99.97% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::3 1 0.03% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::4 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::5 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::6 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::overflows 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::min_value 0 # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::max_value 3 # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::total 3358 # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.num_transitions_active_to_idle 81 # number of CU transitions from active to idle
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::samples 81 # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::mean 38.617284 # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::stdev 158.076213 # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::underflows 0 0.00% 0.00% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::0-4 60 74.07% 74.07% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::5-9 10 12.35% 86.42% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::10-14 0 0.00% 86.42% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::15-19 2 2.47% 88.89% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::20-24 2 2.47% 91.36% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::25-29 0 0.00% 91.36% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::30-34 0 0.00% 91.36% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::35-39 0 0.00% 91.36% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::40-44 0 0.00% 91.36% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::45-49 0 0.00% 91.36% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::50-54 0 0.00% 91.36% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::55-59 0 0.00% 91.36% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::60-64 0 0.00% 91.36% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::65-69 0 0.00% 91.36% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::70-74 0 0.00% 91.36% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::75 0 0.00% 91.36% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::overflows 7 8.64% 100.00% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::min_value 1 # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::max_value 1293 # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::total 81 # duration of idle periods in cycles
+system.cpu1.CUs1.GlobalMemPipeline.load_vrf_bank_conflict_cycles 0 # total number of cycles GM data are delayed before updating the VRF
+system.cpu1.CUs1.LocalMemPipeline.load_vrf_bank_conflict_cycles 0 # total number of cycles LDS data are delayed before updating the VRF
+system.cpu1.CUs1.tlb_requests 769 # number of uncoalesced requests
+system.cpu1.CUs1.tlb_cycles -452466433000 # total number of cycles for all uncoalesced requests
+system.cpu1.CUs1.avg_translation_latency -588382877.763329 # Avg. translation latency for data translations
+system.cpu1.CUs1.TLB_hits_distribution::page_table 769 # TLB hits distribution (0 for page table, x for Lx-TLB
+system.cpu1.CUs1.TLB_hits_distribution::L1_TLB 0 # TLB hits distribution (0 for page table, x for Lx-TLB
+system.cpu1.CUs1.TLB_hits_distribution::L2_TLB 0 # TLB hits distribution (0 for page table, x for Lx-TLB
+system.cpu1.CUs1.TLB_hits_distribution::L3_TLB 0 # TLB hits distribution (0 for page table, x for Lx-TLB
+system.cpu1.CUs1.lds_bank_access_cnt 53 # Total number of LDS bank accesses
+system.cpu1.CUs1.lds_bank_conflicts::samples 6 # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::mean 7.833333 # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::stdev 6.080022 # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::underflows 0 0.00% 0.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::0-1 2 33.33% 33.33% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::2-3 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::4-5 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::6-7 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::8-9 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::10-11 1 16.67% 50.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::12-13 3 50.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::14-15 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::16-17 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::18-19 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::20-21 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::22-23 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::24-25 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::26-27 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::28-29 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::30-31 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::32-33 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::34-35 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::36-37 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::38-39 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::40-41 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::42-43 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::44-45 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::46-47 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::48-49 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::50-51 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::52-53 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::54-55 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::56-57 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::58-59 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::60-61 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::62-63 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::64 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::overflows 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::min_value 0 # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::max_value 12 # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::total 6 # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.page_divergence_dist::samples 17 # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::mean 1 # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::stdev 0 # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::underflows 0 0.00% 0.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::1-4 17 100.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::5-8 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::9-12 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::13-16 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::17-20 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::21-24 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::25-28 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::29-32 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::33-36 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::37-40 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::41-44 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::45-48 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::49-52 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::53-56 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::57-60 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::61-64 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::overflows 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::min_value 1 # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::max_value 1 # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::total 17 # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.global_mem_instr_cnt 17 # dynamic global memory instructions count
+system.cpu1.CUs1.local_mem_instr_cnt 6 # dynamic local memory intruction count
+system.cpu1.CUs1.wg_blocked_due_lds_alloc 0 # Workgroup blocked due to LDS capacity
+system.cpu1.CUs1.num_instr_executed 141 # number of instructions executed
+system.cpu1.CUs1.inst_exec_rate::samples 141 # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::mean 85.666667 # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::stdev 230.212531 # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::underflows 0 0.00% 0.00% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::0-1 1 0.71% 0.71% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::2-3 12 8.51% 9.22% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::4-5 52 36.88% 46.10% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::6-7 33 23.40% 69.50% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::8-9 4 2.84% 72.34% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::10 1 0.71% 73.05% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::overflows 38 26.95% 100.00% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::min_value 1 # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::max_value 1299 # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::total 141 # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.num_vec_ops_executed 6762 # number of vec ops executed (e.g. VSZ/inst)
+system.cpu1.CUs1.num_total_cycles 3358 # number of cycles the CU ran for
+system.cpu1.CUs1.vpc 2.013699 # Vector Operations per cycle (this CU only)
+system.cpu1.CUs1.ipc 0.041989 # Instructions per cycle (this CU only)
+system.cpu1.CUs1.warp_execution_dist::samples 141 # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::mean 47.957447 # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::stdev 23.818022 # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::underflows 0 0.00% 0.00% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::1-4 5 3.55% 3.55% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::5-8 0 0.00% 3.55% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::9-12 9 6.38% 9.93% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::13-16 27 19.15% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::17-20 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::21-24 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::25-28 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::29-32 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::33-36 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::37-40 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::41-44 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::45-48 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::49-52 8 5.67% 34.75% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::53-56 0 0.00% 34.75% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::57-60 0 0.00% 34.75% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::61-64 92 65.25% 100.00% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::overflows 0 0.00% 100.00% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::min_value 1 # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::max_value 64 # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::total 141 # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.gmem_lanes_execution_dist::samples 18 # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::mean 37.722222 # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::stdev 27.174394 # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::underflows 0 0.00% 0.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::1-4 1 5.56% 5.56% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::5-8 0 0.00% 5.56% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::9-12 2 11.11% 16.67% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::13-16 6 33.33% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::17-20 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::21-24 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::25-28 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::29-32 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::33-36 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::37-40 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::41-44 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::45-48 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::49-52 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::53-56 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::57-60 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::61-64 9 50.00% 100.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::overflows 0 0.00% 100.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::min_value 1 # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::max_value 64 # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::total 18 # number of active lanes per global memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::samples 6 # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::mean 19.333333 # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::stdev 22.384518 # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::underflows 0 0.00% 0.00% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::1-4 1 16.67% 16.67% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::5-8 0 0.00% 16.67% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::9-12 1 16.67% 33.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::13-16 3 50.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::17-20 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::21-24 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::25-28 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::29-32 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::33-36 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::37-40 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::41-44 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::45-48 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::49-52 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::53-56 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::57-60 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::61-64 1 16.67% 100.00% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::overflows 0 0.00% 100.00% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::min_value 1 # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::max_value 64 # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::total 6 # number of active lanes per local memory instruction
+system.cpu1.CUs1.num_alu_insts_executed 118 # Number of dynamic non-GM memory insts executed
+system.cpu1.CUs1.times_wg_blocked_due_vgpr_alloc 0 # Number of times WGs are blocked due to VGPR allocation per SIMD
+system.cpu1.CUs1.num_CAS_ops 0 # number of compare and swap operations
+system.cpu1.CUs1.num_failed_CAS_ops 0 # number of compare and swap operations that failed
+system.cpu1.CUs1.num_completed_wfs 4 # number of completed wavefronts
+system.cpu2.num_kernel_launched 1 # number of kernel launched
+system.dir_cntrl0.L3CacheMemory.demand_hits 0 # Number of cache demand hits
+system.dir_cntrl0.L3CacheMemory.demand_misses 0 # Number of cache demand misses
+system.dir_cntrl0.L3CacheMemory.demand_accesses 0 # Number of cache demand accesses
+system.dir_cntrl0.L3CacheMemory.num_data_array_writes 1551 # number of data array writes
+system.dir_cntrl0.L3CacheMemory.num_tag_array_reads 1551 # number of tag array reads
+system.dir_cntrl0.L3CacheMemory.num_tag_array_writes 1551 # number of tag array writes
+system.dispatcher_coalescer.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.dispatcher_coalescer.clk_domain.clock 1000 # Clock period in ticks
+system.dispatcher_coalescer.uncoalesced_accesses 0 # Number of uncoalesced TLB accesses
+system.dispatcher_coalescer.coalesced_accesses 0 # Number of coalesced TLB accesses
+system.dispatcher_coalescer.queuing_cycles 0 # Number of cycles spent in queue
+system.dispatcher_coalescer.local_queuing_cycles 0 # Number of cycles spent in queue for all incoming reqs
+system.dispatcher_coalescer.local_latency nan # Avg. latency over all incoming pkts
+system.dispatcher_tlb.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.dispatcher_tlb.clk_domain.clock 1000 # Clock period in ticks
+system.dispatcher_tlb.local_TLB_accesses 0 # Number of TLB accesses
+system.dispatcher_tlb.local_TLB_hits 0 # Number of TLB hits
+system.dispatcher_tlb.local_TLB_misses 0 # Number of TLB misses
+system.dispatcher_tlb.local_TLB_miss_rate nan # TLB miss rate
+system.dispatcher_tlb.global_TLB_accesses 0 # Number of TLB accesses
+system.dispatcher_tlb.global_TLB_hits 0 # Number of TLB hits
+system.dispatcher_tlb.global_TLB_misses 0 # Number of TLB misses
+system.dispatcher_tlb.global_TLB_miss_rate nan # TLB miss rate
+system.dispatcher_tlb.access_cycles 0 # Cycles spent accessing this TLB level
+system.dispatcher_tlb.page_table_cycles 0 # Cycles spent accessing the page table
+system.dispatcher_tlb.unique_pages 0 # Number of unique pages touched
+system.dispatcher_tlb.local_cycles 0 # Number of cycles spent in queue for all incoming reqs
+system.dispatcher_tlb.local_latency nan # Avg. latency over incoming coalesced reqs
+system.dispatcher_tlb.avg_reuse_distance 0 # avg. reuse distance over all pages (in ticks)
+system.l1_coalescer0.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.l1_coalescer0.clk_domain.clock 1000 # Clock period in ticks
+system.l1_coalescer0.uncoalesced_accesses 778 # Number of uncoalesced TLB accesses
+system.l1_coalescer0.coalesced_accesses 0 # Number of coalesced TLB accesses
+system.l1_coalescer0.queuing_cycles 0 # Number of cycles spent in queue
+system.l1_coalescer0.local_queuing_cycles 0 # Number of cycles spent in queue for all incoming reqs
+system.l1_coalescer0.local_latency 0 # Avg. latency over all incoming pkts
+system.l1_coalescer1.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.l1_coalescer1.clk_domain.clock 1000 # Clock period in ticks
+system.l1_coalescer1.uncoalesced_accesses 769 # Number of uncoalesced TLB accesses
+system.l1_coalescer1.coalesced_accesses 0 # Number of coalesced TLB accesses
+system.l1_coalescer1.queuing_cycles 0 # Number of cycles spent in queue
+system.l1_coalescer1.local_queuing_cycles 0 # Number of cycles spent in queue for all incoming reqs
+system.l1_coalescer1.local_latency 0 # Avg. latency over all incoming pkts
+system.l1_tlb0.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.l1_tlb0.clk_domain.clock 1000 # Clock period in ticks
+system.l1_tlb0.local_TLB_accesses 778 # Number of TLB accesses
+system.l1_tlb0.local_TLB_hits 774 # Number of TLB hits
+system.l1_tlb0.local_TLB_misses 4 # Number of TLB misses
+system.l1_tlb0.local_TLB_miss_rate 0.514139 # TLB miss rate
+system.l1_tlb0.global_TLB_accesses 778 # Number of TLB accesses
+system.l1_tlb0.global_TLB_hits 774 # Number of TLB hits
+system.l1_tlb0.global_TLB_misses 4 # Number of TLB misses
+system.l1_tlb0.global_TLB_miss_rate 0.514139 # TLB miss rate
+system.l1_tlb0.access_cycles 0 # Cycles spent accessing this TLB level
+system.l1_tlb0.page_table_cycles 0 # Cycles spent accessing the page table
+system.l1_tlb0.unique_pages 4 # Number of unique pages touched
+system.l1_tlb0.local_cycles 0 # Number of cycles spent in queue for all incoming reqs
+system.l1_tlb0.local_latency 0 # Avg. latency over incoming coalesced reqs
+system.l1_tlb0.avg_reuse_distance 0 # avg. reuse distance over all pages (in ticks)
+system.l1_tlb1.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.l1_tlb1.clk_domain.clock 1000 # Clock period in ticks
+system.l1_tlb1.local_TLB_accesses 769 # Number of TLB accesses
+system.l1_tlb1.local_TLB_hits 766 # Number of TLB hits
+system.l1_tlb1.local_TLB_misses 3 # Number of TLB misses
+system.l1_tlb1.local_TLB_miss_rate 0.390117 # TLB miss rate
+system.l1_tlb1.global_TLB_accesses 769 # Number of TLB accesses
+system.l1_tlb1.global_TLB_hits 766 # Number of TLB hits
+system.l1_tlb1.global_TLB_misses 3 # Number of TLB misses
+system.l1_tlb1.global_TLB_miss_rate 0.390117 # TLB miss rate
+system.l1_tlb1.access_cycles 0 # Cycles spent accessing this TLB level
+system.l1_tlb1.page_table_cycles 0 # Cycles spent accessing the page table
+system.l1_tlb1.unique_pages 3 # Number of unique pages touched
+system.l1_tlb1.local_cycles 0 # Number of cycles spent in queue for all incoming reqs
+system.l1_tlb1.local_latency 0 # Avg. latency over incoming coalesced reqs
+system.l1_tlb1.avg_reuse_distance 0 # avg. reuse distance over all pages (in ticks)
+system.l2_coalescer.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.l2_coalescer.clk_domain.clock 1000 # Clock period in ticks
+system.l2_coalescer.uncoalesced_accesses 8 # Number of uncoalesced TLB accesses
+system.l2_coalescer.coalesced_accesses 1 # Number of coalesced TLB accesses
+system.l2_coalescer.queuing_cycles 8000 # Number of cycles spent in queue
+system.l2_coalescer.local_queuing_cycles 1000 # Number of cycles spent in queue for all incoming reqs
+system.l2_coalescer.local_latency 125 # Avg. latency over all incoming pkts
+system.l2_tlb.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.l2_tlb.clk_domain.clock 1000 # Clock period in ticks
+system.l2_tlb.local_TLB_accesses 8 # Number of TLB accesses
+system.l2_tlb.local_TLB_hits 3 # Number of TLB hits
+system.l2_tlb.local_TLB_misses 5 # Number of TLB misses
+system.l2_tlb.local_TLB_miss_rate 62.500000 # TLB miss rate
+system.l2_tlb.global_TLB_accesses 15 # Number of TLB accesses
+system.l2_tlb.global_TLB_hits 3 # Number of TLB hits
+system.l2_tlb.global_TLB_misses 12 # Number of TLB misses
+system.l2_tlb.global_TLB_miss_rate 80 # TLB miss rate
+system.l2_tlb.access_cycles 552008 # Cycles spent accessing this TLB level
+system.l2_tlb.page_table_cycles 0 # Cycles spent accessing the page table
+system.l2_tlb.unique_pages 5 # Number of unique pages touched
+system.l2_tlb.local_cycles 69001 # Number of cycles spent in queue for all incoming reqs
+system.l2_tlb.local_latency 8625.125000 # Avg. latency over incoming coalesced reqs
+system.l2_tlb.avg_reuse_distance 0 # avg. reuse distance over all pages (in ticks)
+system.l3_coalescer.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.l3_coalescer.clk_domain.clock 1000 # Clock period in ticks
+system.l3_coalescer.uncoalesced_accesses 5 # Number of uncoalesced TLB accesses
+system.l3_coalescer.coalesced_accesses 1 # Number of coalesced TLB accesses
+system.l3_coalescer.queuing_cycles 8000 # Number of cycles spent in queue
+system.l3_coalescer.local_queuing_cycles 1000 # Number of cycles spent in queue for all incoming reqs
+system.l3_coalescer.local_latency 200 # Avg. latency over all incoming pkts
+system.l3_tlb.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.l3_tlb.clk_domain.clock 1000 # Clock period in ticks
+system.l3_tlb.local_TLB_accesses 5 # Number of TLB accesses
+system.l3_tlb.local_TLB_hits 0 # Number of TLB hits
+system.l3_tlb.local_TLB_misses 5 # Number of TLB misses
+system.l3_tlb.local_TLB_miss_rate 100 # TLB miss rate
+system.l3_tlb.global_TLB_accesses 12 # Number of TLB accesses
+system.l3_tlb.global_TLB_hits 0 # Number of TLB hits
+system.l3_tlb.global_TLB_misses 12 # Number of TLB misses
+system.l3_tlb.global_TLB_miss_rate 100 # TLB miss rate
+system.l3_tlb.access_cycles 1200000 # Cycles spent accessing this TLB level
+system.l3_tlb.page_table_cycles 6000000 # Cycles spent accessing the page table
+system.l3_tlb.unique_pages 5 # Number of unique pages touched
+system.l3_tlb.local_cycles 150000 # Number of cycles spent in queue for all incoming reqs
+system.l3_tlb.local_latency 30000 # Avg. latency over incoming coalesced reqs
+system.l3_tlb.avg_reuse_distance 0 # avg. reuse distance over all pages (in ticks)
+system.piobus.trans_dist::WriteReq 94 # Transaction distribution
+system.piobus.trans_dist::WriteResp 94 # Transaction distribution
+system.piobus.pkt_count_system.cp_cntrl0.sequencer.mem-master-port::system.cpu2.pio 188 # Packet count per connected master and slave (bytes)
+system.piobus.pkt_count::total 188 # Packet count per connected master and slave (bytes)
+system.piobus.pkt_size_system.cp_cntrl0.sequencer.mem-master-port::system.cpu2.pio 748 # Cumulative packet size per connected master and slave (bytes)
+system.piobus.pkt_size::total 748 # Cumulative packet size per connected master and slave (bytes)
+system.piobus.reqLayer0.occupancy 188000 # Layer occupancy (ticks)
+system.piobus.reqLayer0.utilization 0.0 # Layer utilization (%)
+system.piobus.respLayer0.occupancy 94000 # Layer occupancy (ticks)
+system.piobus.respLayer0.utilization 0.0 # Layer utilization (%)
+system.ruby.network.ext_links0.int_node.percent_links_utilized 0.007952
+system.ruby.network.ext_links0.int_node.msg_count.Control::0 1551
+system.ruby.network.ext_links0.int_node.msg_count.Request_Control::0 1551
+system.ruby.network.ext_links0.int_node.msg_count.Response_Data::2 1563
+system.ruby.network.ext_links0.int_node.msg_count.Response_Control::2 1539
+system.ruby.network.ext_links0.int_node.msg_count.Unblock_Control::4 1551
+system.ruby.network.ext_links0.int_node.msg_bytes.Control::0 12408
+system.ruby.network.ext_links0.int_node.msg_bytes.Request_Control::0 12408
+system.ruby.network.ext_links0.int_node.msg_bytes.Response_Data::2 112536
+system.ruby.network.ext_links0.int_node.msg_bytes.Response_Control::2 12312
+system.ruby.network.ext_links0.int_node.msg_bytes.Unblock_Control::4 12408
+system.ruby.network.ext_links1.int_node.percent_links_utilized 0.009970
+system.ruby.network.ext_links1.int_node.msg_count.Control::0 16
+system.ruby.network.ext_links1.int_node.msg_count.Request_Control::0 1535
+system.ruby.network.ext_links1.int_node.msg_count.Response_Data::2 1537
+system.ruby.network.ext_links1.int_node.msg_count.Response_Control::2 14
+system.ruby.network.ext_links1.int_node.msg_count.Unblock_Control::4 1535
+system.ruby.network.ext_links1.int_node.msg_bytes.Control::0 128
+system.ruby.network.ext_links1.int_node.msg_bytes.Request_Control::0 12280
+system.ruby.network.ext_links1.int_node.msg_bytes.Response_Data::2 110664
+system.ruby.network.ext_links1.int_node.msg_bytes.Response_Control::2 112
+system.ruby.network.ext_links1.int_node.msg_bytes.Unblock_Control::4 12280
+system.tcp_cntrl0.L1cache.demand_hits 0 # Number of cache demand hits
+system.tcp_cntrl0.L1cache.demand_misses 0 # Number of cache demand misses
+system.tcp_cntrl0.L1cache.demand_accesses 0 # Number of cache demand accesses
+system.tcp_cntrl0.L1cache.num_data_array_reads 10 # number of data array reads
+system.tcp_cntrl0.L1cache.num_data_array_writes 11 # number of data array writes
+system.tcp_cntrl0.L1cache.num_tag_array_reads 27 # number of tag array reads
+system.tcp_cntrl0.L1cache.num_tag_array_writes 18 # number of tag array writes
+system.tcp_cntrl0.L1cache.num_tag_array_stalls 2 # number of stalls caused by tag array
+system.tcp_cntrl0.L1cache.num_data_array_stalls 2 # number of stalls caused by data array
+system.tcp_cntrl0.coalescer.gpu_tcp_ld_hits 3 # loads that hit in the TCP
+system.tcp_cntrl0.coalescer.gpu_tcp_ld_transfers 0 # TCP to TCP load transfers
+system.tcp_cntrl0.coalescer.gpu_tcc_ld_hits 0 # loads that hit in the TCC
+system.tcp_cntrl0.coalescer.gpu_ld_misses 2 # loads that miss in the GPU
+system.tcp_cntrl0.coalescer.gpu_tcp_st_hits 4 # stores that hit in the TCP
+system.tcp_cntrl0.coalescer.gpu_tcp_st_transfers 1 # TCP to TCP store transfers
+system.tcp_cntrl0.coalescer.gpu_tcc_st_hits 0 # stores that hit in the TCC
+system.tcp_cntrl0.coalescer.gpu_st_misses 4 # stores that miss in the GPU
+system.tcp_cntrl0.coalescer.cp_tcp_ld_hits 0 # loads that hit in the TCP
+system.tcp_cntrl0.coalescer.cp_tcp_ld_transfers 0 # TCP to TCP load transfers
+system.tcp_cntrl0.coalescer.cp_tcc_ld_hits 0 # loads that hit in the TCC
+system.tcp_cntrl0.coalescer.cp_ld_misses 0 # loads that miss in the GPU
+system.tcp_cntrl0.coalescer.cp_tcp_st_hits 0 # stores that hit in the TCP
+system.tcp_cntrl0.coalescer.cp_tcp_st_transfers 0 # TCP to TCP store transfers
+system.tcp_cntrl0.coalescer.cp_tcc_st_hits 0 # stores that hit in the TCC
+system.tcp_cntrl0.coalescer.cp_st_misses 0 # stores that miss in the GPU
+system.ruby.network.ext_links2.int_node.percent_links_utilized 0.000721
+system.ruby.network.ext_links2.int_node.msg_count.Control::0 1535
+system.ruby.network.ext_links2.int_node.msg_count.Control::1 14
+system.ruby.network.ext_links2.int_node.msg_count.Request_Control::0 16
+system.ruby.network.ext_links2.int_node.msg_count.Request_Control::1 19
+system.ruby.network.ext_links2.int_node.msg_count.Response_Data::2 26
+system.ruby.network.ext_links2.int_node.msg_count.Response_Data::3 33
+system.ruby.network.ext_links2.int_node.msg_count.Response_Control::2 1525
+system.ruby.network.ext_links2.int_node.msg_count.Unblock_Control::4 16
+system.ruby.network.ext_links2.int_node.msg_count.Unblock_Control::5 19
+system.ruby.network.ext_links2.int_node.msg_bytes.Control::0 12280
+system.ruby.network.ext_links2.int_node.msg_bytes.Control::1 112
+system.ruby.network.ext_links2.int_node.msg_bytes.Request_Control::0 128
+system.ruby.network.ext_links2.int_node.msg_bytes.Request_Control::1 152
+system.ruby.network.ext_links2.int_node.msg_bytes.Response_Data::2 1872
+system.ruby.network.ext_links2.int_node.msg_bytes.Response_Data::3 2376
+system.ruby.network.ext_links2.int_node.msg_bytes.Response_Control::2 12200
+system.ruby.network.ext_links2.int_node.msg_bytes.Unblock_Control::4 128
+system.ruby.network.ext_links2.int_node.msg_bytes.Unblock_Control::5 152
+system.tcp_cntrl1.L1cache.demand_hits 0 # Number of cache demand hits
+system.tcp_cntrl1.L1cache.demand_misses 0 # Number of cache demand misses
+system.tcp_cntrl1.L1cache.demand_accesses 0 # Number of cache demand accesses
+system.tcp_cntrl1.L1cache.num_data_array_reads 7 # number of data array reads
+system.tcp_cntrl1.L1cache.num_data_array_writes 11 # number of data array writes
+system.tcp_cntrl1.L1cache.num_tag_array_reads 25 # number of tag array reads
+system.tcp_cntrl1.L1cache.num_tag_array_writes 18 # number of tag array writes
+system.tcp_cntrl1.L1cache.num_tag_array_stalls 2 # number of stalls caused by tag array
+system.tcp_cntrl1.L1cache.num_data_array_stalls 2 # number of stalls caused by data array
+system.tcp_cntrl1.coalescer.gpu_tcp_ld_hits 3 # loads that hit in the TCP
+system.tcp_cntrl1.coalescer.gpu_tcp_ld_transfers 2 # TCP to TCP load transfers
+system.tcp_cntrl1.coalescer.gpu_tcc_ld_hits 0 # loads that hit in the TCC
+system.tcp_cntrl1.coalescer.gpu_ld_misses 0 # loads that miss in the GPU
+system.tcp_cntrl1.coalescer.gpu_tcp_st_hits 4 # stores that hit in the TCP
+system.tcp_cntrl1.coalescer.gpu_tcp_st_transfers 0 # TCP to TCP store transfers
+system.tcp_cntrl1.coalescer.gpu_tcc_st_hits 0 # stores that hit in the TCC
+system.tcp_cntrl1.coalescer.gpu_st_misses 5 # stores that miss in the GPU
+system.tcp_cntrl1.coalescer.cp_tcp_ld_hits 0 # loads that hit in the TCP
+system.tcp_cntrl1.coalescer.cp_tcp_ld_transfers 0 # TCP to TCP load transfers
+system.tcp_cntrl1.coalescer.cp_tcc_ld_hits 0 # loads that hit in the TCC
+system.tcp_cntrl1.coalescer.cp_ld_misses 0 # loads that miss in the GPU
+system.tcp_cntrl1.coalescer.cp_tcp_st_hits 0 # stores that hit in the TCP
+system.tcp_cntrl1.coalescer.cp_tcp_st_transfers 0 # TCP to TCP store transfers
+system.tcp_cntrl1.coalescer.cp_tcc_st_hits 0 # stores that hit in the TCC
+system.tcp_cntrl1.coalescer.cp_st_misses 0 # stores that miss in the GPU
+system.sqc_cntrl0.L1cache.demand_hits 0 # Number of cache demand hits
+system.sqc_cntrl0.L1cache.demand_misses 0 # Number of cache demand misses
+system.sqc_cntrl0.L1cache.demand_accesses 0 # Number of cache demand accesses
+system.sqc_cntrl0.L1cache.num_data_array_reads 86 # number of data array reads
+system.sqc_cntrl0.L1cache.num_data_array_writes 5 # number of data array writes
+system.sqc_cntrl0.L1cache.num_tag_array_reads 86 # number of tag array reads
+system.sqc_cntrl0.L1cache.num_tag_array_writes 5 # number of tag array writes
+system.sqc_cntrl0.L1cache.num_data_array_stalls 44 # number of stalls caused by data array
+system.sqc_cntrl0.sequencer.load_waiting_on_load 120 # Number of times a load aliased with a pending load
+system.tcc_cntrl0.L2cache.demand_hits 0 # Number of cache demand hits
+system.tcc_cntrl0.L2cache.demand_misses 0 # Number of cache demand misses
+system.tcc_cntrl0.L2cache.demand_accesses 0 # Number of cache demand accesses
+system.tccdir_cntrl0.directory.demand_hits 0 # Number of cache demand hits
+system.tccdir_cntrl0.directory.demand_misses 0 # Number of cache demand misses
+system.tccdir_cntrl0.directory.demand_accesses 0 # Number of cache demand accesses
+system.tccdir_cntrl0.directory.num_tag_array_reads 1554 # number of tag array reads
+system.tccdir_cntrl0.directory.num_tag_array_writes 27 # number of tag array writes
+system.ruby.network.msg_count.Control 3116
+system.ruby.network.msg_count.Request_Control 3121
+system.ruby.network.msg_count.Response_Data 3159
+system.ruby.network.msg_count.Response_Control 3078
+system.ruby.network.msg_count.Unblock_Control 3121
+system.ruby.network.msg_byte.Control 24928
+system.ruby.network.msg_byte.Request_Control 24968
+system.ruby.network.msg_byte.Response_Data 227448
+system.ruby.network.msg_byte.Response_Control 24624
+system.ruby.network.msg_byte.Unblock_Control 24968
+system.sqc_coalescer.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.sqc_coalescer.clk_domain.clock 1000 # Clock period in ticks
+system.sqc_coalescer.uncoalesced_accesses 86 # Number of uncoalesced TLB accesses
+system.sqc_coalescer.coalesced_accesses 63 # Number of coalesced TLB accesses
+system.sqc_coalescer.queuing_cycles 100000 # Number of cycles spent in queue
+system.sqc_coalescer.local_queuing_cycles 100000 # Number of cycles spent in queue for all incoming reqs
+system.sqc_coalescer.local_latency 1162.790698 # Avg. latency over all incoming pkts
+system.sqc_tlb.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.sqc_tlb.clk_domain.clock 1000 # Clock period in ticks
+system.sqc_tlb.local_TLB_accesses 63 # Number of TLB accesses
+system.sqc_tlb.local_TLB_hits 62 # Number of TLB hits
+system.sqc_tlb.local_TLB_misses 1 # Number of TLB misses
+system.sqc_tlb.local_TLB_miss_rate 1.587302 # TLB miss rate
+system.sqc_tlb.global_TLB_accesses 86 # Number of TLB accesses
+system.sqc_tlb.global_TLB_hits 78 # Number of TLB hits
+system.sqc_tlb.global_TLB_misses 8 # Number of TLB misses
+system.sqc_tlb.global_TLB_miss_rate 9.302326 # TLB miss rate
+system.sqc_tlb.access_cycles 86008 # Cycles spent accessing this TLB level
+system.sqc_tlb.page_table_cycles 0 # Cycles spent accessing the page table
+system.sqc_tlb.unique_pages 1 # Number of unique pages touched
+system.sqc_tlb.local_cycles 63001 # Number of cycles spent in queue for all incoming reqs
+system.sqc_tlb.local_latency 1000.015873 # Avg. latency over incoming coalesced reqs
+system.sqc_tlb.avg_reuse_distance 0 # avg. reuse distance over all pages (in ticks)
+system.ruby.network.ext_links0.int_node.throttle0.link_utilization 0.005592
+system.ruby.network.ext_links0.int_node.throttle0.msg_count.Request_Control::0 1551
+system.ruby.network.ext_links0.int_node.throttle0.msg_count.Response_Data::2 12
+system.ruby.network.ext_links0.int_node.throttle0.msg_count.Response_Control::2 1539
+system.ruby.network.ext_links0.int_node.throttle0.msg_count.Unblock_Control::4 1551
+system.ruby.network.ext_links0.int_node.throttle0.msg_bytes.Request_Control::0 12408
+system.ruby.network.ext_links0.int_node.throttle0.msg_bytes.Response_Data::2 864
+system.ruby.network.ext_links0.int_node.throttle0.msg_bytes.Response_Control::2 12312
+system.ruby.network.ext_links0.int_node.throttle0.msg_bytes.Unblock_Control::4 12408
+system.ruby.network.ext_links0.int_node.throttle1.link_utilization 0.016287
+system.ruby.network.ext_links0.int_node.throttle1.msg_count.Control::0 16
+system.ruby.network.ext_links0.int_node.throttle1.msg_count.Response_Data::2 1535
+system.ruby.network.ext_links0.int_node.throttle1.msg_bytes.Control::0 128
+system.ruby.network.ext_links0.int_node.throttle1.msg_bytes.Response_Data::2 110520
+system.ruby.network.ext_links0.int_node.throttle2.link_utilization 0.001977
+system.ruby.network.ext_links0.int_node.throttle2.msg_count.Control::0 1535
+system.ruby.network.ext_links0.int_node.throttle2.msg_count.Response_Data::2 16
+system.ruby.network.ext_links0.int_node.throttle2.msg_bytes.Control::0 12280
+system.ruby.network.ext_links0.int_node.throttle2.msg_bytes.Response_Data::2 1152
+system.ruby.network.ext_links1.int_node.throttle0.link_utilization 0.016287
+system.ruby.network.ext_links1.int_node.throttle0.msg_count.Control::0 16
+system.ruby.network.ext_links1.int_node.throttle0.msg_count.Response_Data::2 1535
+system.ruby.network.ext_links1.int_node.throttle0.msg_bytes.Control::0 128
+system.ruby.network.ext_links1.int_node.throttle0.msg_bytes.Response_Data::2 110520
+system.ruby.network.ext_links1.int_node.throttle1.link_utilization 0.003653
+system.ruby.network.ext_links1.int_node.throttle1.msg_count.Request_Control::0 1535
+system.ruby.network.ext_links1.int_node.throttle1.msg_count.Response_Data::2 2
+system.ruby.network.ext_links1.int_node.throttle1.msg_count.Response_Control::2 14
+system.ruby.network.ext_links1.int_node.throttle1.msg_count.Unblock_Control::4 1535
+system.ruby.network.ext_links1.int_node.throttle1.msg_bytes.Request_Control::0 12280
+system.ruby.network.ext_links1.int_node.throttle1.msg_bytes.Response_Data::2 144
+system.ruby.network.ext_links1.int_node.throttle1.msg_bytes.Response_Control::2 112
+system.ruby.network.ext_links1.int_node.throttle1.msg_bytes.Unblock_Control::4 12280
+system.ruby.network.ext_links2.int_node.throttle0.link_utilization 0.000084
+system.ruby.network.ext_links2.int_node.throttle0.msg_count.Control::1 8
+system.ruby.network.ext_links2.int_node.throttle0.msg_count.Response_Data::3 7
+system.ruby.network.ext_links2.int_node.throttle0.msg_bytes.Control::1 64
+system.ruby.network.ext_links2.int_node.throttle0.msg_bytes.Response_Data::3 504
+system.ruby.network.ext_links2.int_node.throttle1.link_utilization 0.000081
+system.ruby.network.ext_links2.int_node.throttle1.msg_count.Control::1 6
+system.ruby.network.ext_links2.int_node.throttle1.msg_count.Response_Data::3 7
+system.ruby.network.ext_links2.int_node.throttle1.msg_bytes.Control::1 48
+system.ruby.network.ext_links2.int_node.throttle1.msg_bytes.Response_Data::3 504
+system.ruby.network.ext_links2.int_node.throttle2.link_utilization 0
+system.ruby.network.ext_links2.int_node.throttle3.link_utilization 0.002170
+system.ruby.network.ext_links2.int_node.throttle3.msg_count.Control::0 1535
+system.ruby.network.ext_links2.int_node.throttle3.msg_count.Request_Control::1 19
+system.ruby.network.ext_links2.int_node.throttle3.msg_count.Response_Data::2 16
+system.ruby.network.ext_links2.int_node.throttle3.msg_count.Response_Data::3 14
+system.ruby.network.ext_links2.int_node.throttle3.msg_count.Unblock_Control::5 19
+system.ruby.network.ext_links2.int_node.throttle3.msg_bytes.Control::0 12280
+system.ruby.network.ext_links2.int_node.throttle3.msg_bytes.Request_Control::1 152
+system.ruby.network.ext_links2.int_node.throttle3.msg_bytes.Response_Data::2 1152
+system.ruby.network.ext_links2.int_node.throttle3.msg_bytes.Response_Data::3 1008
+system.ruby.network.ext_links2.int_node.throttle3.msg_bytes.Unblock_Control::5 152
+system.ruby.network.ext_links2.int_node.throttle4.link_utilization 0.000053
+system.ruby.network.ext_links2.int_node.throttle4.msg_count.Response_Data::3 5
+system.ruby.network.ext_links2.int_node.throttle4.msg_bytes.Response_Data::3 360
+system.ruby.network.ext_links2.int_node.throttle5.link_utilization 0.001939
+system.ruby.network.ext_links2.int_node.throttle5.msg_count.Request_Control::0 16
+system.ruby.network.ext_links2.int_node.throttle5.msg_count.Response_Data::2 10
+system.ruby.network.ext_links2.int_node.throttle5.msg_count.Response_Control::2 1525
+system.ruby.network.ext_links2.int_node.throttle5.msg_count.Unblock_Control::4 16
+system.ruby.network.ext_links2.int_node.throttle5.msg_bytes.Request_Control::0 128
+system.ruby.network.ext_links2.int_node.throttle5.msg_bytes.Response_Data::2 720
+system.ruby.network.ext_links2.int_node.throttle5.msg_bytes.Response_Control::2 12200
+system.ruby.network.ext_links2.int_node.throttle5.msg_bytes.Unblock_Control::4 128
+system.ruby.CorePair_Controller.C0_Load_L1miss 180 0.00% 0.00%
+system.ruby.CorePair_Controller.C0_Load_L1hit 16155 0.00% 0.00%
+system.ruby.CorePair_Controller.Ifetch0_L1hit 86007 0.00% 0.00%
+system.ruby.CorePair_Controller.Ifetch0_L1miss 1088 0.00% 0.00%
+system.ruby.CorePair_Controller.C0_Store_L1miss 325 0.00% 0.00%
+system.ruby.CorePair_Controller.C0_Store_L1hit 10448 0.00% 0.00%
+system.ruby.CorePair_Controller.NB_AckS 1043 0.00% 0.00%
+system.ruby.CorePair_Controller.NB_AckM 326 0.00% 0.00%
+system.ruby.CorePair_Controller.NB_AckE 166 0.00% 0.00%
+system.ruby.CorePair_Controller.L1I_Repl 589 0.00% 0.00%
+system.ruby.CorePair_Controller.L1D0_Repl 24 0.00% 0.00%
+system.ruby.CorePair_Controller.L2_to_L1D0 5 0.00% 0.00%
+system.ruby.CorePair_Controller.L2_to_L1I 54 0.00% 0.00%
+system.ruby.CorePair_Controller.PrbInvData 9 0.00% 0.00%
+system.ruby.CorePair_Controller.PrbShrData 7 0.00% 0.00%
+system.ruby.CorePair_Controller.I.C0_Load_L1miss 175 0.00% 0.00%
+system.ruby.CorePair_Controller.I.Ifetch0_L1miss 1034 0.00% 0.00%
+system.ruby.CorePair_Controller.I.C0_Store_L1miss 325 0.00% 0.00%
+system.ruby.CorePair_Controller.I.PrbInvData 8 0.00% 0.00%
+system.ruby.CorePair_Controller.I.PrbShrData 5 0.00% 0.00%
+system.ruby.CorePair_Controller.S.C0_Load_L1hit 635 0.00% 0.00%
+system.ruby.CorePair_Controller.S.Ifetch0_L1hit 86007 0.00% 0.00%
+system.ruby.CorePair_Controller.S.Ifetch0_L1miss 54 0.00% 0.00%
+system.ruby.CorePair_Controller.S.L1I_Repl 589 0.00% 0.00%
+system.ruby.CorePair_Controller.E0.C0_Load_L1miss 2 0.00% 0.00%
+system.ruby.CorePair_Controller.E0.C0_Load_L1hit 2721 0.00% 0.00%
+system.ruby.CorePair_Controller.E0.C0_Store_L1hit 46 0.00% 0.00%
+system.ruby.CorePair_Controller.E0.L1D0_Repl 16 0.00% 0.00%
+system.ruby.CorePair_Controller.E0.PrbShrData 1 0.00% 0.00%
+system.ruby.CorePair_Controller.O.C0_Load_L1hit 3 0.00% 0.00%
+system.ruby.CorePair_Controller.O.C0_Store_L1hit 1 0.00% 0.00%
+system.ruby.CorePair_Controller.M0.C0_Load_L1miss 3 0.00% 0.00%
+system.ruby.CorePair_Controller.M0.C0_Load_L1hit 12796 0.00% 0.00%
+system.ruby.CorePair_Controller.M0.C0_Store_L1hit 10401 0.00% 0.00%
+system.ruby.CorePair_Controller.M0.L1D0_Repl 8 0.00% 0.00%
+system.ruby.CorePair_Controller.M0.PrbInvData 1 0.00% 0.00%
+system.ruby.CorePair_Controller.M0.PrbShrData 1 0.00% 0.00%
+system.ruby.CorePair_Controller.I_M0.NB_AckM 325 0.00% 0.00%
+system.ruby.CorePair_Controller.I_E0S.NB_AckS 9 0.00% 0.00%
+system.ruby.CorePair_Controller.I_E0S.NB_AckE 166 0.00% 0.00%
+system.ruby.CorePair_Controller.Si_F0.L2_to_L1I 54 0.00% 0.00%
+system.ruby.CorePair_Controller.O_M0.NB_AckM 1 0.00% 0.00%
+system.ruby.CorePair_Controller.S0.NB_AckS 1034 0.00% 0.00%
+system.ruby.CorePair_Controller.E0_F.L2_to_L1D0 2 0.00% 0.00%
+system.ruby.CorePair_Controller.M0_F.L2_to_L1D0 3 0.00% 0.00%
+system.ruby.Directory_Controller.RdBlkS 1039 0.00% 0.00%
+system.ruby.Directory_Controller.RdBlkM 335 0.00% 0.00%
+system.ruby.Directory_Controller.RdBlk 177 0.00% 0.00%
+system.ruby.Directory_Controller.CPUPrbResp 1551 0.00% 0.00%
+system.ruby.Directory_Controller.ProbeAcksComplete 1551 0.00% 0.00%
+system.ruby.Directory_Controller.MemData 1551 0.00% 0.00%
+system.ruby.Directory_Controller.CoreUnblock 1551 0.00% 0.00%
+system.ruby.Directory_Controller.U.RdBlkS 1039 0.00% 0.00%
+system.ruby.Directory_Controller.U.RdBlkM 335 0.00% 0.00%
+system.ruby.Directory_Controller.U.RdBlk 177 0.00% 0.00%
+system.ruby.Directory_Controller.BS_M.MemData 29 0.00% 0.00%
+system.ruby.Directory_Controller.BM_M.MemData 12 0.00% 0.00%
+system.ruby.Directory_Controller.B_M.MemData 1 0.00% 0.00%
+system.ruby.Directory_Controller.BS_PM.CPUPrbResp 29 0.00% 0.00%
+system.ruby.Directory_Controller.BS_PM.ProbeAcksComplete 29 0.00% 0.00%
+system.ruby.Directory_Controller.BS_PM.MemData 1010 0.00% 0.00%
+system.ruby.Directory_Controller.BM_PM.CPUPrbResp 12 0.00% 0.00%
+system.ruby.Directory_Controller.BM_PM.ProbeAcksComplete 12 0.00% 0.00%
+system.ruby.Directory_Controller.BM_PM.MemData 323 0.00% 0.00%
+system.ruby.Directory_Controller.B_PM.CPUPrbResp 1 0.00% 0.00%
+system.ruby.Directory_Controller.B_PM.ProbeAcksComplete 1 0.00% 0.00%
+system.ruby.Directory_Controller.B_PM.MemData 176 0.00% 0.00%
+system.ruby.Directory_Controller.BS_Pm.CPUPrbResp 1010 0.00% 0.00%
+system.ruby.Directory_Controller.BS_Pm.ProbeAcksComplete 1010 0.00% 0.00%
+system.ruby.Directory_Controller.BM_Pm.CPUPrbResp 323 0.00% 0.00%
+system.ruby.Directory_Controller.BM_Pm.ProbeAcksComplete 323 0.00% 0.00%
+system.ruby.Directory_Controller.B_Pm.CPUPrbResp 176 0.00% 0.00%
+system.ruby.Directory_Controller.B_Pm.ProbeAcksComplete 176 0.00% 0.00%
+system.ruby.Directory_Controller.B.CoreUnblock 1551 0.00% 0.00%
+system.ruby.LD.latency_hist::bucket_size 32
+system.ruby.LD.latency_hist::max_bucket 319
+system.ruby.LD.latency_hist::samples 16335
+system.ruby.LD.latency_hist::mean 4.217447
+system.ruby.LD.latency_hist::gmean 2.103537
+system.ruby.LD.latency_hist::stdev 21.286370
+system.ruby.LD.latency_hist | 16160 98.93% 98.93% | 0 0.00% 98.93% | 0 0.00% 98.93% | 0 0.00% 98.93% | 0 0.00% 98.93% | 0 0.00% 98.93% | 166 1.02% 99.94% | 9 0.06% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.LD.latency_hist::total 16335
+system.ruby.LD.hit_latency_hist::bucket_size 32
+system.ruby.LD.hit_latency_hist::max_bucket 319
+system.ruby.LD.hit_latency_hist::samples 175
+system.ruby.LD.hit_latency_hist::mean 208.468571
+system.ruby.LD.hit_latency_hist::gmean 208.231054
+system.ruby.LD.hit_latency_hist::stdev 10.632194
+system.ruby.LD.hit_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 166 94.86% 94.86% | 9 5.14% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.LD.hit_latency_hist::total 175
+system.ruby.LD.miss_latency_hist::bucket_size 4
+system.ruby.LD.miss_latency_hist::max_bucket 39
+system.ruby.LD.miss_latency_hist::samples 16160
+system.ruby.LD.miss_latency_hist::mean 2.005569
+system.ruby.LD.miss_latency_hist::gmean 2.001425
+system.ruby.LD.miss_latency_hist::stdev 0.316580
+system.ruby.LD.miss_latency_hist | 16155 99.97% 99.97% | 0 0.00% 99.97% | 0 0.00% 99.97% | 0 0.00% 99.97% | 0 0.00% 99.97% | 5 0.03% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.LD.miss_latency_hist::total 16160
+system.ruby.ST.latency_hist::bucket_size 64
+system.ruby.ST.latency_hist::max_bucket 639
+system.ruby.ST.latency_hist::samples 10412
+system.ruby.ST.latency_hist::mean 8.385709
+system.ruby.ST.latency_hist::gmean 2.308923
+system.ruby.ST.latency_hist::stdev 35.862445
+system.ruby.ST.latency_hist | 10090 96.91% 96.91% | 0 0.00% 96.91% | 0 0.00% 96.91% | 316 3.03% 99.94% | 3 0.03% 99.97% | 3 0.03% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.ST.latency_hist::total 10412
+system.ruby.ST.hit_latency_hist::bucket_size 64
+system.ruby.ST.hit_latency_hist::max_bucket 639
+system.ruby.ST.hit_latency_hist::samples 322
+system.ruby.ST.hit_latency_hist::mean 208.484472
+system.ruby.ST.hit_latency_hist::gmean 208.014366
+system.ruby.ST.hit_latency_hist::stdev 16.327683
+system.ruby.ST.hit_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 316 98.14% 98.14% | 3 0.93% 99.07% | 3 0.93% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.ST.hit_latency_hist::total 322
+system.ruby.ST.miss_latency_hist::bucket_size 1
+system.ruby.ST.miss_latency_hist::max_bucket 9
+system.ruby.ST.miss_latency_hist::samples 10090
+system.ruby.ST.miss_latency_hist::mean 2
+system.ruby.ST.miss_latency_hist::gmean 2.000000
+system.ruby.ST.miss_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 10090 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.ST.miss_latency_hist::total 10090
+system.ruby.IFETCH.latency_hist::bucket_size 64
+system.ruby.IFETCH.latency_hist::max_bucket 639
+system.ruby.IFETCH.latency_hist::samples 87095
+system.ruby.IFETCH.latency_hist::mean 4.462093
+system.ruby.IFETCH.latency_hist::gmean 2.116390
+system.ruby.IFETCH.latency_hist::stdev 22.435279
+system.ruby.IFETCH.latency_hist | 86061 98.81% 98.81% | 0 0.00% 98.81% | 0 0.00% 98.81% | 1011 1.16% 99.97% | 16 0.02% 99.99% | 7 0.01% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.IFETCH.latency_hist::total 87095
+system.ruby.IFETCH.hit_latency_hist::bucket_size 64
+system.ruby.IFETCH.hit_latency_hist::max_bucket 639
+system.ruby.IFETCH.hit_latency_hist::samples 1034
+system.ruby.IFETCH.hit_latency_hist::mean 208.444874
+system.ruby.IFETCH.hit_latency_hist::gmean 207.968565
+system.ruby.IFETCH.hit_latency_hist::stdev 16.462617
+system.ruby.IFETCH.hit_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 1011 97.78% 97.78% | 16 1.55% 99.32% | 7 0.68% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.IFETCH.hit_latency_hist::total 1034
+system.ruby.IFETCH.miss_latency_hist::bucket_size 4
+system.ruby.IFETCH.miss_latency_hist::max_bucket 39
+system.ruby.IFETCH.miss_latency_hist::samples 86061
+system.ruby.IFETCH.miss_latency_hist::mean 2.011294
+system.ruby.IFETCH.miss_latency_hist::gmean 2.002892
+system.ruby.IFETCH.miss_latency_hist::stdev 0.450747
+system.ruby.IFETCH.miss_latency_hist | 86007 99.94% 99.94% | 0 0.00% 99.94% | 0 0.00% 99.94% | 0 0.00% 99.94% | 0 0.00% 99.94% | 54 0.06% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.IFETCH.miss_latency_hist::total 86061
+system.ruby.RMW_Read.latency_hist::bucket_size 32
+system.ruby.RMW_Read.latency_hist::max_bucket 319
+system.ruby.RMW_Read.latency_hist::samples 341
+system.ruby.RMW_Read.latency_hist::mean 4.392962
+system.ruby.RMW_Read.latency_hist::gmean 2.111743
+system.ruby.RMW_Read.latency_hist::stdev 21.996747
+system.ruby.RMW_Read.latency_hist | 337 98.83% 98.83% | 0 0.00% 98.83% | 0 0.00% 98.83% | 0 0.00% 98.83% | 0 0.00% 98.83% | 0 0.00% 98.83% | 4 1.17% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.RMW_Read.latency_hist::total 341
+system.ruby.RMW_Read.hit_latency_hist::bucket_size 32
+system.ruby.RMW_Read.hit_latency_hist::max_bucket 319
+system.ruby.RMW_Read.hit_latency_hist::samples 4
+system.ruby.RMW_Read.hit_latency_hist::mean 206
+system.ruby.RMW_Read.hit_latency_hist::gmean 206.000000
+system.ruby.RMW_Read.hit_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 4 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.RMW_Read.hit_latency_hist::total 4
+system.ruby.RMW_Read.miss_latency_hist::bucket_size 1
+system.ruby.RMW_Read.miss_latency_hist::max_bucket 9
+system.ruby.RMW_Read.miss_latency_hist::samples 337
+system.ruby.RMW_Read.miss_latency_hist::mean 2
+system.ruby.RMW_Read.miss_latency_hist::gmean 2.000000
+system.ruby.RMW_Read.miss_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 337 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.RMW_Read.miss_latency_hist::total 337
+system.ruby.Locked_RMW_Read.latency_hist::bucket_size 1
+system.ruby.Locked_RMW_Read.latency_hist::max_bucket 9
+system.ruby.Locked_RMW_Read.latency_hist::samples 10
+system.ruby.Locked_RMW_Read.latency_hist::mean 2
+system.ruby.Locked_RMW_Read.latency_hist::gmean 2
+system.ruby.Locked_RMW_Read.latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 10 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.Locked_RMW_Read.latency_hist::total 10
+system.ruby.Locked_RMW_Read.miss_latency_hist::bucket_size 1
+system.ruby.Locked_RMW_Read.miss_latency_hist::max_bucket 9
+system.ruby.Locked_RMW_Read.miss_latency_hist::samples 10
+system.ruby.Locked_RMW_Read.miss_latency_hist::mean 2
+system.ruby.Locked_RMW_Read.miss_latency_hist::gmean 2
+system.ruby.Locked_RMW_Read.miss_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 10 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.Locked_RMW_Read.miss_latency_hist::total 10
+system.ruby.Locked_RMW_Write.latency_hist::bucket_size 1
+system.ruby.Locked_RMW_Write.latency_hist::max_bucket 9
+system.ruby.Locked_RMW_Write.latency_hist::samples 10
+system.ruby.Locked_RMW_Write.latency_hist::mean 2
+system.ruby.Locked_RMW_Write.latency_hist::gmean 2
+system.ruby.Locked_RMW_Write.latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 10 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.Locked_RMW_Write.latency_hist::total 10
+system.ruby.Locked_RMW_Write.miss_latency_hist::bucket_size 1
+system.ruby.Locked_RMW_Write.miss_latency_hist::max_bucket 9
+system.ruby.Locked_RMW_Write.miss_latency_hist::samples 10
+system.ruby.Locked_RMW_Write.miss_latency_hist::mean 2
+system.ruby.Locked_RMW_Write.miss_latency_hist::gmean 2
+system.ruby.Locked_RMW_Write.miss_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 10 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.Locked_RMW_Write.miss_latency_hist::total 10
+system.ruby.L1Cache.miss_mach_latency_hist::bucket_size 1
+system.ruby.L1Cache.miss_mach_latency_hist::max_bucket 9
+system.ruby.L1Cache.miss_mach_latency_hist::samples 112609
+system.ruby.L1Cache.miss_mach_latency_hist::mean 2
+system.ruby.L1Cache.miss_mach_latency_hist::gmean 2.000000
+system.ruby.L1Cache.miss_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 112609 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.L1Cache.miss_mach_latency_hist::total 112609
+system.ruby.L2Cache.miss_mach_latency_hist::bucket_size 4
+system.ruby.L2Cache.miss_mach_latency_hist::max_bucket 39
+system.ruby.L2Cache.miss_mach_latency_hist::samples 59
+system.ruby.L2Cache.miss_mach_latency_hist::mean 20
+system.ruby.L2Cache.miss_mach_latency_hist::gmean 20.000000
+system.ruby.L2Cache.miss_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 59 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.L2Cache.miss_mach_latency_hist::total 59
+system.ruby.Directory.hit_mach_latency_hist::bucket_size 64
+system.ruby.Directory.hit_mach_latency_hist::max_bucket 639
+system.ruby.Directory.hit_mach_latency_hist::samples 1535
+system.ruby.Directory.hit_mach_latency_hist::mean 208.449511
+system.ruby.Directory.hit_mach_latency_hist::gmean 208.002927
+system.ruby.Directory.hit_mach_latency_hist::stdev 15.847049
+system.ruby.Directory.hit_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 1506 98.11% 98.11% | 19 1.24% 99.35% | 10 0.65% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.Directory.hit_mach_latency_hist::total 1535
+system.ruby.LD.L1Cache.miss_type_mach_latency_hist::bucket_size 1
+system.ruby.LD.L1Cache.miss_type_mach_latency_hist::max_bucket 9
+system.ruby.LD.L1Cache.miss_type_mach_latency_hist::samples 16155
+system.ruby.LD.L1Cache.miss_type_mach_latency_hist::mean 2
+system.ruby.LD.L1Cache.miss_type_mach_latency_hist::gmean 2.000000
+system.ruby.LD.L1Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 16155 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.LD.L1Cache.miss_type_mach_latency_hist::total 16155
+system.ruby.LD.L2Cache.miss_type_mach_latency_hist::bucket_size 4
+system.ruby.LD.L2Cache.miss_type_mach_latency_hist::max_bucket 39
+system.ruby.LD.L2Cache.miss_type_mach_latency_hist::samples 5
+system.ruby.LD.L2Cache.miss_type_mach_latency_hist::mean 20
+system.ruby.LD.L2Cache.miss_type_mach_latency_hist::gmean 20.000000
+system.ruby.LD.L2Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 5 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.LD.L2Cache.miss_type_mach_latency_hist::total 5
+system.ruby.LD.Directory.hit_type_mach_latency_hist::bucket_size 32
+system.ruby.LD.Directory.hit_type_mach_latency_hist::max_bucket 319
+system.ruby.LD.Directory.hit_type_mach_latency_hist::samples 175
+system.ruby.LD.Directory.hit_type_mach_latency_hist::mean 208.468571
+system.ruby.LD.Directory.hit_type_mach_latency_hist::gmean 208.231054
+system.ruby.LD.Directory.hit_type_mach_latency_hist::stdev 10.632194
+system.ruby.LD.Directory.hit_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 166 94.86% 94.86% | 9 5.14% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.LD.Directory.hit_type_mach_latency_hist::total 175
+system.ruby.ST.L1Cache.miss_type_mach_latency_hist::bucket_size 1
+system.ruby.ST.L1Cache.miss_type_mach_latency_hist::max_bucket 9
+system.ruby.ST.L1Cache.miss_type_mach_latency_hist::samples 10090
+system.ruby.ST.L1Cache.miss_type_mach_latency_hist::mean 2
+system.ruby.ST.L1Cache.miss_type_mach_latency_hist::gmean 2.000000
+system.ruby.ST.L1Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 10090 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.ST.L1Cache.miss_type_mach_latency_hist::total 10090
+system.ruby.ST.Directory.hit_type_mach_latency_hist::bucket_size 64
+system.ruby.ST.Directory.hit_type_mach_latency_hist::max_bucket 639
+system.ruby.ST.Directory.hit_type_mach_latency_hist::samples 322
+system.ruby.ST.Directory.hit_type_mach_latency_hist::mean 208.484472
+system.ruby.ST.Directory.hit_type_mach_latency_hist::gmean 208.014366
+system.ruby.ST.Directory.hit_type_mach_latency_hist::stdev 16.327683
+system.ruby.ST.Directory.hit_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 316 98.14% 98.14% | 3 0.93% 99.07% | 3 0.93% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.ST.Directory.hit_type_mach_latency_hist::total 322
+system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist::bucket_size 1
+system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist::max_bucket 9
+system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist::samples 86007
+system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist::mean 2
+system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist::gmean 2.000000
+system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 86007 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist::total 86007
+system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist::bucket_size 4
+system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist::max_bucket 39
+system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist::samples 54
+system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist::mean 20
+system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist::gmean 20.000000
+system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 54 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist::total 54
+system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::bucket_size 64
+system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::max_bucket 639
+system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::samples 1034
+system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::mean 208.444874
+system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::gmean 207.968565
+system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::stdev 16.462617
+system.ruby.IFETCH.Directory.hit_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 1011 97.78% 97.78% | 16 1.55% 99.32% | 7 0.68% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::total 1034
+system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist::bucket_size 1
+system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist::max_bucket 9
+system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist::samples 337
+system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist::mean 2
+system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist::gmean 2.000000
+system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 337 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist::total 337
+system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::bucket_size 32
+system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::max_bucket 319
+system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::samples 4
+system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::mean 206
+system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::gmean 206.000000
+system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 4 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::total 4
+system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist::bucket_size 1
+system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist::max_bucket 9
+system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist::samples 10
+system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist::mean 2
+system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist::gmean 2
+system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 10 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist::total 10
+system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist::bucket_size 1
+system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist::max_bucket 9
+system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist::samples 10
+system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist::mean 2
+system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist::gmean 2
+system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 10 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist::total 10
+system.ruby.SQC_Controller.Fetch 86 0.00% 0.00%
+system.ruby.SQC_Controller.TCC_AckS 5 0.00% 0.00%
+system.ruby.SQC_Controller.I.Fetch 5 0.00% 0.00%
+system.ruby.SQC_Controller.S.Fetch 81 0.00% 0.00%
+system.ruby.SQC_Controller.I_S.TCC_AckS 5 0.00% 0.00%
+system.ruby.TCCdir_Controller.RdBlk 53 0.00% 0.00%
+system.ruby.TCCdir_Controller.RdBlkM 36 0.00% 0.00%
+system.ruby.TCCdir_Controller.RdBlkS 5 0.00% 0.00%
+system.ruby.TCCdir_Controller.CPUPrbResp 14 0.00% 0.00%
+system.ruby.TCCdir_Controller.ProbeAcksComplete 13 0.00% 0.00%
+system.ruby.TCCdir_Controller.CoreUnblock 17 0.00% 0.00%
+system.ruby.TCCdir_Controller.LastCoreUnblock 2 0.00% 0.00%
+system.ruby.TCCdir_Controller.NB_AckS 7 0.00% 0.00%
+system.ruby.TCCdir_Controller.NB_AckM 9 0.00% 0.00%
+system.ruby.TCCdir_Controller.PrbInvData 326 0.00% 0.00%
+system.ruby.TCCdir_Controller.PrbShrData 1209 0.00% 0.00%
+system.ruby.TCCdir_Controller.I.RdBlk 2 0.00% 0.00%
+system.ruby.TCCdir_Controller.I.RdBlkM 9 0.00% 0.00%
+system.ruby.TCCdir_Controller.I.RdBlkS 5 0.00% 0.00%
+system.ruby.TCCdir_Controller.I.PrbInvData 325 0.00% 0.00%
+system.ruby.TCCdir_Controller.I.PrbShrData 1200 0.00% 0.00%
+system.ruby.TCCdir_Controller.S.RdBlk 2 0.00% 0.00%
+system.ruby.TCCdir_Controller.S.PrbInvData 1 0.00% 0.00%
+system.ruby.TCCdir_Controller.M.RdBlkM 1 0.00% 0.00%
+system.ruby.TCCdir_Controller.M.PrbShrData 9 0.00% 0.00%
+system.ruby.TCCdir_Controller.CP_I.CPUPrbResp 2 0.00% 0.00%
+system.ruby.TCCdir_Controller.CP_I.ProbeAcksComplete 1 0.00% 0.00%
+system.ruby.TCCdir_Controller.CP_O.CPUPrbResp 9 0.00% 0.00%
+system.ruby.TCCdir_Controller.CP_O.ProbeAcksComplete 9 0.00% 0.00%
+system.ruby.TCCdir_Controller.I_M.RdBlkM 22 0.00% 0.00%
+system.ruby.TCCdir_Controller.I_M.NB_AckM 9 0.00% 0.00%
+system.ruby.TCCdir_Controller.I_ES.RdBlk 41 0.00% 0.00%
+system.ruby.TCCdir_Controller.I_ES.NB_AckS 2 0.00% 0.00%
+system.ruby.TCCdir_Controller.I_S.NB_AckS 5 0.00% 0.00%
+system.ruby.TCCdir_Controller.BBS_S.CPUPrbResp 2 0.00% 0.00%
+system.ruby.TCCdir_Controller.BBS_S.ProbeAcksComplete 2 0.00% 0.00%
+system.ruby.TCCdir_Controller.BBM_M.CPUPrbResp 1 0.00% 0.00%
+system.ruby.TCCdir_Controller.BBM_M.ProbeAcksComplete 1 0.00% 0.00%
+system.ruby.TCCdir_Controller.BB_M.CoreUnblock 1 0.00% 0.00%
+system.ruby.TCCdir_Controller.BB_S.LastCoreUnblock 2 0.00% 0.00%
+system.ruby.TCCdir_Controller.BBB_S.RdBlk 8 0.00% 0.00%
+system.ruby.TCCdir_Controller.BBB_S.CoreUnblock 7 0.00% 0.00%
+system.ruby.TCCdir_Controller.BBB_M.RdBlkM 4 0.00% 0.00%
+system.ruby.TCCdir_Controller.BBB_M.CoreUnblock 9 0.00% 0.00%
+system.ruby.TCP_Controller.Load | 5 50.00% 50.00% | 5 50.00% 100.00%
+system.ruby.TCP_Controller.Load::total 10
+system.ruby.TCP_Controller.Store | 9 50.00% 50.00% | 9 50.00% 100.00%
+system.ruby.TCP_Controller.Store::total 18
+system.ruby.TCP_Controller.TCC_AckS | 2 50.00% 50.00% | 2 50.00% 100.00%
+system.ruby.TCP_Controller.TCC_AckS::total 4
+system.ruby.TCP_Controller.TCC_AckM | 5 50.00% 50.00% | 5 50.00% 100.00%
+system.ruby.TCP_Controller.TCC_AckM::total 10
+system.ruby.TCP_Controller.PrbInvData | 1 33.33% 33.33% | 2 66.67% 100.00%
+system.ruby.TCP_Controller.PrbInvData::total 3
+system.ruby.TCP_Controller.PrbShrData | 7 63.64% 63.64% | 4 36.36% 100.00%
+system.ruby.TCP_Controller.PrbShrData::total 11
+system.ruby.TCP_Controller.I.Load | 2 50.00% 50.00% | 2 50.00% 100.00%
+system.ruby.TCP_Controller.I.Load::total 4
+system.ruby.TCP_Controller.I.Store | 5 50.00% 50.00% | 5 50.00% 100.00%
+system.ruby.TCP_Controller.I.Store::total 10
+system.ruby.TCP_Controller.S.Load | 3 50.00% 50.00% | 3 50.00% 100.00%
+system.ruby.TCP_Controller.S.Load::total 6
+system.ruby.TCP_Controller.S.PrbInvData | 1 50.00% 50.00% | 1 50.00% 100.00%
+system.ruby.TCP_Controller.S.PrbInvData::total 2
+system.ruby.TCP_Controller.S.PrbShrData | 2 100.00% 100.00% | 0 0.00% 100.00%
+system.ruby.TCP_Controller.S.PrbShrData::total 2
+system.ruby.TCP_Controller.M.Store | 4 50.00% 50.00% | 4 50.00% 100.00%
+system.ruby.TCP_Controller.M.Store::total 8
+system.ruby.TCP_Controller.M.PrbInvData | 0 0.00% 0.00% | 1 100.00% 100.00%
+system.ruby.TCP_Controller.M.PrbInvData::total 1
+system.ruby.TCP_Controller.M.PrbShrData | 5 55.56% 55.56% | 4 44.44% 100.00%
+system.ruby.TCP_Controller.M.PrbShrData::total 9
+system.ruby.TCP_Controller.I_M.TCC_AckM | 5 50.00% 50.00% | 5 50.00% 100.00%
+system.ruby.TCP_Controller.I_M.TCC_AckM::total 10
+system.ruby.TCP_Controller.I_ES.TCC_AckS | 2 50.00% 50.00% | 2 50.00% 100.00%
+system.ruby.TCP_Controller.I_ES.TCC_AckS::total 4
+
+---------- End Simulation Statistics ----------
diff --git a/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER/config.ini b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER/config.ini
new file mode 100644
index 000000000..33ae7164f
--- /dev/null
+++ b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER/config.ini
@@ -0,0 +1,4063 @@
+[root]
+type=Root
+children=system
+eventq_index=0
+full_system=false
+sim_quantum=0
+time_sync_enable=false
+time_sync_period=100000000000
+time_sync_spin_threshold=100000000
+
+[system]
+type=System
+children=clk_domain cpu0 cpu1 cpu2 dispatcher_coalescer dispatcher_tlb dvfs_handler l1_coalescer0 l1_coalescer1 l1_tlb0 l1_tlb1 l2_coalescer l2_tlb l3_coalescer l3_tlb mem_ctrls piobus ruby sqc_coalescer sqc_tlb sys_port_proxy voltage_domain
+boot_osflags=a
+cache_line_size=64
+clk_domain=system.clk_domain
+eventq_index=0
+exit_on_work_items=false
+init_param=0
+kernel=
+kernel_addr_check=true
+load_addr_mask=1099511627775
+load_offset=0
+mem_mode=timing
+mem_ranges=0:536870911
+memories=system.mem_ctrls system.ruby.phys_mem
+mmap_using_noreserve=false
+multi_thread=false
+num_work_ids=16
+readfile=
+symbolfile=
+work_begin_ckpt_count=0
+work_begin_cpu_id_exit=-1
+work_begin_exit_count=0
+work_cpus_ckpt_count=0
+work_end_ckpt_count=0
+work_end_exit_count=0
+work_item_id=-1
+system_port=system.sys_port_proxy.slave[0]
+
+[system.clk_domain]
+type=SrcClockDomain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.voltage_domain
+
+[system.cpu0]
+type=TimingSimpleCPU
+children=apic_clk_domain clk_domain dtb interrupts isa itb tracer workload
+branchPred=Null
+checker=Null
+clk_domain=system.cpu0.clk_domain
+cpu_id=0
+do_checkpoint_insts=true
+do_quiesce=true
+do_statistics_insts=true
+dtb=system.cpu0.dtb
+eventq_index=0
+function_trace=false
+function_trace_start=0
+interrupts=system.cpu0.interrupts
+isa=system.cpu0.isa
+itb=system.cpu0.itb
+max_insts_all_threads=0
+max_insts_any_thread=0
+max_loads_all_threads=0
+max_loads_any_thread=0
+numThreads=1
+profile=0
+progress_interval=0
+simpoint_start_insts=
+socket_id=0
+switched_out=false
+system=system
+tracer=system.cpu0.tracer
+workload=system.cpu0.workload
+dcache_port=system.ruby.cp_cntrl0.sequencer.slave[1]
+icache_port=system.ruby.cp_cntrl0.sequencer.slave[0]
+
+[system.cpu0.apic_clk_domain]
+type=DerivedClockDomain
+clk_divider=16
+clk_domain=system.cpu0.clk_domain
+eventq_index=0
+
+[system.cpu0.clk_domain]
+type=SrcClockDomain
+clock=500
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.voltage_domain
+
+[system.cpu0.dtb]
+type=X86TLB
+children=walker
+eventq_index=0
+size=64
+walker=system.cpu0.dtb.walker
+
+[system.cpu0.dtb.walker]
+type=X86PagetableWalker
+clk_domain=system.cpu0.clk_domain
+eventq_index=0
+num_squash_per_cycle=4
+system=system
+port=system.ruby.cp_cntrl0.sequencer.slave[3]
+
+[system.cpu0.interrupts]
+type=X86LocalApic
+clk_domain=system.cpu0.apic_clk_domain
+eventq_index=0
+int_latency=1000
+pio_addr=2305843009213693952
+pio_latency=100000
+system=system
+int_master=system.ruby.cp_cntrl0.sequencer.slave[4]
+int_slave=system.ruby.cp_cntrl0.sequencer.master[1]
+pio=system.ruby.cp_cntrl0.sequencer.master[0]
+
+[system.cpu0.isa]
+type=X86ISA
+eventq_index=0
+
+[system.cpu0.itb]
+type=X86TLB
+children=walker
+eventq_index=0
+size=64
+walker=system.cpu0.itb.walker
+
+[system.cpu0.itb.walker]
+type=X86PagetableWalker
+clk_domain=system.cpu0.clk_domain
+eventq_index=0
+num_squash_per_cycle=4
+system=system
+port=system.ruby.cp_cntrl0.sequencer.slave[2]
+
+[system.cpu0.tracer]
+type=ExeTracer
+eventq_index=0
+
+[system.cpu0.workload]
+type=LiveProcess
+cmd=gpu-hello
+cwd=
+drivers=system.cpu2.cl_driver
+egid=100
+env=
+errout=cerr
+euid=100
+eventq_index=0
+executable=/dist/m5/regression/test-progs/gpu-hello/bin/x86/linux/gpu-hello
+gid=100
+input=cin
+kvmInSE=false
+max_stack_size=67108864
+output=cout
+pid=100
+ppid=99
+simpoint=0
+system=system
+uid=100
+useArchPT=false
+
+[system.cpu1]
+type=Shader
+children=CUs0 CUs1 clk_domain
+CUs=system.cpu1.CUs0 system.cpu1.CUs1
+clk_domain=system.cpu1.clk_domain
+cpu_pointer=system.cpu0
+eventq_index=0
+globalmem=65536
+impl_kern_boundary_sync=true
+n_wf=8
+separate_acquire_release=false
+timing=true
+translation=false
+
+[system.cpu1.CUs0]
+type=ComputeUnit
+children=ldsBus localDataStore vector_register_file0 vector_register_file1 vector_register_file2 vector_register_file3 wavefronts00 wavefronts01 wavefronts02 wavefronts03 wavefronts04 wavefronts05 wavefronts06 wavefronts07 wavefronts08 wavefronts09 wavefronts10 wavefronts11 wavefronts12 wavefronts13 wavefronts14 wavefronts15 wavefronts16 wavefronts17 wavefronts18 wavefronts19 wavefronts20 wavefronts21 wavefronts22 wavefronts23 wavefronts24 wavefronts25 wavefronts26 wavefronts27 wavefronts28 wavefronts29 wavefronts30 wavefronts31
+clk_domain=system.cpu1.clk_domain
+coalescer_to_vrf_bus_width=32
+countPages=false
+cu_id=0
+debugSegFault=false
+dpbypass_pipe_length=4
+eventq_index=0
+execPolicy=OLDEST-FIRST
+functionalTLB=true
+global_mem_queue_size=256
+issue_period=4
+localDataStore=system.cpu1.CUs0.localDataStore
+localMemBarrier=false
+local_mem_queue_size=256
+mem_req_latency=9
+mem_resp_latency=9
+n_wf=8
+num_SIMDs=4
+num_global_mem_pipes=1
+num_shared_mem_pipes=1
+perLaneTLB=false
+prefetch_depth=0
+prefetch_prev_type=PF_PHASE
+prefetch_stride=1
+spbypass_pipe_length=4
+system=system
+vector_register_file=system.cpu1.CUs0.vector_register_file0 system.cpu1.CUs0.vector_register_file1 system.cpu1.CUs0.vector_register_file2 system.cpu1.CUs0.vector_register_file3
+vrf_to_coalescer_bus_width=32
+wavefronts=system.cpu1.CUs0.wavefronts00 system.cpu1.CUs0.wavefronts01 system.cpu1.CUs0.wavefronts02 system.cpu1.CUs0.wavefronts03 system.cpu1.CUs0.wavefronts04 system.cpu1.CUs0.wavefronts05 system.cpu1.CUs0.wavefronts06 system.cpu1.CUs0.wavefronts07 system.cpu1.CUs0.wavefronts08 system.cpu1.CUs0.wavefronts09 system.cpu1.CUs0.wavefronts10 system.cpu1.CUs0.wavefronts11 system.cpu1.CUs0.wavefronts12 system.cpu1.CUs0.wavefronts13 system.cpu1.CUs0.wavefronts14 system.cpu1.CUs0.wavefronts15 system.cpu1.CUs0.wavefronts16 system.cpu1.CUs0.wavefronts17 system.cpu1.CUs0.wavefronts18 system.cpu1.CUs0.wavefronts19 system.cpu1.CUs0.wavefronts20 system.cpu1.CUs0.wavefronts21 system.cpu1.CUs0.wavefronts22 system.cpu1.CUs0.wavefronts23 system.cpu1.CUs0.wavefronts24 system.cpu1.CUs0.wavefronts25 system.cpu1.CUs0.wavefronts26 system.cpu1.CUs0.wavefronts27 system.cpu1.CUs0.wavefronts28 system.cpu1.CUs0.wavefronts29 system.cpu1.CUs0.wavefronts30 system.cpu1.CUs0.wavefronts31
+wfSize=64
+xactCasMode=false
+ldsPort=system.cpu1.CUs0.ldsBus.slave
+memory_port=system.ruby.tcp_cntrl0.coalescer.slave[0] system.ruby.tcp_cntrl0.coalescer.slave[1] system.ruby.tcp_cntrl0.coalescer.slave[2] system.ruby.tcp_cntrl0.coalescer.slave[3] system.ruby.tcp_cntrl0.coalescer.slave[4] system.ruby.tcp_cntrl0.coalescer.slave[5] system.ruby.tcp_cntrl0.coalescer.slave[6] system.ruby.tcp_cntrl0.coalescer.slave[7] system.ruby.tcp_cntrl0.coalescer.slave[8] system.ruby.tcp_cntrl0.coalescer.slave[9] system.ruby.tcp_cntrl0.coalescer.slave[10] system.ruby.tcp_cntrl0.coalescer.slave[11] system.ruby.tcp_cntrl0.coalescer.slave[12] system.ruby.tcp_cntrl0.coalescer.slave[13] system.ruby.tcp_cntrl0.coalescer.slave[14] system.ruby.tcp_cntrl0.coalescer.slave[15] system.ruby.tcp_cntrl0.coalescer.slave[16] system.ruby.tcp_cntrl0.coalescer.slave[17] system.ruby.tcp_cntrl0.coalescer.slave[18] system.ruby.tcp_cntrl0.coalescer.slave[19] system.ruby.tcp_cntrl0.coalescer.slave[20] system.ruby.tcp_cntrl0.coalescer.slave[21] system.ruby.tcp_cntrl0.coalescer.slave[22] system.ruby.tcp_cntrl0.coalescer.slave[23] system.ruby.tcp_cntrl0.coalescer.slave[24] system.ruby.tcp_cntrl0.coalescer.slave[25] system.ruby.tcp_cntrl0.coalescer.slave[26] system.ruby.tcp_cntrl0.coalescer.slave[27] system.ruby.tcp_cntrl0.coalescer.slave[28] system.ruby.tcp_cntrl0.coalescer.slave[29] system.ruby.tcp_cntrl0.coalescer.slave[30] system.ruby.tcp_cntrl0.coalescer.slave[31] system.ruby.tcp_cntrl0.coalescer.slave[32] system.ruby.tcp_cntrl0.coalescer.slave[33] system.ruby.tcp_cntrl0.coalescer.slave[34] system.ruby.tcp_cntrl0.coalescer.slave[35] system.ruby.tcp_cntrl0.coalescer.slave[36] system.ruby.tcp_cntrl0.coalescer.slave[37] system.ruby.tcp_cntrl0.coalescer.slave[38] system.ruby.tcp_cntrl0.coalescer.slave[39] system.ruby.tcp_cntrl0.coalescer.slave[40] system.ruby.tcp_cntrl0.coalescer.slave[41] system.ruby.tcp_cntrl0.coalescer.slave[42] system.ruby.tcp_cntrl0.coalescer.slave[43] system.ruby.tcp_cntrl0.coalescer.slave[44] system.ruby.tcp_cntrl0.coalescer.slave[45] 
system.ruby.tcp_cntrl0.coalescer.slave[46] system.ruby.tcp_cntrl0.coalescer.slave[47] system.ruby.tcp_cntrl0.coalescer.slave[48] system.ruby.tcp_cntrl0.coalescer.slave[49] system.ruby.tcp_cntrl0.coalescer.slave[50] system.ruby.tcp_cntrl0.coalescer.slave[51] system.ruby.tcp_cntrl0.coalescer.slave[52] system.ruby.tcp_cntrl0.coalescer.slave[53] system.ruby.tcp_cntrl0.coalescer.slave[54] system.ruby.tcp_cntrl0.coalescer.slave[55] system.ruby.tcp_cntrl0.coalescer.slave[56] system.ruby.tcp_cntrl0.coalescer.slave[57] system.ruby.tcp_cntrl0.coalescer.slave[58] system.ruby.tcp_cntrl0.coalescer.slave[59] system.ruby.tcp_cntrl0.coalescer.slave[60] system.ruby.tcp_cntrl0.coalescer.slave[61] system.ruby.tcp_cntrl0.coalescer.slave[62] system.ruby.tcp_cntrl0.coalescer.slave[63]
+sqc_port=system.ruby.sqc_cntrl0.sequencer.slave[0]
+sqc_tlb_port=system.sqc_coalescer.slave[0]
+translation_port=system.l1_coalescer0.slave[0]
+
+[system.cpu1.CUs0.ldsBus]
+type=Bridge
+clk_domain=system.cpu1.clk_domain
+delay=0
+eventq_index=0
+ranges=0:18446744073709551615
+req_size=16
+resp_size=16
+master=system.cpu1.CUs0.localDataStore.cuPort
+slave=system.cpu1.CUs0.ldsPort
+
+[system.cpu1.CUs0.localDataStore]
+type=LdsState
+bankConflictPenalty=1
+banks=32
+clk_domain=system.cpu1.clk_domain
+eventq_index=0
+range=0:65535
+size=65536
+cuPort=system.cpu1.CUs0.ldsBus.master
+
+[system.cpu1.CUs0.vector_register_file0]
+type=VectorRegisterFile
+eventq_index=0
+min_alloc=4
+num_regs_per_simd=2048
+simd_id=0
+
+[system.cpu1.CUs0.vector_register_file1]
+type=VectorRegisterFile
+eventq_index=0
+min_alloc=4
+num_regs_per_simd=2048
+simd_id=1
+
+[system.cpu1.CUs0.vector_register_file2]
+type=VectorRegisterFile
+eventq_index=0
+min_alloc=4
+num_regs_per_simd=2048
+simd_id=2
+
+[system.cpu1.CUs0.vector_register_file3]
+type=VectorRegisterFile
+eventq_index=0
+min_alloc=4
+num_regs_per_simd=2048
+simd_id=3
+
+[system.cpu1.CUs0.wavefronts00]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=0
+
+[system.cpu1.CUs0.wavefronts01]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=1
+
+[system.cpu1.CUs0.wavefronts02]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=2
+
+[system.cpu1.CUs0.wavefronts03]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=3
+
+[system.cpu1.CUs0.wavefronts04]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=4
+
+[system.cpu1.CUs0.wavefronts05]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=5
+
+[system.cpu1.CUs0.wavefronts06]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=6
+
+[system.cpu1.CUs0.wavefronts07]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=7
+
+[system.cpu1.CUs0.wavefronts08]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=0
+
+[system.cpu1.CUs0.wavefronts09]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=1
+
+[system.cpu1.CUs0.wavefronts10]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=2
+
+[system.cpu1.CUs0.wavefronts11]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=3
+
+[system.cpu1.CUs0.wavefronts12]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=4
+
+[system.cpu1.CUs0.wavefronts13]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=5
+
+[system.cpu1.CUs0.wavefronts14]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=6
+
+[system.cpu1.CUs0.wavefronts15]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=7
+
+[system.cpu1.CUs0.wavefronts16]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=0
+
+[system.cpu1.CUs0.wavefronts17]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=1
+
+[system.cpu1.CUs0.wavefronts18]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=2
+
+[system.cpu1.CUs0.wavefronts19]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=3
+
+[system.cpu1.CUs0.wavefronts20]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=4
+
+[system.cpu1.CUs0.wavefronts21]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=5
+
+[system.cpu1.CUs0.wavefronts22]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=6
+
+[system.cpu1.CUs0.wavefronts23]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=7
+
+[system.cpu1.CUs0.wavefronts24]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=0
+
+[system.cpu1.CUs0.wavefronts25]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=1
+
+[system.cpu1.CUs0.wavefronts26]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=2
+
+[system.cpu1.CUs0.wavefronts27]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=3
+
+[system.cpu1.CUs0.wavefronts28]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=4
+
+[system.cpu1.CUs0.wavefronts29]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=5
+
+[system.cpu1.CUs0.wavefronts30]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=6
+
+[system.cpu1.CUs0.wavefronts31]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=7
+
+[system.cpu1.CUs1]
+type=ComputeUnit
+children=ldsBus localDataStore vector_register_file0 vector_register_file1 vector_register_file2 vector_register_file3 wavefronts00 wavefronts01 wavefronts02 wavefronts03 wavefronts04 wavefronts05 wavefronts06 wavefronts07 wavefronts08 wavefronts09 wavefronts10 wavefronts11 wavefronts12 wavefronts13 wavefronts14 wavefronts15 wavefronts16 wavefronts17 wavefronts18 wavefronts19 wavefronts20 wavefronts21 wavefronts22 wavefronts23 wavefronts24 wavefronts25 wavefronts26 wavefronts27 wavefronts28 wavefronts29 wavefronts30 wavefronts31
+clk_domain=system.cpu1.clk_domain
+coalescer_to_vrf_bus_width=32
+countPages=false
+cu_id=1
+debugSegFault=false
+dpbypass_pipe_length=4
+eventq_index=0
+execPolicy=OLDEST-FIRST
+functionalTLB=true
+global_mem_queue_size=256
+issue_period=4
+localDataStore=system.cpu1.CUs1.localDataStore
+localMemBarrier=false
+local_mem_queue_size=256
+mem_req_latency=9
+mem_resp_latency=9
+n_wf=8
+num_SIMDs=4
+num_global_mem_pipes=1
+num_shared_mem_pipes=1
+perLaneTLB=false
+prefetch_depth=0
+prefetch_prev_type=PF_PHASE
+prefetch_stride=1
+spbypass_pipe_length=4
+system=system
+vector_register_file=system.cpu1.CUs1.vector_register_file0 system.cpu1.CUs1.vector_register_file1 system.cpu1.CUs1.vector_register_file2 system.cpu1.CUs1.vector_register_file3
+vrf_to_coalescer_bus_width=32
+wavefronts=system.cpu1.CUs1.wavefronts00 system.cpu1.CUs1.wavefronts01 system.cpu1.CUs1.wavefronts02 system.cpu1.CUs1.wavefronts03 system.cpu1.CUs1.wavefronts04 system.cpu1.CUs1.wavefronts05 system.cpu1.CUs1.wavefronts06 system.cpu1.CUs1.wavefronts07 system.cpu1.CUs1.wavefronts08 system.cpu1.CUs1.wavefronts09 system.cpu1.CUs1.wavefronts10 system.cpu1.CUs1.wavefronts11 system.cpu1.CUs1.wavefronts12 system.cpu1.CUs1.wavefronts13 system.cpu1.CUs1.wavefronts14 system.cpu1.CUs1.wavefronts15 system.cpu1.CUs1.wavefronts16 system.cpu1.CUs1.wavefronts17 system.cpu1.CUs1.wavefronts18 system.cpu1.CUs1.wavefronts19 system.cpu1.CUs1.wavefronts20 system.cpu1.CUs1.wavefronts21 system.cpu1.CUs1.wavefronts22 system.cpu1.CUs1.wavefronts23 system.cpu1.CUs1.wavefronts24 system.cpu1.CUs1.wavefronts25 system.cpu1.CUs1.wavefronts26 system.cpu1.CUs1.wavefronts27 system.cpu1.CUs1.wavefronts28 system.cpu1.CUs1.wavefronts29 system.cpu1.CUs1.wavefronts30 system.cpu1.CUs1.wavefronts31
+wfSize=64
+xactCasMode=false
+ldsPort=system.cpu1.CUs1.ldsBus.slave
+memory_port=system.ruby.tcp_cntrl1.coalescer.slave[0] system.ruby.tcp_cntrl1.coalescer.slave[1] system.ruby.tcp_cntrl1.coalescer.slave[2] system.ruby.tcp_cntrl1.coalescer.slave[3] system.ruby.tcp_cntrl1.coalescer.slave[4] system.ruby.tcp_cntrl1.coalescer.slave[5] system.ruby.tcp_cntrl1.coalescer.slave[6] system.ruby.tcp_cntrl1.coalescer.slave[7] system.ruby.tcp_cntrl1.coalescer.slave[8] system.ruby.tcp_cntrl1.coalescer.slave[9] system.ruby.tcp_cntrl1.coalescer.slave[10] system.ruby.tcp_cntrl1.coalescer.slave[11] system.ruby.tcp_cntrl1.coalescer.slave[12] system.ruby.tcp_cntrl1.coalescer.slave[13] system.ruby.tcp_cntrl1.coalescer.slave[14] system.ruby.tcp_cntrl1.coalescer.slave[15] system.ruby.tcp_cntrl1.coalescer.slave[16] system.ruby.tcp_cntrl1.coalescer.slave[17] system.ruby.tcp_cntrl1.coalescer.slave[18] system.ruby.tcp_cntrl1.coalescer.slave[19] system.ruby.tcp_cntrl1.coalescer.slave[20] system.ruby.tcp_cntrl1.coalescer.slave[21] system.ruby.tcp_cntrl1.coalescer.slave[22] system.ruby.tcp_cntrl1.coalescer.slave[23] system.ruby.tcp_cntrl1.coalescer.slave[24] system.ruby.tcp_cntrl1.coalescer.slave[25] system.ruby.tcp_cntrl1.coalescer.slave[26] system.ruby.tcp_cntrl1.coalescer.slave[27] system.ruby.tcp_cntrl1.coalescer.slave[28] system.ruby.tcp_cntrl1.coalescer.slave[29] system.ruby.tcp_cntrl1.coalescer.slave[30] system.ruby.tcp_cntrl1.coalescer.slave[31] system.ruby.tcp_cntrl1.coalescer.slave[32] system.ruby.tcp_cntrl1.coalescer.slave[33] system.ruby.tcp_cntrl1.coalescer.slave[34] system.ruby.tcp_cntrl1.coalescer.slave[35] system.ruby.tcp_cntrl1.coalescer.slave[36] system.ruby.tcp_cntrl1.coalescer.slave[37] system.ruby.tcp_cntrl1.coalescer.slave[38] system.ruby.tcp_cntrl1.coalescer.slave[39] system.ruby.tcp_cntrl1.coalescer.slave[40] system.ruby.tcp_cntrl1.coalescer.slave[41] system.ruby.tcp_cntrl1.coalescer.slave[42] system.ruby.tcp_cntrl1.coalescer.slave[43] system.ruby.tcp_cntrl1.coalescer.slave[44] system.ruby.tcp_cntrl1.coalescer.slave[45] 
system.ruby.tcp_cntrl1.coalescer.slave[46] system.ruby.tcp_cntrl1.coalescer.slave[47] system.ruby.tcp_cntrl1.coalescer.slave[48] system.ruby.tcp_cntrl1.coalescer.slave[49] system.ruby.tcp_cntrl1.coalescer.slave[50] system.ruby.tcp_cntrl1.coalescer.slave[51] system.ruby.tcp_cntrl1.coalescer.slave[52] system.ruby.tcp_cntrl1.coalescer.slave[53] system.ruby.tcp_cntrl1.coalescer.slave[54] system.ruby.tcp_cntrl1.coalescer.slave[55] system.ruby.tcp_cntrl1.coalescer.slave[56] system.ruby.tcp_cntrl1.coalescer.slave[57] system.ruby.tcp_cntrl1.coalescer.slave[58] system.ruby.tcp_cntrl1.coalescer.slave[59] system.ruby.tcp_cntrl1.coalescer.slave[60] system.ruby.tcp_cntrl1.coalescer.slave[61] system.ruby.tcp_cntrl1.coalescer.slave[62] system.ruby.tcp_cntrl1.coalescer.slave[63]
+sqc_port=system.ruby.sqc_cntrl0.sequencer.slave[1]
+sqc_tlb_port=system.sqc_coalescer.slave[1]
+translation_port=system.l1_coalescer1.slave[0]
+
+[system.cpu1.CUs1.ldsBus]
+type=Bridge
+clk_domain=system.cpu1.clk_domain
+delay=0
+eventq_index=0
+ranges=0:18446744073709551615
+req_size=16
+resp_size=16
+master=system.cpu1.CUs1.localDataStore.cuPort
+slave=system.cpu1.CUs1.ldsPort
+
+[system.cpu1.CUs1.localDataStore]
+type=LdsState
+bankConflictPenalty=1
+banks=32
+clk_domain=system.cpu1.clk_domain
+eventq_index=0
+range=0:65535
+size=65536
+cuPort=system.cpu1.CUs1.ldsBus.master
+
+[system.cpu1.CUs1.vector_register_file0]
+type=VectorRegisterFile
+eventq_index=0
+min_alloc=4
+num_regs_per_simd=2048
+simd_id=0
+
+[system.cpu1.CUs1.vector_register_file1]
+type=VectorRegisterFile
+eventq_index=0
+min_alloc=4
+num_regs_per_simd=2048
+simd_id=1
+
+[system.cpu1.CUs1.vector_register_file2]
+type=VectorRegisterFile
+eventq_index=0
+min_alloc=4
+num_regs_per_simd=2048
+simd_id=2
+
+[system.cpu1.CUs1.vector_register_file3]
+type=VectorRegisterFile
+eventq_index=0
+min_alloc=4
+num_regs_per_simd=2048
+simd_id=3
+
+[system.cpu1.CUs1.wavefronts00]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=0
+
+[system.cpu1.CUs1.wavefronts01]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=1
+
+[system.cpu1.CUs1.wavefronts02]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=2
+
+[system.cpu1.CUs1.wavefronts03]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=3
+
+[system.cpu1.CUs1.wavefronts04]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=4
+
+[system.cpu1.CUs1.wavefronts05]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=5
+
+[system.cpu1.CUs1.wavefronts06]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=6
+
+[system.cpu1.CUs1.wavefronts07]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=7
+
+[system.cpu1.CUs1.wavefronts08]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=0
+
+[system.cpu1.CUs1.wavefronts09]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=1
+
+[system.cpu1.CUs1.wavefronts10]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=2
+
+[system.cpu1.CUs1.wavefronts11]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=3
+
+[system.cpu1.CUs1.wavefronts12]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=4
+
+[system.cpu1.CUs1.wavefronts13]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=5
+
+[system.cpu1.CUs1.wavefronts14]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=6
+
+[system.cpu1.CUs1.wavefronts15]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=7
+
+[system.cpu1.CUs1.wavefronts16]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=0
+
+[system.cpu1.CUs1.wavefronts17]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=1
+
+[system.cpu1.CUs1.wavefronts18]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=2
+
+[system.cpu1.CUs1.wavefronts19]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=3
+
+[system.cpu1.CUs1.wavefronts20]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=4
+
+[system.cpu1.CUs1.wavefronts21]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=5
+
+[system.cpu1.CUs1.wavefronts22]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=6
+
+[system.cpu1.CUs1.wavefronts23]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=7
+
+[system.cpu1.CUs1.wavefronts24]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=0
+
+[system.cpu1.CUs1.wavefronts25]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=1
+
+[system.cpu1.CUs1.wavefronts26]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=2
+
+[system.cpu1.CUs1.wavefronts27]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=3
+
+[system.cpu1.CUs1.wavefronts28]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=4
+
+[system.cpu1.CUs1.wavefronts29]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=5
+
+[system.cpu1.CUs1.wavefronts30]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=6
+
+[system.cpu1.CUs1.wavefronts31]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=7
+
+[system.cpu1.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.cpu1.clk_domain.voltage_domain
+
+[system.cpu1.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.cpu2]
+type=GpuDispatcher
+children=cl_driver
+cl_driver=system.cpu2.cl_driver
+clk_domain=system.clk_domain
+cpu=system.cpu0
+eventq_index=0
+pio_addr=8589934592
+pio_latency=1000
+shader_pointer=system.cpu1
+system=system
+dma=system.piobus.slave[1]
+pio=system.piobus.master[0]
+translation_port=system.dispatcher_coalescer.slave[0]
+
+[system.cpu2.cl_driver]
+type=ClDriver
+codefile=/dist/m5/regression/test-progs/gpu-hello/bin/x86/linux/gpu-hello-kernel.asm
+eventq_index=0
+filename=hsa
+
+[system.dispatcher_coalescer]
+type=TLBCoalescer
+children=clk_domain
+clk_domain=system.dispatcher_coalescer.clk_domain
+coalescingWindow=1
+disableCoalescing=false
+eventq_index=0
+probesPerCycle=2
+master=system.dispatcher_tlb.slave[0]
+slave=system.cpu2.translation_port
+
+[system.dispatcher_coalescer.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.dispatcher_coalescer.clk_domain.voltage_domain
+
+[system.dispatcher_coalescer.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.dispatcher_tlb]
+type=X86GPUTLB
+children=clk_domain
+accessDistance=false
+allocationPolicy=true
+assoc=32
+clk_domain=system.dispatcher_tlb.clk_domain
+eventq_index=0
+hitLatency=1
+maxOutstandingReqs=64
+missLatency1=5
+missLatency2=750
+size=32
+master=system.l2_coalescer.slave[1]
+slave=system.dispatcher_coalescer.master[0]
+
+[system.dispatcher_tlb.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.dispatcher_tlb.clk_domain.voltage_domain
+
+[system.dispatcher_tlb.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.dvfs_handler]
+type=DVFSHandler
+domains=
+enable=false
+eventq_index=0
+sys_clk_domain=system.clk_domain
+transition_latency=100000000
+
+[system.l1_coalescer0]
+type=TLBCoalescer
+children=clk_domain
+clk_domain=system.l1_coalescer0.clk_domain
+coalescingWindow=1
+disableCoalescing=false
+eventq_index=0
+probesPerCycle=2
+master=system.l1_tlb0.slave[0]
+slave=system.cpu1.CUs0.translation_port[0]
+
+[system.l1_coalescer0.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.l1_coalescer0.clk_domain.voltage_domain
+
+[system.l1_coalescer0.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.l1_coalescer1]
+type=TLBCoalescer
+children=clk_domain
+clk_domain=system.l1_coalescer1.clk_domain
+coalescingWindow=1
+disableCoalescing=false
+eventq_index=0
+probesPerCycle=2
+master=system.l1_tlb1.slave[0]
+slave=system.cpu1.CUs1.translation_port[0]
+
+[system.l1_coalescer1.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.l1_coalescer1.clk_domain.voltage_domain
+
+[system.l1_coalescer1.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.l1_tlb0]
+type=X86GPUTLB
+children=clk_domain
+accessDistance=false
+allocationPolicy=true
+assoc=32
+clk_domain=system.l1_tlb0.clk_domain
+eventq_index=0
+hitLatency=1
+maxOutstandingReqs=64
+missLatency1=5
+missLatency2=750
+size=32
+master=system.l2_coalescer.slave[2]
+slave=system.l1_coalescer0.master[0]
+
+[system.l1_tlb0.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.l1_tlb0.clk_domain.voltage_domain
+
+[system.l1_tlb0.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.l1_tlb1]
+type=X86GPUTLB
+children=clk_domain
+accessDistance=false
+allocationPolicy=true
+assoc=32
+clk_domain=system.l1_tlb1.clk_domain
+eventq_index=0
+hitLatency=1
+maxOutstandingReqs=64
+missLatency1=5
+missLatency2=750
+size=32
+master=system.l2_coalescer.slave[3]
+slave=system.l1_coalescer1.master[0]
+
+[system.l1_tlb1.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.l1_tlb1.clk_domain.voltage_domain
+
+[system.l1_tlb1.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.l2_coalescer]
+type=TLBCoalescer
+children=clk_domain
+clk_domain=system.l2_coalescer.clk_domain
+coalescingWindow=1
+disableCoalescing=false
+eventq_index=0
+probesPerCycle=2
+master=system.l2_tlb.slave[0]
+slave=system.sqc_tlb.master[0] system.dispatcher_tlb.master[0] system.l1_tlb0.master[0] system.l1_tlb1.master[0]
+
+[system.l2_coalescer.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.l2_coalescer.clk_domain.voltage_domain
+
+[system.l2_coalescer.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.l2_tlb]
+type=X86GPUTLB
+children=clk_domain
+accessDistance=false
+allocationPolicy=true
+assoc=32
+clk_domain=system.l2_tlb.clk_domain
+eventq_index=0
+hitLatency=69
+maxOutstandingReqs=64
+missLatency1=5
+missLatency2=750
+size=4096
+master=system.l3_coalescer.slave[0]
+slave=system.l2_coalescer.master[0]
+
+[system.l2_tlb.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.l2_tlb.clk_domain.voltage_domain
+
+[system.l2_tlb.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.l3_coalescer]
+type=TLBCoalescer
+children=clk_domain
+clk_domain=system.l3_coalescer.clk_domain
+coalescingWindow=1
+disableCoalescing=false
+eventq_index=0
+probesPerCycle=2
+master=system.l3_tlb.slave[0]
+slave=system.l2_tlb.master[0]
+
+[system.l3_coalescer.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.l3_coalescer.clk_domain.voltage_domain
+
+[system.l3_coalescer.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.l3_tlb]
+type=X86GPUTLB
+children=clk_domain
+accessDistance=false
+allocationPolicy=true
+assoc=32
+clk_domain=system.l3_tlb.clk_domain
+eventq_index=0
+hitLatency=150
+maxOutstandingReqs=64
+missLatency1=5
+missLatency2=750
+size=8192
+slave=system.l3_coalescer.master[0]
+
+[system.l3_tlb.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.l3_tlb.clk_domain.voltage_domain
+
+[system.l3_tlb.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.mem_ctrls]
+type=DRAMCtrl
+IDD0=0.075000
+IDD02=0.000000
+IDD2N=0.050000
+IDD2N2=0.000000
+IDD2P0=0.000000
+IDD2P02=0.000000
+IDD2P1=0.000000
+IDD2P12=0.000000
+IDD3N=0.057000
+IDD3N2=0.000000
+IDD3P0=0.000000
+IDD3P02=0.000000
+IDD3P1=0.000000
+IDD3P12=0.000000
+IDD4R=0.187000
+IDD4R2=0.000000
+IDD4W=0.165000
+IDD4W2=0.000000
+IDD5=0.220000
+IDD52=0.000000
+IDD6=0.000000
+IDD62=0.000000
+VDD=1.500000
+VDD2=0.000000
+activation_limit=4
+addr_mapping=RoRaBaCoCh
+bank_groups_per_rank=0
+banks_per_rank=8
+burst_length=8
+channels=1
+clk_domain=system.clk_domain
+conf_table_reported=true
+device_bus_width=8
+device_rowbuffer_size=1024
+device_size=536870912
+devices_per_rank=8
+dll=true
+eventq_index=0
+in_addr_map=true
+max_accesses_per_row=16
+mem_sched_policy=frfcfs
+min_writes_per_switch=16
+null=false
+page_policy=open_adaptive
+range=0:536870911
+ranks_per_channel=2
+read_buffer_size=32
+static_backend_latency=10000
+static_frontend_latency=10000
+tBURST=5000
+tCCD_L=0
+tCK=1250
+tCL=13750
+tCS=2500
+tRAS=35000
+tRCD=13750
+tREFI=7800000
+tRFC=260000
+tRP=13750
+tRRD=6000
+tRRD_L=0
+tRTP=7500
+tRTW=2500
+tWR=15000
+tWTR=7500
+tXAW=30000
+tXP=0
+tXPDLL=0
+tXS=0
+tXSDLL=0
+write_buffer_size=64
+write_high_thresh_perc=85
+write_low_thresh_perc=50
+port=system.ruby.dir_cntrl0.memory
+
+[system.piobus]
+type=NoncoherentXBar
+clk_domain=system.clk_domain
+eventq_index=0
+forward_latency=0
+frontend_latency=0
+response_latency=0
+use_default_range=false
+width=32
+master=system.cpu2.pio
+slave=system.ruby.cp_cntrl0.sequencer.mem_master_port system.cpu2.dma
+
+[system.ruby]
+type=RubySystem
+children=clk_domain cp_cntrl0 dir_cntrl0 network phys_mem sqc_cntrl0 tcc_cntrl0 tcp_cntrl0 tcp_cntrl1
+access_backing_store=true
+all_instructions=false
+block_size_bytes=64
+clk_domain=system.ruby.clk_domain
+eventq_index=0
+hot_lines=false
+memory_size_bits=48
+num_of_sequencers=5
+number_of_virtual_networks=10
+phys_mem=system.ruby.phys_mem
+randomization=false
+
+[system.ruby.clk_domain]
+type=SrcClockDomain
+clock=500
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.voltage_domain
+
+[system.ruby.cp_cntrl0]
+type=CorePair_Controller
+children=L1D0cache L1D1cache L1Icache L2cache mandatoryQueue probeToCore requestFromCore responseFromCore responseToCore sequencer sequencer1 triggerQueue unblockFromCore
+L1D0cache=system.ruby.cp_cntrl0.L1D0cache
+L1D1cache=system.ruby.cp_cntrl0.L1D1cache
+L1Icache=system.ruby.cp_cntrl0.L1Icache
+L2cache=system.ruby.cp_cntrl0.L2cache
+buffer_size=0
+clk_domain=system.ruby.clk_domain
+cluster_id=0
+eventq_index=0
+issue_latency=120
+l2_hit_latency=18
+mandatoryQueue=system.ruby.cp_cntrl0.mandatoryQueue
+number_of_TBEs=256
+probeToCore=system.ruby.cp_cntrl0.probeToCore
+recycle_latency=10
+requestFromCore=system.ruby.cp_cntrl0.requestFromCore
+responseFromCore=system.ruby.cp_cntrl0.responseFromCore
+responseToCore=system.ruby.cp_cntrl0.responseToCore
+ruby_system=system.ruby
+send_evictions=true
+sequencer=system.ruby.cp_cntrl0.sequencer
+sequencer1=system.ruby.cp_cntrl0.sequencer1
+system=system
+transitions_per_cycle=32
+triggerQueue=system.ruby.cp_cntrl0.triggerQueue
+unblockFromCore=system.ruby.cp_cntrl0.unblockFromCore
+version=0
+
+[system.ruby.cp_cntrl0.L1D0cache]
+type=RubyCache
+children=replacement_policy
+assoc=2
+block_size=0
+dataAccessLatency=1
+dataArrayBanks=2
+eventq_index=0
+is_icache=false
+replacement_policy=system.ruby.cp_cntrl0.L1D0cache.replacement_policy
+resourceStalls=false
+ruby_system=system.ruby
+size=65536
+start_index_bit=6
+tagAccessLatency=1
+tagArrayBanks=2
+
+[system.ruby.cp_cntrl0.L1D0cache.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=2
+block_size=64
+eventq_index=0
+size=65536
+
+[system.ruby.cp_cntrl0.L1D1cache]
+type=RubyCache
+children=replacement_policy
+assoc=2
+block_size=0
+dataAccessLatency=1
+dataArrayBanks=2
+eventq_index=0
+is_icache=false
+replacement_policy=system.ruby.cp_cntrl0.L1D1cache.replacement_policy
+resourceStalls=false
+ruby_system=system.ruby
+size=65536
+start_index_bit=6
+tagAccessLatency=1
+tagArrayBanks=2
+
+[system.ruby.cp_cntrl0.L1D1cache.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=2
+block_size=64
+eventq_index=0
+size=65536
+
+[system.ruby.cp_cntrl0.L1Icache]
+type=RubyCache
+children=replacement_policy
+assoc=2
+block_size=0
+dataAccessLatency=1
+dataArrayBanks=2
+eventq_index=0
+is_icache=false
+replacement_policy=system.ruby.cp_cntrl0.L1Icache.replacement_policy
+resourceStalls=false
+ruby_system=system.ruby
+size=32768
+start_index_bit=6
+tagAccessLatency=1
+tagArrayBanks=2
+
+[system.ruby.cp_cntrl0.L1Icache.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=2
+block_size=64
+eventq_index=0
+size=32768
+
+[system.ruby.cp_cntrl0.L2cache]
+type=RubyCache
+children=replacement_policy
+assoc=8
+block_size=0
+dataAccessLatency=1
+dataArrayBanks=16
+eventq_index=0
+is_icache=false
+replacement_policy=system.ruby.cp_cntrl0.L2cache.replacement_policy
+resourceStalls=false
+ruby_system=system.ruby
+size=2097152
+start_index_bit=6
+tagAccessLatency=1
+tagArrayBanks=16
+
+[system.ruby.cp_cntrl0.L2cache.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=8
+block_size=64
+eventq_index=0
+size=2097152
+
+[system.ruby.cp_cntrl0.mandatoryQueue]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+
+[system.ruby.cp_cntrl0.probeToCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+slave=system.ruby.network.master[3]
+
+[system.ruby.cp_cntrl0.requestFromCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+master=system.ruby.network.slave[2]
+
+[system.ruby.cp_cntrl0.responseFromCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+master=system.ruby.network.slave[3]
+
+[system.ruby.cp_cntrl0.responseToCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+slave=system.ruby.network.master[4]
+
+[system.ruby.cp_cntrl0.sequencer]
+type=RubySequencer
+clk_domain=system.ruby.clk_domain
+coreid=0
+dcache=system.ruby.cp_cntrl0.L1D0cache
+dcache_hit_latency=1
+deadlock_threshold=500000
+eventq_index=0
+icache=system.ruby.cp_cntrl0.L1Icache
+icache_hit_latency=1
+is_cpu_sequencer=true
+max_outstanding_requests=16
+no_retry_on_stall=false
+ruby_system=system.ruby
+support_data_reqs=true
+support_inst_reqs=true
+system=system
+using_network_tester=false
+using_ruby_tester=false
+version=0
+master=system.cpu0.interrupts.pio system.cpu0.interrupts.int_slave
+mem_master_port=system.piobus.slave[0]
+slave=system.cpu0.icache_port system.cpu0.dcache_port system.cpu0.itb.walker.port system.cpu0.dtb.walker.port system.cpu0.interrupts.int_master
+
+[system.ruby.cp_cntrl0.sequencer1]
+type=RubySequencer
+clk_domain=system.ruby.clk_domain
+coreid=1
+dcache=system.ruby.cp_cntrl0.L1D1cache
+dcache_hit_latency=1
+deadlock_threshold=500000
+eventq_index=0
+icache=system.ruby.cp_cntrl0.L1Icache
+icache_hit_latency=1
+is_cpu_sequencer=true
+max_outstanding_requests=16
+no_retry_on_stall=false
+ruby_system=system.ruby
+support_data_reqs=true
+support_inst_reqs=true
+system=system
+using_network_tester=false
+using_ruby_tester=false
+version=1
+
+[system.ruby.cp_cntrl0.triggerQueue]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.cp_cntrl0.unblockFromCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+master=system.ruby.network.slave[4]
+
+[system.ruby.dir_cntrl0]
+type=Directory_Controller
+children=L3CacheMemory L3triggerQueue directory probeToCore requestFromCores responseFromCores responseFromMemory responseToCore triggerQueue unblockFromCores
+CPUonly=false
+L3CacheMemory=system.ruby.dir_cntrl0.L3CacheMemory
+L3triggerQueue=system.ruby.dir_cntrl0.L3triggerQueue
+TCC_select_num_bits=0
+buffer_size=0
+clk_domain=system.ruby.clk_domain
+cluster_id=0
+directory=system.ruby.dir_cntrl0.directory
+eventq_index=0
+l3_hit_latency=15
+noTCCdir=true
+number_of_TBEs=256
+probeToCore=system.ruby.dir_cntrl0.probeToCore
+recycle_latency=10
+requestFromCores=system.ruby.dir_cntrl0.requestFromCores
+responseFromCores=system.ruby.dir_cntrl0.responseFromCores
+responseFromMemory=system.ruby.dir_cntrl0.responseFromMemory
+responseToCore=system.ruby.dir_cntrl0.responseToCore
+response_latency=30
+ruby_system=system.ruby
+system=system
+to_memory_controller_latency=1
+transitions_per_cycle=32
+triggerQueue=system.ruby.dir_cntrl0.triggerQueue
+unblockFromCores=system.ruby.dir_cntrl0.unblockFromCores
+useL3OnWT=false
+version=0
+memory=system.mem_ctrls.port
+
+[system.ruby.dir_cntrl0.L3CacheMemory]
+type=RubyCache
+children=replacement_policy
+assoc=16
+block_size=0
+dataAccessLatency=20
+dataArrayBanks=16.0
+eventq_index=0
+is_icache=false
+replacement_policy=system.ruby.dir_cntrl0.L3CacheMemory.replacement_policy
+resourceStalls=false
+ruby_system=system.ruby
+size=16777216
+start_index_bit=6
+tagAccessLatency=15
+tagArrayBanks=16.0
+
+[system.ruby.dir_cntrl0.L3CacheMemory.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=16
+block_size=64
+eventq_index=0
+size=16777216
+
+[system.ruby.dir_cntrl0.L3triggerQueue]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.dir_cntrl0.directory]
+type=RubyDirectoryMemory
+eventq_index=0
+numa_high_bit=5
+size=536870912
+version=0
+
+[system.ruby.dir_cntrl0.probeToCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+master=system.ruby.network.slave[0]
+
+[system.ruby.dir_cntrl0.requestFromCores]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[0]
+
+[system.ruby.dir_cntrl0.responseFromCores]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+slave=system.ruby.network.master[1]
+
+[system.ruby.dir_cntrl0.responseFromMemory]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+
+[system.ruby.dir_cntrl0.responseToCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+master=system.ruby.network.slave[1]
+
+[system.ruby.dir_cntrl0.triggerQueue]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.dir_cntrl0.unblockFromCores]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+slave=system.ruby.network.master[2]
+
+[system.ruby.network]
+type=SimpleNetwork
+children=ext_links0 ext_links1 ext_links2 ext_links3 ext_links4 ext_links5 int_link_buffers00 int_link_buffers01 int_link_buffers02 int_link_buffers03 int_link_buffers04 int_link_buffers05 int_link_buffers06 int_link_buffers07 int_link_buffers08 int_link_buffers09 int_link_buffers10 int_link_buffers11 int_link_buffers12 int_link_buffers13 int_link_buffers14 int_link_buffers15 int_link_buffers16 int_link_buffers17 int_link_buffers18 int_link_buffers19 int_link_buffers20 int_link_buffers21 int_link_buffers22 int_link_buffers23 int_link_buffers24 int_link_buffers25 int_link_buffers26 int_link_buffers27 int_link_buffers28 int_link_buffers29 int_link_buffers30 int_link_buffers31 int_link_buffers32 int_link_buffers33 int_link_buffers34 int_link_buffers35 int_link_buffers36 int_link_buffers37 int_link_buffers38 int_link_buffers39 int_links0 int_links1
+adaptive_routing=false
+buffer_size=0
+clk_domain=system.ruby.clk_domain
+control_msg_size=8
+endpoint_bandwidth=1000
+eventq_index=0
+ext_links=system.ruby.network.ext_links0 system.ruby.network.ext_links1 system.ruby.network.ext_links2 system.ruby.network.ext_links3 system.ruby.network.ext_links4 system.ruby.network.ext_links5
+int_link_buffers=system.ruby.network.int_link_buffers00 system.ruby.network.int_link_buffers01 system.ruby.network.int_link_buffers02 system.ruby.network.int_link_buffers03 system.ruby.network.int_link_buffers04 system.ruby.network.int_link_buffers05 system.ruby.network.int_link_buffers06 system.ruby.network.int_link_buffers07 system.ruby.network.int_link_buffers08 system.ruby.network.int_link_buffers09 system.ruby.network.int_link_buffers10 system.ruby.network.int_link_buffers11 system.ruby.network.int_link_buffers12 system.ruby.network.int_link_buffers13 system.ruby.network.int_link_buffers14 system.ruby.network.int_link_buffers15 system.ruby.network.int_link_buffers16 system.ruby.network.int_link_buffers17 system.ruby.network.int_link_buffers18 system.ruby.network.int_link_buffers19 system.ruby.network.int_link_buffers20 system.ruby.network.int_link_buffers21 system.ruby.network.int_link_buffers22 system.ruby.network.int_link_buffers23 system.ruby.network.int_link_buffers24 system.ruby.network.int_link_buffers25 system.ruby.network.int_link_buffers26 system.ruby.network.int_link_buffers27 system.ruby.network.int_link_buffers28 system.ruby.network.int_link_buffers29 system.ruby.network.int_link_buffers30 system.ruby.network.int_link_buffers31 system.ruby.network.int_link_buffers32 system.ruby.network.int_link_buffers33 system.ruby.network.int_link_buffers34 system.ruby.network.int_link_buffers35 system.ruby.network.int_link_buffers36 system.ruby.network.int_link_buffers37 system.ruby.network.int_link_buffers38 system.ruby.network.int_link_buffers39
+int_links=system.ruby.network.int_links0 system.ruby.network.int_links1
+netifs=
+number_of_virtual_networks=10
+routers=system.ruby.network.ext_links0.int_node system.ruby.network.ext_links1.int_node system.ruby.network.ext_links2.int_node
+ruby_system=system.ruby
+topology=Crossbar
+master=system.ruby.dir_cntrl0.requestFromCores.slave system.ruby.dir_cntrl0.responseFromCores.slave system.ruby.dir_cntrl0.unblockFromCores.slave system.ruby.cp_cntrl0.probeToCore.slave system.ruby.cp_cntrl0.responseToCore.slave system.ruby.tcp_cntrl0.probeToTCP.slave system.ruby.tcp_cntrl0.responseToTCP.slave system.ruby.tcp_cntrl1.probeToTCP.slave system.ruby.tcp_cntrl1.responseToTCP.slave system.ruby.sqc_cntrl0.probeToSQC.slave system.ruby.sqc_cntrl0.responseToSQC.slave system.ruby.tcc_cntrl0.requestFromTCP.slave system.ruby.tcc_cntrl0.probeFromNB.slave system.ruby.tcc_cntrl0.responseFromNB.slave
+slave=system.ruby.dir_cntrl0.probeToCore.master system.ruby.dir_cntrl0.responseToCore.master system.ruby.cp_cntrl0.requestFromCore.master system.ruby.cp_cntrl0.responseFromCore.master system.ruby.cp_cntrl0.unblockFromCore.master system.ruby.tcp_cntrl0.requestFromTCP.master system.ruby.tcp_cntrl0.responseFromTCP.master system.ruby.tcp_cntrl0.unblockFromCore.master system.ruby.tcp_cntrl1.requestFromTCP.master system.ruby.tcp_cntrl1.responseFromTCP.master system.ruby.tcp_cntrl1.unblockFromCore.master system.ruby.sqc_cntrl0.requestFromSQC.master system.ruby.tcc_cntrl0.responseToCore.master system.ruby.tcc_cntrl0.requestToNB.master system.ruby.tcc_cntrl0.responseToNB.master system.ruby.tcc_cntrl0.unblockToNB.master
+
+[system.ruby.network.ext_links0]
+type=SimpleExtLink
+children=int_node
+bandwidth_factor=8
+eventq_index=0
+ext_node=system.ruby.dir_cntrl0
+int_node=system.ruby.network.ext_links0.int_node
+latency=1
+link_id=0
+weight=1
+
+[system.ruby.network.ext_links0.int_node]
+type=Switch
+children=port_buffers00 port_buffers01 port_buffers02 port_buffers03 port_buffers04 port_buffers05 port_buffers06 port_buffers07 port_buffers08 port_buffers09 port_buffers10 port_buffers11 port_buffers12 port_buffers13 port_buffers14 port_buffers15 port_buffers16 port_buffers17 port_buffers18 port_buffers19 port_buffers20 port_buffers21 port_buffers22 port_buffers23 port_buffers24 port_buffers25 port_buffers26 port_buffers27 port_buffers28 port_buffers29 port_buffers30 port_buffers31 port_buffers32 port_buffers33 port_buffers34 port_buffers35 port_buffers36 port_buffers37 port_buffers38 port_buffers39 port_buffers40 port_buffers41 port_buffers42 port_buffers43 port_buffers44 port_buffers45 port_buffers46 port_buffers47 port_buffers48 port_buffers49 port_buffers50 port_buffers51 port_buffers52 port_buffers53 port_buffers54 port_buffers55 port_buffers56 port_buffers57 port_buffers58 port_buffers59 port_buffers60 port_buffers61 port_buffers62 port_buffers63 port_buffers64 port_buffers65 port_buffers66 port_buffers67 port_buffers68 port_buffers69 port_buffers70 port_buffers71 port_buffers72 port_buffers73 port_buffers74 port_buffers75 port_buffers76 port_buffers77 port_buffers78 port_buffers79
+clk_domain=system.ruby.clk_domain
+eventq_index=0
+port_buffers=system.ruby.network.ext_links0.int_node.port_buffers00 system.ruby.network.ext_links0.int_node.port_buffers01 system.ruby.network.ext_links0.int_node.port_buffers02 system.ruby.network.ext_links0.int_node.port_buffers03 system.ruby.network.ext_links0.int_node.port_buffers04 system.ruby.network.ext_links0.int_node.port_buffers05 system.ruby.network.ext_links0.int_node.port_buffers06 system.ruby.network.ext_links0.int_node.port_buffers07 system.ruby.network.ext_links0.int_node.port_buffers08 system.ruby.network.ext_links0.int_node.port_buffers09 system.ruby.network.ext_links0.int_node.port_buffers10 system.ruby.network.ext_links0.int_node.port_buffers11 system.ruby.network.ext_links0.int_node.port_buffers12 system.ruby.network.ext_links0.int_node.port_buffers13 system.ruby.network.ext_links0.int_node.port_buffers14 system.ruby.network.ext_links0.int_node.port_buffers15 system.ruby.network.ext_links0.int_node.port_buffers16 system.ruby.network.ext_links0.int_node.port_buffers17 system.ruby.network.ext_links0.int_node.port_buffers18 system.ruby.network.ext_links0.int_node.port_buffers19 system.ruby.network.ext_links0.int_node.port_buffers20 system.ruby.network.ext_links0.int_node.port_buffers21 system.ruby.network.ext_links0.int_node.port_buffers22 system.ruby.network.ext_links0.int_node.port_buffers23 system.ruby.network.ext_links0.int_node.port_buffers24 system.ruby.network.ext_links0.int_node.port_buffers25 system.ruby.network.ext_links0.int_node.port_buffers26 system.ruby.network.ext_links0.int_node.port_buffers27 system.ruby.network.ext_links0.int_node.port_buffers28 system.ruby.network.ext_links0.int_node.port_buffers29 system.ruby.network.ext_links0.int_node.port_buffers30 system.ruby.network.ext_links0.int_node.port_buffers31 system.ruby.network.ext_links0.int_node.port_buffers32 system.ruby.network.ext_links0.int_node.port_buffers33 system.ruby.network.ext_links0.int_node.port_buffers34 system.ruby.network.ext_links0.int_node.port_buffers35 system.ruby.network.ext_links0.int_node.port_buffers36 system.ruby.network.ext_links0.int_node.port_buffers37 system.ruby.network.ext_links0.int_node.port_buffers38 system.ruby.network.ext_links0.int_node.port_buffers39 system.ruby.network.ext_links0.int_node.port_buffers40 system.ruby.network.ext_links0.int_node.port_buffers41 system.ruby.network.ext_links0.int_node.port_buffers42 system.ruby.network.ext_links0.int_node.port_buffers43 system.ruby.network.ext_links0.int_node.port_buffers44 system.ruby.network.ext_links0.int_node.port_buffers45 system.ruby.network.ext_links0.int_node.port_buffers46 system.ruby.network.ext_links0.int_node.port_buffers47 system.ruby.network.ext_links0.int_node.port_buffers48 system.ruby.network.ext_links0.int_node.port_buffers49 system.ruby.network.ext_links0.int_node.port_buffers50 system.ruby.network.ext_links0.int_node.port_buffers51 system.ruby.network.ext_links0.int_node.port_buffers52 system.ruby.network.ext_links0.int_node.port_buffers53 system.ruby.network.ext_links0.int_node.port_buffers54 system.ruby.network.ext_links0.int_node.port_buffers55 system.ruby.network.ext_links0.int_node.port_buffers56 system.ruby.network.ext_links0.int_node.port_buffers57 system.ruby.network.ext_links0.int_node.port_buffers58 system.ruby.network.ext_links0.int_node.port_buffers59 system.ruby.network.ext_links0.int_node.port_buffers60 system.ruby.network.ext_links0.int_node.port_buffers61 system.ruby.network.ext_links0.int_node.port_buffers62 system.ruby.network.ext_links0.int_node.port_buffers63 system.ruby.network.ext_links0.int_node.port_buffers64 system.ruby.network.ext_links0.int_node.port_buffers65 system.ruby.network.ext_links0.int_node.port_buffers66 system.ruby.network.ext_links0.int_node.port_buffers67 system.ruby.network.ext_links0.int_node.port_buffers68 system.ruby.network.ext_links0.int_node.port_buffers69 system.ruby.network.ext_links0.int_node.port_buffers70 system.ruby.network.ext_links0.int_node.port_buffers71 system.ruby.network.ext_links0.int_node.port_buffers72 system.ruby.network.ext_links0.int_node.port_buffers73 system.ruby.network.ext_links0.int_node.port_buffers74 system.ruby.network.ext_links0.int_node.port_buffers75 system.ruby.network.ext_links0.int_node.port_buffers76 system.ruby.network.ext_links0.int_node.port_buffers77 system.ruby.network.ext_links0.int_node.port_buffers78 system.ruby.network.ext_links0.int_node.port_buffers79
+router_id=0
+virt_nets=10
+
+[system.ruby.network.ext_links0.int_node.port_buffers00]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers01]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers02]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers03]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers04]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers05]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers06]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers07]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers08]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers09]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers10]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers11]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers12]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers13]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers14]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers15]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers16]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers17]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers18]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers19]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers20]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers21]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers22]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers23]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers24]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers25]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers26]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers27]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers28]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers29]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers30]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers31]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers32]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers33]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers34]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers35]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers36]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers37]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers38]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers39]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers40]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers41]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers42]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers43]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers44]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers45]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers46]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers47]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers48]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers49]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers50]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers51]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers52]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers53]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers54]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers55]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers56]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers57]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers58]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers59]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers60]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers61]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers62]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers63]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers64]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers65]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers66]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers67]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers68]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers69]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers70]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers71]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers72]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers73]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers74]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers75]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers76]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers77]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers78]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers79]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1]
+type=SimpleExtLink
+children=int_node
+bandwidth_factor=8
+eventq_index=0
+ext_node=system.ruby.cp_cntrl0
+int_node=system.ruby.network.ext_links1.int_node
+latency=1
+link_id=1
+weight=1
+
+[system.ruby.network.ext_links1.int_node]
+type=Switch
+children=port_buffers00 port_buffers01 port_buffers02 port_buffers03 port_buffers04 port_buffers05 port_buffers06 port_buffers07 port_buffers08 port_buffers09 port_buffers10 port_buffers11 port_buffers12 port_buffers13 port_buffers14 port_buffers15 port_buffers16 port_buffers17 port_buffers18 port_buffers19 port_buffers20 port_buffers21 port_buffers22 port_buffers23 port_buffers24 port_buffers25 port_buffers26 port_buffers27 port_buffers28 port_buffers29 port_buffers30 port_buffers31 port_buffers32 port_buffers33 port_buffers34 port_buffers35 port_buffers36 port_buffers37 port_buffers38 port_buffers39 port_buffers40 port_buffers41 port_buffers42 port_buffers43 port_buffers44 port_buffers45 port_buffers46 port_buffers47 port_buffers48 port_buffers49 port_buffers50 port_buffers51 port_buffers52 port_buffers53 port_buffers54 port_buffers55 port_buffers56 port_buffers57 port_buffers58 port_buffers59 port_buffers60 port_buffers61 port_buffers62 port_buffers63 port_buffers64 port_buffers65 port_buffers66 port_buffers67 port_buffers68 port_buffers69
+clk_domain=system.ruby.clk_domain
+eventq_index=0
+port_buffers=system.ruby.network.ext_links1.int_node.port_buffers00 system.ruby.network.ext_links1.int_node.port_buffers01 system.ruby.network.ext_links1.int_node.port_buffers02 system.ruby.network.ext_links1.int_node.port_buffers03 system.ruby.network.ext_links1.int_node.port_buffers04 system.ruby.network.ext_links1.int_node.port_buffers05 system.ruby.network.ext_links1.int_node.port_buffers06 system.ruby.network.ext_links1.int_node.port_buffers07 system.ruby.network.ext_links1.int_node.port_buffers08 system.ruby.network.ext_links1.int_node.port_buffers09 system.ruby.network.ext_links1.int_node.port_buffers10 system.ruby.network.ext_links1.int_node.port_buffers11 system.ruby.network.ext_links1.int_node.port_buffers12 system.ruby.network.ext_links1.int_node.port_buffers13 system.ruby.network.ext_links1.int_node.port_buffers14 system.ruby.network.ext_links1.int_node.port_buffers15 system.ruby.network.ext_links1.int_node.port_buffers16 system.ruby.network.ext_links1.int_node.port_buffers17 system.ruby.network.ext_links1.int_node.port_buffers18 system.ruby.network.ext_links1.int_node.port_buffers19 system.ruby.network.ext_links1.int_node.port_buffers20 system.ruby.network.ext_links1.int_node.port_buffers21 system.ruby.network.ext_links1.int_node.port_buffers22 system.ruby.network.ext_links1.int_node.port_buffers23 system.ruby.network.ext_links1.int_node.port_buffers24 system.ruby.network.ext_links1.int_node.port_buffers25 system.ruby.network.ext_links1.int_node.port_buffers26 system.ruby.network.ext_links1.int_node.port_buffers27 system.ruby.network.ext_links1.int_node.port_buffers28 system.ruby.network.ext_links1.int_node.port_buffers29 system.ruby.network.ext_links1.int_node.port_buffers30 system.ruby.network.ext_links1.int_node.port_buffers31 system.ruby.network.ext_links1.int_node.port_buffers32 system.ruby.network.ext_links1.int_node.port_buffers33 system.ruby.network.ext_links1.int_node.port_buffers34 system.ruby.network.ext_links1.int_node.port_buffers35 system.ruby.network.ext_links1.int_node.port_buffers36 system.ruby.network.ext_links1.int_node.port_buffers37 system.ruby.network.ext_links1.int_node.port_buffers38 system.ruby.network.ext_links1.int_node.port_buffers39 system.ruby.network.ext_links1.int_node.port_buffers40 system.ruby.network.ext_links1.int_node.port_buffers41 system.ruby.network.ext_links1.int_node.port_buffers42 system.ruby.network.ext_links1.int_node.port_buffers43 system.ruby.network.ext_links1.int_node.port_buffers44 system.ruby.network.ext_links1.int_node.port_buffers45 system.ruby.network.ext_links1.int_node.port_buffers46 system.ruby.network.ext_links1.int_node.port_buffers47 system.ruby.network.ext_links1.int_node.port_buffers48 system.ruby.network.ext_links1.int_node.port_buffers49 system.ruby.network.ext_links1.int_node.port_buffers50 system.ruby.network.ext_links1.int_node.port_buffers51 system.ruby.network.ext_links1.int_node.port_buffers52 system.ruby.network.ext_links1.int_node.port_buffers53 system.ruby.network.ext_links1.int_node.port_buffers54 system.ruby.network.ext_links1.int_node.port_buffers55 system.ruby.network.ext_links1.int_node.port_buffers56 system.ruby.network.ext_links1.int_node.port_buffers57 system.ruby.network.ext_links1.int_node.port_buffers58 system.ruby.network.ext_links1.int_node.port_buffers59 system.ruby.network.ext_links1.int_node.port_buffers60 system.ruby.network.ext_links1.int_node.port_buffers61 system.ruby.network.ext_links1.int_node.port_buffers62 system.ruby.network.ext_links1.int_node.port_buffers63 system.ruby.network.ext_links1.int_node.port_buffers64 system.ruby.network.ext_links1.int_node.port_buffers65 system.ruby.network.ext_links1.int_node.port_buffers66 system.ruby.network.ext_links1.int_node.port_buffers67 system.ruby.network.ext_links1.int_node.port_buffers68 system.ruby.network.ext_links1.int_node.port_buffers69
+router_id=1
+virt_nets=10
+
+[system.ruby.network.ext_links1.int_node.port_buffers00]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers01]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers02]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers03]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers04]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers05]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers06]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers07]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers08]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers09]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers10]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers11]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers12]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers13]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers14]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers15]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers16]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers17]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers18]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers19]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers20]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers21]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers22]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers23]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers24]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers25]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers26]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers27]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers28]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers29]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers30]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers31]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers32]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers33]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers34]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers35]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers36]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers37]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers38]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers39]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers40]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers41]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers42]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers43]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers44]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers45]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers46]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers47]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers48]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers49]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers50]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers51]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers52]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers53]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers54]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers55]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers56]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers57]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers58]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers59]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers60]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers61]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers62]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers63]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers64]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers65]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers66]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers67]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers68]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers69]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2]
+type=SimpleExtLink
+children=int_node
+bandwidth_factor=8
+eventq_index=0
+ext_node=system.ruby.tcp_cntrl0
+int_node=system.ruby.network.ext_links2.int_node
+latency=1
+link_id=2
+weight=1
+
+[system.ruby.network.ext_links2.int_node]
+type=Switch
+children=port_buffers00 port_buffers01 port_buffers02 port_buffers03 port_buffers04 port_buffers05 port_buffers06 port_buffers07 port_buffers08 port_buffers09 port_buffers10 port_buffers11 port_buffers12 port_buffers13 port_buffers14 port_buffers15 port_buffers16 port_buffers17 port_buffers18 port_buffers19 port_buffers20 port_buffers21 port_buffers22 port_buffers23 port_buffers24 port_buffers25 port_buffers26 port_buffers27 port_buffers28 port_buffers29 port_buffers30 port_buffers31 port_buffers32 port_buffers33 port_buffers34 port_buffers35 port_buffers36 port_buffers37 port_buffers38 port_buffers39 port_buffers40 port_buffers41 port_buffers42 port_buffers43 port_buffers44 port_buffers45 port_buffers46 port_buffers47 port_buffers48 port_buffers49 port_buffers50 port_buffers51 port_buffers52 port_buffers53 port_buffers54 port_buffers55 port_buffers56 port_buffers57 port_buffers58 port_buffers59 port_buffers60 port_buffers61 port_buffers62 port_buffers63 port_buffers64 port_buffers65 port_buffers66 port_buffers67 port_buffers68 port_buffers69
+clk_domain=system.ruby.clk_domain
+eventq_index=0
+port_buffers=system.ruby.network.ext_links2.int_node.port_buffers00 system.ruby.network.ext_links2.int_node.port_buffers01 system.ruby.network.ext_links2.int_node.port_buffers02 system.ruby.network.ext_links2.int_node.port_buffers03 system.ruby.network.ext_links2.int_node.port_buffers04 system.ruby.network.ext_links2.int_node.port_buffers05 system.ruby.network.ext_links2.int_node.port_buffers06 system.ruby.network.ext_links2.int_node.port_buffers07 system.ruby.network.ext_links2.int_node.port_buffers08 system.ruby.network.ext_links2.int_node.port_buffers09 system.ruby.network.ext_links2.int_node.port_buffers10 system.ruby.network.ext_links2.int_node.port_buffers11 system.ruby.network.ext_links2.int_node.port_buffers12 system.ruby.network.ext_links2.int_node.port_buffers13 system.ruby.network.ext_links2.int_node.port_buffers14 system.ruby.network.ext_links2.int_node.port_buffers15 system.ruby.network.ext_links2.int_node.port_buffers16 system.ruby.network.ext_links2.int_node.port_buffers17 system.ruby.network.ext_links2.int_node.port_buffers18 system.ruby.network.ext_links2.int_node.port_buffers19 system.ruby.network.ext_links2.int_node.port_buffers20 system.ruby.network.ext_links2.int_node.port_buffers21 system.ruby.network.ext_links2.int_node.port_buffers22 system.ruby.network.ext_links2.int_node.port_buffers23 system.ruby.network.ext_links2.int_node.port_buffers24 system.ruby.network.ext_links2.int_node.port_buffers25 system.ruby.network.ext_links2.int_node.port_buffers26 system.ruby.network.ext_links2.int_node.port_buffers27 system.ruby.network.ext_links2.int_node.port_buffers28 system.ruby.network.ext_links2.int_node.port_buffers29 system.ruby.network.ext_links2.int_node.port_buffers30 system.ruby.network.ext_links2.int_node.port_buffers31 system.ruby.network.ext_links2.int_node.port_buffers32 system.ruby.network.ext_links2.int_node.port_buffers33 system.ruby.network.ext_links2.int_node.port_buffers34 system.ruby.network.ext_links2.int_node.port_buffers35 
system.ruby.network.ext_links2.int_node.port_buffers36 system.ruby.network.ext_links2.int_node.port_buffers37 system.ruby.network.ext_links2.int_node.port_buffers38 system.ruby.network.ext_links2.int_node.port_buffers39 system.ruby.network.ext_links2.int_node.port_buffers40 system.ruby.network.ext_links2.int_node.port_buffers41 system.ruby.network.ext_links2.int_node.port_buffers42 system.ruby.network.ext_links2.int_node.port_buffers43 system.ruby.network.ext_links2.int_node.port_buffers44 system.ruby.network.ext_links2.int_node.port_buffers45 system.ruby.network.ext_links2.int_node.port_buffers46 system.ruby.network.ext_links2.int_node.port_buffers47 system.ruby.network.ext_links2.int_node.port_buffers48 system.ruby.network.ext_links2.int_node.port_buffers49 system.ruby.network.ext_links2.int_node.port_buffers50 system.ruby.network.ext_links2.int_node.port_buffers51 system.ruby.network.ext_links2.int_node.port_buffers52 system.ruby.network.ext_links2.int_node.port_buffers53 system.ruby.network.ext_links2.int_node.port_buffers54 system.ruby.network.ext_links2.int_node.port_buffers55 system.ruby.network.ext_links2.int_node.port_buffers56 system.ruby.network.ext_links2.int_node.port_buffers57 system.ruby.network.ext_links2.int_node.port_buffers58 system.ruby.network.ext_links2.int_node.port_buffers59 system.ruby.network.ext_links2.int_node.port_buffers60 system.ruby.network.ext_links2.int_node.port_buffers61 system.ruby.network.ext_links2.int_node.port_buffers62 system.ruby.network.ext_links2.int_node.port_buffers63 system.ruby.network.ext_links2.int_node.port_buffers64 system.ruby.network.ext_links2.int_node.port_buffers65 system.ruby.network.ext_links2.int_node.port_buffers66 system.ruby.network.ext_links2.int_node.port_buffers67 system.ruby.network.ext_links2.int_node.port_buffers68 system.ruby.network.ext_links2.int_node.port_buffers69
+router_id=2
+virt_nets=10
+
+[system.ruby.network.ext_links2.int_node.port_buffers00]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers01]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers02]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers03]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers04]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers05]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers06]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers07]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers08]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers09]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers10]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers11]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers12]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers13]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers14]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers15]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers16]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers17]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers18]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers19]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers20]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers21]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers22]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers23]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers24]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers25]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers26]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers27]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers28]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers29]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers30]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers31]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers32]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers33]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers34]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers35]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers36]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers37]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers38]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers39]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers40]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers41]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers42]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers43]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers44]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers45]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers46]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers47]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers48]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers49]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers50]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers51]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers52]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers53]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers54]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers55]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers56]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers57]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers58]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers59]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers60]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers61]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers62]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers63]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers64]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers65]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers66]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers67]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers68]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers69]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links3]
+type=SimpleExtLink
+bandwidth_factor=8
+eventq_index=0
+ext_node=system.ruby.tcp_cntrl1
+int_node=system.ruby.network.ext_links2.int_node
+latency=1
+link_id=3
+weight=1
+
+[system.ruby.network.ext_links4]
+type=SimpleExtLink
+bandwidth_factor=8
+eventq_index=0
+ext_node=system.ruby.sqc_cntrl0
+int_node=system.ruby.network.ext_links2.int_node
+latency=1
+link_id=4
+weight=1
+
+[system.ruby.network.ext_links5]
+type=SimpleExtLink
+bandwidth_factor=8
+eventq_index=0
+ext_node=system.ruby.tcc_cntrl0
+int_node=system.ruby.network.ext_links2.int_node
+latency=1
+link_id=5
+weight=1
+
+[system.ruby.network.int_link_buffers00]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers01]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers02]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers03]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers04]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers05]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers06]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers07]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers08]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers09]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers10]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers11]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers12]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers13]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers14]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers15]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers16]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers17]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers18]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers19]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers20]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers21]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers22]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers23]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers24]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers25]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers26]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers27]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers28]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers29]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers30]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers31]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers32]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers33]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers34]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers35]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers36]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers37]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers38]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers39]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_links0]
+type=SimpleIntLink
+bandwidth_factor=8
+eventq_index=0
+latency=1
+link_id=0
+node_a=system.ruby.network.ext_links0.int_node
+node_b=system.ruby.network.ext_links1.int_node
+weight=1
+
+[system.ruby.network.int_links1]
+type=SimpleIntLink
+bandwidth_factor=8
+eventq_index=0
+latency=1
+link_id=1
+node_a=system.ruby.network.ext_links0.int_node
+node_b=system.ruby.network.ext_links2.int_node
+weight=1
+
+[system.ruby.phys_mem]
+type=SimpleMemory
+bandwidth=73.000000
+clk_domain=system.ruby.clk_domain
+conf_table_reported=true
+eventq_index=0
+in_addr_map=false
+latency=30000
+latency_var=0
+null=false
+range=0:536870911
+
+[system.ruby.sqc_cntrl0]
+type=SQC_Controller
+children=L1cache mandatoryQueue probeToSQC requestFromSQC responseToSQC sequencer
+L1cache=system.ruby.sqc_cntrl0.L1cache
+TCC_select_num_bits=0
+buffer_size=0
+clk_domain=system.ruby.clk_domain
+cluster_id=0
+eventq_index=0
+issue_latency=80
+l2_hit_latency=18
+mandatoryQueue=system.ruby.sqc_cntrl0.mandatoryQueue
+number_of_TBEs=256
+probeToSQC=system.ruby.sqc_cntrl0.probeToSQC
+recycle_latency=10
+requestFromSQC=system.ruby.sqc_cntrl0.requestFromSQC
+responseToSQC=system.ruby.sqc_cntrl0.responseToSQC
+ruby_system=system.ruby
+sequencer=system.ruby.sqc_cntrl0.sequencer
+system=system
+transitions_per_cycle=32
+version=0
+
+[system.ruby.sqc_cntrl0.L1cache]
+type=RubyCache
+children=replacement_policy
+assoc=8
+block_size=0
+dataAccessLatency=1
+dataArrayBanks=8
+eventq_index=0
+is_icache=false
+replacement_policy=system.ruby.sqc_cntrl0.L1cache.replacement_policy
+resourceStalls=true
+ruby_system=system.ruby
+size=32768
+start_index_bit=6
+tagAccessLatency=1
+tagArrayBanks=8
+
+[system.ruby.sqc_cntrl0.L1cache.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=8
+block_size=64
+eventq_index=0
+size=32768
+
+[system.ruby.sqc_cntrl0.mandatoryQueue]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+
+[system.ruby.sqc_cntrl0.probeToSQC]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[9]
+
+[system.ruby.sqc_cntrl0.requestFromSQC]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[11]
+
+[system.ruby.sqc_cntrl0.responseToSQC]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[10]
+
+[system.ruby.sqc_cntrl0.sequencer]
+type=RubySequencer
+clk_domain=system.ruby.clk_domain
+coreid=99
+dcache=system.ruby.sqc_cntrl0.L1cache
+dcache_hit_latency=1
+deadlock_threshold=500000
+eventq_index=0
+icache=system.ruby.sqc_cntrl0.L1cache
+icache_hit_latency=1
+is_cpu_sequencer=false
+max_outstanding_requests=16
+no_retry_on_stall=false
+ruby_system=system.ruby
+support_data_reqs=false
+support_inst_reqs=true
+system=system
+using_network_tester=false
+using_ruby_tester=false
+version=6
+slave=system.cpu1.CUs0.sqc_port system.cpu1.CUs1.sqc_port
+
+[system.ruby.tcc_cntrl0]
+type=TCC_Controller
+children=L2cache probeFromNB requestFromTCP requestToNB responseFromNB responseToCore responseToNB triggerQueue unblockToNB
+L2cache=system.ruby.tcc_cntrl0.L2cache
+WB=false
+buffer_size=0
+clk_domain=system.ruby.clk_domain
+cluster_id=0
+eventq_index=0
+l2_request_latency=120
+l2_response_latency=16
+number_of_TBEs=5120
+probeFromNB=system.ruby.tcc_cntrl0.probeFromNB
+recycle_latency=10
+requestFromTCP=system.ruby.tcc_cntrl0.requestFromTCP
+requestToNB=system.ruby.tcc_cntrl0.requestToNB
+responseFromNB=system.ruby.tcc_cntrl0.responseFromNB
+responseToCore=system.ruby.tcc_cntrl0.responseToCore
+responseToNB=system.ruby.tcc_cntrl0.responseToNB
+ruby_system=system.ruby
+system=system
+transitions_per_cycle=32
+triggerQueue=system.ruby.tcc_cntrl0.triggerQueue
+unblockToNB=system.ruby.tcc_cntrl0.unblockToNB
+version=0
+
+[system.ruby.tcc_cntrl0.L2cache]
+type=RubyCache
+children=replacement_policy
+assoc=16
+block_size=0
+dataAccessLatency=8
+dataArrayBanks=256
+eventq_index=0
+is_icache=false
+replacement_policy=system.ruby.tcc_cntrl0.L2cache.replacement_policy
+resourceStalls=true
+ruby_system=system.ruby
+size=262144
+start_index_bit=6
+tagAccessLatency=2
+tagArrayBanks=256
+
+[system.ruby.tcc_cntrl0.L2cache.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=16
+block_size=64
+eventq_index=0
+size=262144
+
+[system.ruby.tcc_cntrl0.probeFromNB]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+slave=system.ruby.network.master[12]
+
+[system.ruby.tcc_cntrl0.requestFromTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[11]
+
+[system.ruby.tcc_cntrl0.requestToNB]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[13]
+
+[system.ruby.tcc_cntrl0.responseFromNB]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+slave=system.ruby.network.master[13]
+
+[system.ruby.tcc_cntrl0.responseToCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[12]
+
+[system.ruby.tcc_cntrl0.responseToNB]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+master=system.ruby.network.slave[14]
+
+[system.ruby.tcc_cntrl0.triggerQueue]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.tcc_cntrl0.unblockToNB]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+master=system.ruby.network.slave[15]
+
+[system.ruby.tcp_cntrl0]
+type=TCP_Controller
+children=L1cache coalescer mandatoryQueue probeToTCP requestFromTCP responseFromTCP responseToTCP sequencer unblockFromCore
+L1cache=system.ruby.tcp_cntrl0.L1cache
+TCC_select_num_bits=0
+WB=false
+buffer_size=0
+clk_domain=system.ruby.clk_domain
+cluster_id=0
+coalescer=system.ruby.tcp_cntrl0.coalescer
+disableL1=false
+eventq_index=0
+issue_latency=1
+l2_hit_latency=18
+mandatoryQueue=system.ruby.tcp_cntrl0.mandatoryQueue
+number_of_TBEs=2560
+probeToTCP=system.ruby.tcp_cntrl0.probeToTCP
+recycle_latency=10
+requestFromTCP=system.ruby.tcp_cntrl0.requestFromTCP
+responseFromTCP=system.ruby.tcp_cntrl0.responseFromTCP
+responseToTCP=system.ruby.tcp_cntrl0.responseToTCP
+ruby_system=system.ruby
+sequencer=system.ruby.tcp_cntrl0.sequencer
+system=system
+transitions_per_cycle=32
+unblockFromCore=system.ruby.tcp_cntrl0.unblockFromCore
+use_seq_not_coal=false
+version=0
+
+[system.ruby.tcp_cntrl0.L1cache]
+type=RubyCache
+children=replacement_policy
+assoc=16
+block_size=0
+dataAccessLatency=4
+dataArrayBanks=16
+eventq_index=0
+is_icache=false
+replacement_policy=system.ruby.tcp_cntrl0.L1cache.replacement_policy
+resourceStalls=true
+ruby_system=system.ruby
+size=16384
+start_index_bit=6
+tagAccessLatency=4
+tagArrayBanks=16
+
+[system.ruby.tcp_cntrl0.L1cache.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=16
+block_size=64
+eventq_index=0
+size=16384
+
+[system.ruby.tcp_cntrl0.coalescer]
+type=VIPERCoalescer
+assume_rfo=false
+clk_domain=system.ruby.clk_domain
+coreid=99
+dcache=system.ruby.tcp_cntrl0.L1cache
+dcache_hit_latency=1
+deadlock_threshold=500000
+eventq_index=0
+icache=system.ruby.tcp_cntrl0.L1cache
+icache_hit_latency=1
+is_cpu_sequencer=false
+max_inv_per_cycle=32
+max_outstanding_requests=2560
+max_wb_per_cycle=32
+no_retry_on_stall=false
+ruby_system=system.ruby
+support_data_reqs=true
+support_inst_reqs=false
+system=system
+using_network_tester=false
+using_ruby_tester=false
+version=2
+slave=system.cpu1.CUs0.memory_port[0] system.cpu1.CUs0.memory_port[1] system.cpu1.CUs0.memory_port[2] system.cpu1.CUs0.memory_port[3] system.cpu1.CUs0.memory_port[4] system.cpu1.CUs0.memory_port[5] system.cpu1.CUs0.memory_port[6] system.cpu1.CUs0.memory_port[7] system.cpu1.CUs0.memory_port[8] system.cpu1.CUs0.memory_port[9] system.cpu1.CUs0.memory_port[10] system.cpu1.CUs0.memory_port[11] system.cpu1.CUs0.memory_port[12] system.cpu1.CUs0.memory_port[13] system.cpu1.CUs0.memory_port[14] system.cpu1.CUs0.memory_port[15] system.cpu1.CUs0.memory_port[16] system.cpu1.CUs0.memory_port[17] system.cpu1.CUs0.memory_port[18] system.cpu1.CUs0.memory_port[19] system.cpu1.CUs0.memory_port[20] system.cpu1.CUs0.memory_port[21] system.cpu1.CUs0.memory_port[22] system.cpu1.CUs0.memory_port[23] system.cpu1.CUs0.memory_port[24] system.cpu1.CUs0.memory_port[25] system.cpu1.CUs0.memory_port[26] system.cpu1.CUs0.memory_port[27] system.cpu1.CUs0.memory_port[28] system.cpu1.CUs0.memory_port[29] system.cpu1.CUs0.memory_port[30] system.cpu1.CUs0.memory_port[31] system.cpu1.CUs0.memory_port[32] system.cpu1.CUs0.memory_port[33] system.cpu1.CUs0.memory_port[34] system.cpu1.CUs0.memory_port[35] system.cpu1.CUs0.memory_port[36] system.cpu1.CUs0.memory_port[37] system.cpu1.CUs0.memory_port[38] system.cpu1.CUs0.memory_port[39] system.cpu1.CUs0.memory_port[40] system.cpu1.CUs0.memory_port[41] system.cpu1.CUs0.memory_port[42] system.cpu1.CUs0.memory_port[43] system.cpu1.CUs0.memory_port[44] system.cpu1.CUs0.memory_port[45] system.cpu1.CUs0.memory_port[46] system.cpu1.CUs0.memory_port[47] system.cpu1.CUs0.memory_port[48] system.cpu1.CUs0.memory_port[49] system.cpu1.CUs0.memory_port[50] system.cpu1.CUs0.memory_port[51] system.cpu1.CUs0.memory_port[52] system.cpu1.CUs0.memory_port[53] system.cpu1.CUs0.memory_port[54] system.cpu1.CUs0.memory_port[55] system.cpu1.CUs0.memory_port[56] system.cpu1.CUs0.memory_port[57] system.cpu1.CUs0.memory_port[58] system.cpu1.CUs0.memory_port[59] 
system.cpu1.CUs0.memory_port[60] system.cpu1.CUs0.memory_port[61] system.cpu1.CUs0.memory_port[62] system.cpu1.CUs0.memory_port[63]
+
+[system.ruby.tcp_cntrl0.mandatoryQueue]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+
+[system.ruby.tcp_cntrl0.probeToTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[5]
+
+[system.ruby.tcp_cntrl0.requestFromTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[5]
+
+[system.ruby.tcp_cntrl0.responseFromTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[6]
+
+[system.ruby.tcp_cntrl0.responseToTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[6]
+
+[system.ruby.tcp_cntrl0.sequencer]
+type=RubySequencer
+clk_domain=system.ruby.clk_domain
+coreid=99
+dcache=system.ruby.tcp_cntrl0.L1cache
+dcache_hit_latency=1
+deadlock_threshold=500000
+eventq_index=0
+icache=system.ruby.tcp_cntrl0.L1cache
+icache_hit_latency=1
+is_cpu_sequencer=true
+max_outstanding_requests=16
+no_retry_on_stall=false
+ruby_system=system.ruby
+support_data_reqs=true
+support_inst_reqs=true
+system=system
+using_network_tester=false
+using_ruby_tester=false
+version=3
+
+[system.ruby.tcp_cntrl0.unblockFromCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+master=system.ruby.network.slave[7]
+
+[system.ruby.tcp_cntrl1]
+type=TCP_Controller
+children=L1cache coalescer mandatoryQueue probeToTCP requestFromTCP responseFromTCP responseToTCP sequencer unblockFromCore
+L1cache=system.ruby.tcp_cntrl1.L1cache
+TCC_select_num_bits=0
+WB=false
+buffer_size=0
+clk_domain=system.ruby.clk_domain
+cluster_id=0
+coalescer=system.ruby.tcp_cntrl1.coalescer
+disableL1=false
+eventq_index=0
+issue_latency=1
+l2_hit_latency=18
+mandatoryQueue=system.ruby.tcp_cntrl1.mandatoryQueue
+number_of_TBEs=2560
+probeToTCP=system.ruby.tcp_cntrl1.probeToTCP
+recycle_latency=10
+requestFromTCP=system.ruby.tcp_cntrl1.requestFromTCP
+responseFromTCP=system.ruby.tcp_cntrl1.responseFromTCP
+responseToTCP=system.ruby.tcp_cntrl1.responseToTCP
+ruby_system=system.ruby
+sequencer=system.ruby.tcp_cntrl1.sequencer
+system=system
+transitions_per_cycle=32
+unblockFromCore=system.ruby.tcp_cntrl1.unblockFromCore
+use_seq_not_coal=false
+version=1
+
+[system.ruby.tcp_cntrl1.L1cache]
+type=RubyCache
+children=replacement_policy
+assoc=16
+block_size=0
+dataAccessLatency=4
+dataArrayBanks=16
+eventq_index=0
+is_icache=false
+replacement_policy=system.ruby.tcp_cntrl1.L1cache.replacement_policy
+resourceStalls=true
+ruby_system=system.ruby
+size=16384
+start_index_bit=6
+tagAccessLatency=4
+tagArrayBanks=16
+
+[system.ruby.tcp_cntrl1.L1cache.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=16
+block_size=64
+eventq_index=0
+size=16384
+
+[system.ruby.tcp_cntrl1.coalescer]
+type=VIPERCoalescer
+assume_rfo=false
+clk_domain=system.ruby.clk_domain
+coreid=99
+dcache=system.ruby.tcp_cntrl1.L1cache
+dcache_hit_latency=1
+deadlock_threshold=500000
+eventq_index=0
+icache=system.ruby.tcp_cntrl1.L1cache
+icache_hit_latency=1
+is_cpu_sequencer=false
+max_inv_per_cycle=32
+max_outstanding_requests=2560
+max_wb_per_cycle=32
+no_retry_on_stall=false
+ruby_system=system.ruby
+support_data_reqs=true
+support_inst_reqs=false
+system=system
+using_network_tester=false
+using_ruby_tester=false
+version=4
+slave=system.cpu1.CUs1.memory_port[0] system.cpu1.CUs1.memory_port[1] system.cpu1.CUs1.memory_port[2] system.cpu1.CUs1.memory_port[3] system.cpu1.CUs1.memory_port[4] system.cpu1.CUs1.memory_port[5] system.cpu1.CUs1.memory_port[6] system.cpu1.CUs1.memory_port[7] system.cpu1.CUs1.memory_port[8] system.cpu1.CUs1.memory_port[9] system.cpu1.CUs1.memory_port[10] system.cpu1.CUs1.memory_port[11] system.cpu1.CUs1.memory_port[12] system.cpu1.CUs1.memory_port[13] system.cpu1.CUs1.memory_port[14] system.cpu1.CUs1.memory_port[15] system.cpu1.CUs1.memory_port[16] system.cpu1.CUs1.memory_port[17] system.cpu1.CUs1.memory_port[18] system.cpu1.CUs1.memory_port[19] system.cpu1.CUs1.memory_port[20] system.cpu1.CUs1.memory_port[21] system.cpu1.CUs1.memory_port[22] system.cpu1.CUs1.memory_port[23] system.cpu1.CUs1.memory_port[24] system.cpu1.CUs1.memory_port[25] system.cpu1.CUs1.memory_port[26] system.cpu1.CUs1.memory_port[27] system.cpu1.CUs1.memory_port[28] system.cpu1.CUs1.memory_port[29] system.cpu1.CUs1.memory_port[30] system.cpu1.CUs1.memory_port[31] system.cpu1.CUs1.memory_port[32] system.cpu1.CUs1.memory_port[33] system.cpu1.CUs1.memory_port[34] system.cpu1.CUs1.memory_port[35] system.cpu1.CUs1.memory_port[36] system.cpu1.CUs1.memory_port[37] system.cpu1.CUs1.memory_port[38] system.cpu1.CUs1.memory_port[39] system.cpu1.CUs1.memory_port[40] system.cpu1.CUs1.memory_port[41] system.cpu1.CUs1.memory_port[42] system.cpu1.CUs1.memory_port[43] system.cpu1.CUs1.memory_port[44] system.cpu1.CUs1.memory_port[45] system.cpu1.CUs1.memory_port[46] system.cpu1.CUs1.memory_port[47] system.cpu1.CUs1.memory_port[48] system.cpu1.CUs1.memory_port[49] system.cpu1.CUs1.memory_port[50] system.cpu1.CUs1.memory_port[51] system.cpu1.CUs1.memory_port[52] system.cpu1.CUs1.memory_port[53] system.cpu1.CUs1.memory_port[54] system.cpu1.CUs1.memory_port[55] system.cpu1.CUs1.memory_port[56] system.cpu1.CUs1.memory_port[57] system.cpu1.CUs1.memory_port[58] system.cpu1.CUs1.memory_port[59] 
system.cpu1.CUs1.memory_port[60] system.cpu1.CUs1.memory_port[61] system.cpu1.CUs1.memory_port[62] system.cpu1.CUs1.memory_port[63]
+
+[system.ruby.tcp_cntrl1.mandatoryQueue]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+
+[system.ruby.tcp_cntrl1.probeToTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[7]
+
+[system.ruby.tcp_cntrl1.requestFromTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[8]
+
+[system.ruby.tcp_cntrl1.responseFromTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[9]
+
+[system.ruby.tcp_cntrl1.responseToTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[8]
+
+[system.ruby.tcp_cntrl1.sequencer]
+type=RubySequencer
+clk_domain=system.ruby.clk_domain
+coreid=99
+dcache=system.ruby.tcp_cntrl1.L1cache
+dcache_hit_latency=1
+deadlock_threshold=500000
+eventq_index=0
+icache=system.ruby.tcp_cntrl1.L1cache
+icache_hit_latency=1
+is_cpu_sequencer=true
+max_outstanding_requests=16
+no_retry_on_stall=false
+ruby_system=system.ruby
+support_data_reqs=true
+support_inst_reqs=true
+system=system
+using_network_tester=false
+using_ruby_tester=false
+version=5
+
+[system.ruby.tcp_cntrl1.unblockFromCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+master=system.ruby.network.slave[10]
+
+[system.sqc_coalescer]
+type=TLBCoalescer
+children=clk_domain
+clk_domain=system.sqc_coalescer.clk_domain
+coalescingWindow=1
+disableCoalescing=false
+eventq_index=0
+probesPerCycle=2
+master=system.sqc_tlb.slave[0]
+slave=system.cpu1.CUs0.sqc_tlb_port system.cpu1.CUs1.sqc_tlb_port
+
+[system.sqc_coalescer.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.sqc_coalescer.clk_domain.voltage_domain
+
+[system.sqc_coalescer.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.sqc_tlb]
+type=X86GPUTLB
+children=clk_domain
+accessDistance=false
+allocationPolicy=true
+assoc=32
+clk_domain=system.sqc_tlb.clk_domain
+eventq_index=0
+hitLatency=1
+maxOutstandingReqs=64
+missLatency1=5
+missLatency2=750
+size=32
+master=system.l2_coalescer.slave[0]
+slave=system.sqc_coalescer.master[0]
+
+[system.sqc_tlb.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.sqc_tlb.clk_domain.voltage_domain
+
+[system.sqc_tlb.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.sys_port_proxy]
+type=RubyPortProxy
+clk_domain=system.clk_domain
+eventq_index=0
+is_cpu_sequencer=true
+no_retry_on_stall=false
+ruby_system=system.ruby
+support_data_reqs=true
+support_inst_reqs=true
+system=system
+using_ruby_tester=false
+version=0
+slave=system.system_port
+
+[system.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
diff --git a/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER/simerr b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER/simerr
new file mode 100755
index 000000000..1e2b8911e
--- /dev/null
+++ b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER/simerr
@@ -0,0 +1,5 @@
+warn: system.ruby.network adopting orphan SimObject param 'int_links'
+warn: system.ruby.network adopting orphan SimObject param 'ext_links'
+warn: DRAM device capacity (8192 Mbytes) does not match the address range assigned (512 Mbytes)
+warn: Sockets disabled, not accepting gdb connections
+warn: Replacement policy updates recently became the responsibility of SLICC state machines. Make sure to setMRU() near callbacks in .sm files!
diff --git a/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER/simout b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER/simout
new file mode 100755
index 000000000..3b7ae46db
--- /dev/null
+++ b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER/simout
@@ -0,0 +1,21 @@
+gem5 Simulator System. http://gem5.org
+gem5 is copyrighted software; use the --copyright option for details.
+
+gem5 compiled Jan 19 2016 13:36:44
+gem5 started Jan 19 2016 13:37:09
+gem5 executing on zizzer, pid 49676
+command line: build/HSAIL_X86/gem5.opt -d build/HSAIL_X86/tests/opt/quick/se/04.gpu/x86/linux/gpu-ruby-GPU_VIPER -re /z/atgutier/gem5/gem5-commit/tests/run.py build/HSAIL_X86/tests/opt/quick/se/04.gpu/x86/linux/gpu-ruby-GPU_VIPER
+
+Using GPU kernel code file(s) /dist/m5/regression/test-progs/gpu-hello/bin/x86/linux/gpu-hello-kernel.asm
+Global frequency set at 1000000000000 ticks per second
+Forcing maxCoalescedReqs to 32 (TLB assoc.)
+Forcing maxCoalescedReqs to 32 (TLB assoc.)
+Forcing maxCoalescedReqs to 32 (TLB assoc.)
+Forcing maxCoalescedReqs to 32 (TLB assoc.)
+Forcing maxCoalescedReqs to 32 (TLB assoc.)
+Forcing maxCoalescedReqs to 32 (TLB assoc.)
+info: Entering event queue @ 0. Starting simulation...
+keys = 0x7b2bc0, &keys = 0x798998, keys[0] = 23
+the gpu says:
+elloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloe
+Exiting @ tick 314399500 because target called exit()
diff --git a/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER/stats.txt b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER/stats.txt
new file mode 100644
index 000000000..7e23ea73c
--- /dev/null
+++ b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER/stats.txt
@@ -0,0 +1,3201 @@
+
+---------- Begin Simulation Statistics ----------
+sim_seconds 0.000314 # Number of seconds simulated
+sim_ticks 314399500 # Number of ticks simulated
+final_tick 314399500 # Number of ticks from beginning of simulation (restored from checkpoints and never reset)
+sim_freq 1000000000000 # Frequency of simulated ticks
+host_inst_rate 59851 # Simulator instruction rate (inst/s)
+host_op_rate 123077 # Simulator op (including micro ops) rate (op/s)
+host_tick_rate 280996968 # Simulator tick rate (ticks/s)
+host_mem_usage 1296852 # Number of bytes of host memory used
+host_seconds 1.12 # Real time elapsed on the host
+sim_insts 66963 # Number of instructions simulated
+sim_ops 137705 # Number of ops (including micro ops) simulated
+system.voltage_domain.voltage 1 # Voltage in Volts
+system.clk_domain.clock 1000 # Clock period in ticks
+system.mem_ctrls.bytes_read::ruby.dir_cntrl0 99840 # Number of bytes read from this memory
+system.mem_ctrls.bytes_read::total 99840 # Number of bytes read from this memory
+system.mem_ctrls.num_reads::ruby.dir_cntrl0 1560 # Number of read requests responded to by this memory
+system.mem_ctrls.num_reads::total 1560 # Number of read requests responded to by this memory
+system.mem_ctrls.bw_read::ruby.dir_cntrl0 317557757 # Total read bandwidth from this memory (bytes/s)
+system.mem_ctrls.bw_read::total 317557757 # Total read bandwidth from this memory (bytes/s)
+system.mem_ctrls.bw_total::ruby.dir_cntrl0 317557757 # Total bandwidth to/from this memory (bytes/s)
+system.mem_ctrls.bw_total::total 317557757 # Total bandwidth to/from this memory (bytes/s)
+system.mem_ctrls.readReqs 1560 # Number of read requests accepted
+system.mem_ctrls.writeReqs 0 # Number of write requests accepted
+system.mem_ctrls.readBursts 1560 # Number of DRAM read bursts, including those serviced by the write queue
+system.mem_ctrls.writeBursts 0 # Number of DRAM write bursts, including those merged in the write queue
+system.mem_ctrls.bytesReadDRAM 99840 # Total number of bytes read from DRAM
+system.mem_ctrls.bytesReadWrQ 0 # Total number of bytes read from write queue
+system.mem_ctrls.bytesWritten 0 # Total number of bytes written to DRAM
+system.mem_ctrls.bytesReadSys 99840 # Total read bytes from the system interface side
+system.mem_ctrls.bytesWrittenSys 0 # Total written bytes from the system interface side
+system.mem_ctrls.servicedByWrQ 0 # Number of DRAM read bursts serviced by the write queue
+system.mem_ctrls.mergedWrBursts 0 # Number of DRAM write bursts merged with an existing one
+system.mem_ctrls.neitherReadNorWriteReqs 0 # Number of requests that are neither read nor write
+system.mem_ctrls.perBankRdBursts::0 122 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::1 192 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::2 93 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::3 44 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::4 61 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::5 79 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::6 52 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::7 42 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::8 54 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::9 56 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::10 182 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::11 90 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::12 223 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::13 125 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::14 51 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::15 94 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::0 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::1 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::2 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::3 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::4 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::5 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::6 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::7 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::8 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::9 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::10 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::11 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::12 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::13 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::14 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::15 0 # Per bank write bursts
+system.mem_ctrls.numRdRetry 0 # Number of times read queue was full causing retry
+system.mem_ctrls.numWrRetry 0 # Number of times write queue was full causing retry
+system.mem_ctrls.totGap 314257000 # Total gap between requests
+system.mem_ctrls.readPktSize::0 0 # Read request sizes (log2)
+system.mem_ctrls.readPktSize::1 0 # Read request sizes (log2)
+system.mem_ctrls.readPktSize::2 0 # Read request sizes (log2)
+system.mem_ctrls.readPktSize::3 0 # Read request sizes (log2)
+system.mem_ctrls.readPktSize::4 0 # Read request sizes (log2)
+system.mem_ctrls.readPktSize::5 0 # Read request sizes (log2)
+system.mem_ctrls.readPktSize::6 1560 # Read request sizes (log2)
+system.mem_ctrls.writePktSize::0 0 # Write request sizes (log2)
+system.mem_ctrls.writePktSize::1 0 # Write request sizes (log2)
+system.mem_ctrls.writePktSize::2 0 # Write request sizes (log2)
+system.mem_ctrls.writePktSize::3 0 # Write request sizes (log2)
+system.mem_ctrls.writePktSize::4 0 # Write request sizes (log2)
+system.mem_ctrls.writePktSize::5 0 # Write request sizes (log2)
+system.mem_ctrls.writePktSize::6 0 # Write request sizes (log2)
+system.mem_ctrls.rdQLenPdf::0 1544 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::1 3 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::2 2 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::3 2 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::4 4 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::5 3 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::6 1 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::7 1 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::8 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::9 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::10 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::11 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::12 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::13 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::14 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::15 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::16 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::17 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::18 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::19 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::20 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::21 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::22 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::23 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::24 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::25 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::26 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::27 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::28 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::29 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::30 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::31 0 # What read queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::0 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::1 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::2 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::3 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::4 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::5 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::6 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::7 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::8 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::9 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::10 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::11 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::12 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::13 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::14 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::15 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::16 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::17 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::18 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::19 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::20 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::21 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::22 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::23 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::24 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::25 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::26 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::27 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::28 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::29 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::30 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::31 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::32 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::33 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::34 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::35 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::36 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::37 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::38 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::39 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::40 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::41 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::42 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::43 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::44 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::45 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::46 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::47 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::48 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::49 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::50 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::51 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::52 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::53 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::54 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::55 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::56 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::57 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::58 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::59 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::60 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::61 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::62 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::63 0 # What write queue length does an incoming req see
+system.mem_ctrls.bytesPerActivate::samples 398 # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::mean 247.798995 # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::gmean 164.777646 # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::stdev 248.151006 # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::0-127 138 34.67% 34.67% # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::128-255 115 28.89% 63.57% # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::256-383 55 13.82% 77.39% # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::384-511 30 7.54% 84.92% # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::512-639 19 4.77% 89.70% # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::640-767 13 3.27% 92.96% # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::768-895 7 1.76% 94.72% # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::896-1023 7 1.76% 96.48% # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::1024-1151 14 3.52% 100.00% # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::total 398 # Bytes accessed per row activation
+system.mem_ctrls.totQLat 12586250 # Total ticks spent queuing
+system.mem_ctrls.totMemAccLat 41836250 # Total ticks spent from burst creation until serviced by the DRAM
+system.mem_ctrls.totBusLat 7800000 # Total ticks spent in databus transfers
+system.mem_ctrls.avgQLat 8068.11 # Average queueing delay per DRAM burst
+system.mem_ctrls.avgBusLat 5000.00 # Average bus latency per DRAM burst
+system.mem_ctrls.avgMemAccLat 26818.11 # Average memory access latency per DRAM burst
+system.mem_ctrls.avgRdBW 317.56 # Average DRAM read bandwidth in MiByte/s
+system.mem_ctrls.avgWrBW 0.00 # Average achieved write bandwidth in MiByte/s
+system.mem_ctrls.avgRdBWSys 317.56 # Average system read bandwidth in MiByte/s
+system.mem_ctrls.avgWrBWSys 0.00 # Average system write bandwidth in MiByte/s
+system.mem_ctrls.peakBW 12800.00 # Theoretical peak bandwidth in MiByte/s
+system.mem_ctrls.busUtil 2.48 # Data bus utilization in percentage
+system.mem_ctrls.busUtilRead 2.48 # Data bus utilization in percentage for reads
+system.mem_ctrls.busUtilWrite 0.00 # Data bus utilization in percentage for writes
+system.mem_ctrls.avgRdQLen 1.04 # Average read queue length when enqueuing
+system.mem_ctrls.avgWrQLen 0.00 # Average write queue length when enqueuing
+system.mem_ctrls.readRowHits 1157 # Number of row buffer hits during reads
+system.mem_ctrls.writeRowHits 0 # Number of row buffer hits during writes
+system.mem_ctrls.readRowHitRate 74.17 # Row buffer hit rate for reads
+system.mem_ctrls.writeRowHitRate nan # Row buffer hit rate for writes
+system.mem_ctrls.avgGap 201446.79 # Average gap between requests
+system.mem_ctrls.pageHitRate 74.17 # Row buffer hit rate, read and write combined
+system.mem_ctrls_0.actEnergy 1141560 # Energy for activate commands per rank (pJ)
+system.mem_ctrls_0.preEnergy 622875 # Energy for precharge commands per rank (pJ)
+system.mem_ctrls_0.readEnergy 5335200 # Energy for read commands per rank (pJ)
+system.mem_ctrls_0.writeEnergy 0 # Energy for write commands per rank (pJ)
+system.mem_ctrls_0.refreshEnergy 20342400 # Energy for refresh commands per rank (pJ)
+system.mem_ctrls_0.actBackEnergy 179243055 # Energy for active background per rank (pJ)
+system.mem_ctrls_0.preBackEnergy 29795250 # Energy for precharge background per rank (pJ)
+system.mem_ctrls_0.totalEnergy 236480340 # Total energy per rank (pJ)
+system.mem_ctrls_0.averagePower 758.654968 # Core power per rank (mW)
+system.mem_ctrls_0.memoryStateTime::IDLE 51073000 # Time in different power states
+system.mem_ctrls_0.memoryStateTime::REF 10400000 # Time in different power states
+system.mem_ctrls_0.memoryStateTime::PRE_PDN 0 # Time in different power states
+system.mem_ctrls_0.memoryStateTime::ACT 252847000 # Time in different power states
+system.mem_ctrls_0.memoryStateTime::ACT_PDN 0 # Time in different power states
+system.mem_ctrls_1.actEnergy 1867320 # Energy for activate commands per rank (pJ)
+system.mem_ctrls_1.preEnergy 1018875 # Energy for precharge commands per rank (pJ)
+system.mem_ctrls_1.readEnergy 6684600 # Energy for read commands per rank (pJ)
+system.mem_ctrls_1.writeEnergy 0 # Energy for write commands per rank (pJ)
+system.mem_ctrls_1.refreshEnergy 20342400 # Energy for refresh commands per rank (pJ)
+system.mem_ctrls_1.actBackEnergy 198048780 # Energy for active background per rank (pJ)
+system.mem_ctrls_1.preBackEnergy 13299000 # Energy for precharge background per rank (pJ)
+system.mem_ctrls_1.totalEnergy 241260975 # Total energy per rank (pJ)
+system.mem_ctrls_1.averagePower 773.991771 # Core power per rank (mW)
+system.mem_ctrls_1.memoryStateTime::IDLE 20941500 # Time in different power states
+system.mem_ctrls_1.memoryStateTime::REF 10400000 # Time in different power states
+system.mem_ctrls_1.memoryStateTime::PRE_PDN 0 # Time in different power states
+system.mem_ctrls_1.memoryStateTime::ACT 280382250 # Time in different power states
+system.mem_ctrls_1.memoryStateTime::ACT_PDN 0 # Time in different power states
+system.ruby.clk_domain.clock 500 # Clock period in ticks
+system.ruby.phys_mem.bytes_read::cpu0.inst 696760 # Number of bytes read from this memory
+system.ruby.phys_mem.bytes_read::cpu0.data 119832 # Number of bytes read from this memory
+system.ruby.phys_mem.bytes_read::cpu1.CUs0.ComputeUnit 3280 # Number of bytes read from this memory
+system.ruby.phys_mem.bytes_read::cpu1.CUs1.ComputeUnit 3280 # Number of bytes read from this memory
+system.ruby.phys_mem.bytes_read::total 823152 # Number of bytes read from this memory
+system.ruby.phys_mem.bytes_inst_read::cpu0.inst 696760 # Number of instructions bytes read from this memory
+system.ruby.phys_mem.bytes_inst_read::cpu1.CUs0.ComputeUnit 2000 # Number of instructions bytes read from this memory
+system.ruby.phys_mem.bytes_inst_read::cpu1.CUs1.ComputeUnit 2000 # Number of instructions bytes read from this memory
+system.ruby.phys_mem.bytes_inst_read::total 700760 # Number of instructions bytes read from this memory
+system.ruby.phys_mem.bytes_written::cpu0.data 72767 # Number of bytes written to this memory
+system.ruby.phys_mem.bytes_written::cpu1.CUs0.ComputeUnit 256 # Number of bytes written to this memory
+system.ruby.phys_mem.bytes_written::cpu1.CUs1.ComputeUnit 256 # Number of bytes written to this memory
+system.ruby.phys_mem.bytes_written::total 73279 # Number of bytes written to this memory
+system.ruby.phys_mem.num_reads::cpu0.inst 87095 # Number of read requests responded to by this memory
+system.ruby.phys_mem.num_reads::cpu0.data 16686 # Number of read requests responded to by this memory
+system.ruby.phys_mem.num_reads::cpu1.CUs0.ComputeUnit 555 # Number of read requests responded to by this memory
+system.ruby.phys_mem.num_reads::cpu1.CUs1.ComputeUnit 555 # Number of read requests responded to by this memory
+system.ruby.phys_mem.num_reads::total 104891 # Number of read requests responded to by this memory
+system.ruby.phys_mem.num_writes::cpu0.data 10422 # Number of write requests responded to by this memory
+system.ruby.phys_mem.num_writes::cpu1.CUs0.ComputeUnit 256 # Number of write requests responded to by this memory
+system.ruby.phys_mem.num_writes::cpu1.CUs1.ComputeUnit 256 # Number of write requests responded to by this memory
+system.ruby.phys_mem.num_writes::total 10934 # Number of write requests responded to by this memory
+system.ruby.phys_mem.bw_read::cpu0.inst 2216161285 # Total read bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_read::cpu0.data 381145644 # Total read bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_read::cpu1.CUs0.ComputeUnit 10432587 # Total read bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_read::cpu1.CUs1.ComputeUnit 10432587 # Total read bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_read::total 2618172103 # Total read bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_inst_read::cpu0.inst 2216161285 # Instruction read bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_inst_read::cpu1.CUs0.ComputeUnit 6361333 # Instruction read bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_inst_read::cpu1.CUs1.ComputeUnit 6361333 # Instruction read bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_inst_read::total 2228883952 # Instruction read bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_write::cpu0.data 231447569 # Write bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_write::cpu1.CUs0.ComputeUnit 814251 # Write bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_write::cpu1.CUs1.ComputeUnit 814251 # Write bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_write::total 233076070 # Write bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_total::cpu0.inst 2216161285 # Total bandwidth to/from this memory (bytes/s)
+system.ruby.phys_mem.bw_total::cpu0.data 612593213 # Total bandwidth to/from this memory (bytes/s)
+system.ruby.phys_mem.bw_total::cpu1.CUs0.ComputeUnit 11246837 # Total bandwidth to/from this memory (bytes/s)
+system.ruby.phys_mem.bw_total::cpu1.CUs1.ComputeUnit 11246837 # Total bandwidth to/from this memory (bytes/s)
+system.ruby.phys_mem.bw_total::total 2851248173 # Total bandwidth to/from this memory (bytes/s)
+system.cpu0.clk_domain.clock 500 # Clock period in ticks
+system.cpu0.apic_clk_domain.clock 8000 # Clock period in ticks
+system.cpu0.workload.num_syscalls 21 # Number of system calls
+system.cpu0.numCycles 628799 # number of cpu cycles simulated
+system.cpu0.numWorkItemsStarted 0 # number of work items this cpu started
+system.cpu0.numWorkItemsCompleted 0 # number of work items this cpu completed
+system.cpu0.committedInsts 66963 # Number of instructions committed
+system.cpu0.committedOps 137705 # Number of ops (including micro ops) committed
+system.cpu0.num_int_alu_accesses 136380 # Number of integer alu accesses
+system.cpu0.num_fp_alu_accesses 1279 # Number of float alu accesses
+system.cpu0.num_func_calls 3196 # number of times a function call or return occured
+system.cpu0.num_conditional_control_insts 12151 # number of instructions that are conditional controls
+system.cpu0.num_int_insts 136380 # number of integer instructions
+system.cpu0.num_fp_insts 1279 # number of float instructions
+system.cpu0.num_int_register_reads 257490 # number of times the integer registers were read
+system.cpu0.num_int_register_writes 110039 # number of times the integer registers were written
+system.cpu0.num_fp_register_reads 1981 # number of times the floating registers were read
+system.cpu0.num_fp_register_writes 981 # number of times the floating registers were written
+system.cpu0.num_cc_register_reads 78262 # number of times the CC registers were read
+system.cpu0.num_cc_register_writes 42183 # number of times the CC registers were written
+system.cpu0.num_mem_refs 27198 # number of memory refs
+system.cpu0.num_load_insts 16684 # Number of load instructions
+system.cpu0.num_store_insts 10514 # Number of store instructions
+system.cpu0.num_idle_cycles 8671.003972 # Number of idle cycles
+system.cpu0.num_busy_cycles 620127.996028 # Number of busy cycles
+system.cpu0.not_idle_fraction 0.986210 # Percentage of non-idle cycles
+system.cpu0.idle_fraction 0.013790 # Percentage of idle cycles
+system.cpu0.Branches 16199 # Number of branches fetched
+system.cpu0.op_class::No_OpClass 615 0.45% 0.45% # Class of executed instruction
+system.cpu0.op_class::IntAlu 108791 79.00% 79.45% # Class of executed instruction
+system.cpu0.op_class::IntMult 13 0.01% 79.46% # Class of executed instruction
+system.cpu0.op_class::IntDiv 138 0.10% 79.56% # Class of executed instruction
+system.cpu0.op_class::FloatAdd 950 0.69% 80.25% # Class of executed instruction
+system.cpu0.op_class::FloatCmp 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::FloatCvt 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::FloatMult 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::FloatDiv 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::FloatSqrt 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdAdd 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdAddAcc 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdAlu 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdCmp 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdCvt 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdMisc 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdMult 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdMultAcc 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdShift 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdShiftAcc 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdSqrt 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdFloatAdd 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdFloatAlu 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdFloatCmp 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdFloatCvt 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdFloatDiv 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdFloatMisc 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdFloatMult 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdFloatMultAcc 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdFloatSqrt 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::MemRead 16684 12.12% 92.36% # Class of executed instruction
+system.cpu0.op_class::MemWrite 10514 7.64% 100.00% # Class of executed instruction
+system.cpu0.op_class::IprAccess 0 0.00% 100.00% # Class of executed instruction
+system.cpu0.op_class::InstPrefetch 0 0.00% 100.00% # Class of executed instruction
+system.cpu0.op_class::total 137705 # Class of executed instruction
+system.cpu1.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.cpu1.clk_domain.clock 1000 # Clock period in ticks
+system.cpu1.CUs0.wavefronts00.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts00.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts00.timesBlockedDueRAWDependencies 216 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::samples 39 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::mean 0.794872 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::stdev 0.863880 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::0-1 28 71.79% 71.79% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::2-3 11 28.21% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::total 39 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::samples 39 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::mean 0.589744 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::stdev 0.498310 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::0-1 39 100.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::total 39 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts01.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts01.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts01.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts02.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts02.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts02.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts03.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts03.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts03.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts04.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts04.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts04.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts05.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts05.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts05.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts06.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts06.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts06.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts07.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts07.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts07.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts08.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts08.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts08.timesBlockedDueRAWDependencies 195 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::samples 34 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::mean 0.852941 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::stdev 0.857493 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::0-1 24 70.59% 70.59% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::2-3 10 29.41% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::total 34 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::samples 34 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::mean 0.617647 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::stdev 0.493270 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::0-1 34 100.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::total 34 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts09.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts09.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts09.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts10.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts10.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts10.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts11.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts11.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts11.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts12.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts12.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts12.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts13.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts13.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts13.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts14.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts14.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts14.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts15.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts15.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts15.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts16.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts16.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts16.timesBlockedDueRAWDependencies 194 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::samples 34 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::mean 0.852941 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::stdev 0.857493 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::0-1 24 70.59% 70.59% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::2-3 10 29.41% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::total 34 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::samples 34 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::mean 0.617647 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::stdev 0.493270 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::0-1 34 100.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::total 34 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts17.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts17.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts17.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts18.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts18.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts18.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts19.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts19.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts19.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts20.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts20.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts20.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts21.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts21.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts21.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts22.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts22.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts22.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts23.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts23.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts23.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts24.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts24.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts24.timesBlockedDueRAWDependencies 177 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::samples 34 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::mean 0.852941 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::stdev 0.857493 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::0-1 24 70.59% 70.59% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::2-3 10 29.41% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::total 34 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::samples 34 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::mean 0.617647 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::stdev 0.493270 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::0-1 34 100.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::total 34 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts25.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts25.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts25.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts26.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts26.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts26.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts27.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts27.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts27.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts28.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts28.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts28.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts29.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts29.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts29.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts30.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts30.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts30.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts31.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts31.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts31.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::samples 43 # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::mean 5.813953 # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::stdev 2.683777 # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::underflows 0 0.00% 0.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::1 0 0.00% 0.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::2 8 18.60% 18.60% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::3 8 18.60% 37.21% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::4 1 2.33% 39.53% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::5 0 0.00% 39.53% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::6 1 2.33% 41.86% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::7 0 0.00% 41.86% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::8 25 58.14% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::9 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::10 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::11 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::12 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::13 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::14 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::15 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::16 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::17 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::18 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::19 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::20 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::21 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::22 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::23 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::24 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::25 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::26 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::27 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::28 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::29 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::30 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::31 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::32 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::overflows 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::min_value 2 # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::max_value 8 # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::total 43 # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.ExecStage.num_cycles_with_no_issue 4663 # number of cycles the CU issues nothing
+system.cpu1.CUs0.ExecStage.num_cycles_with_instr_issued 102 # number of cycles the CU issued at least one instruction
+system.cpu1.CUs0.ExecStage.num_cycles_with_instrtype_issue::ALU0 30 # Number of cycles at least one instruction of specific type issued
+system.cpu1.CUs0.ExecStage.num_cycles_with_instrtype_issue::ALU1 29 # Number of cycles at least one instruction of specific type issued
+system.cpu1.CUs0.ExecStage.num_cycles_with_instrtype_issue::ALU2 29 # Number of cycles at least one instruction of specific type issued
+system.cpu1.CUs0.ExecStage.num_cycles_with_instrtype_issue::ALU3 29 # Number of cycles at least one instruction of specific type issued
+system.cpu1.CUs0.ExecStage.num_cycles_with_instrtype_issue::GM 18 # Number of cycles at least one instruction of specific type issued
+system.cpu1.CUs0.ExecStage.num_cycles_with_instrtype_issue::LM 6 # Number of cycles at least one instruction of specific type issued
+system.cpu1.CUs0.ExecStage.num_cycles_with_instr_type_no_issue::ALU0 1993 # Number of cycles no instruction of specific type issued
+system.cpu1.CUs0.ExecStage.num_cycles_with_instr_type_no_issue::ALU1 288 # Number of cycles no instruction of specific type issued
+system.cpu1.CUs0.ExecStage.num_cycles_with_instr_type_no_issue::ALU2 325 # Number of cycles no instruction of specific type issued
+system.cpu1.CUs0.ExecStage.num_cycles_with_instr_type_no_issue::ALU3 248 # Number of cycles no instruction of specific type issued
+system.cpu1.CUs0.ExecStage.num_cycles_with_instr_type_no_issue::GM 341 # Number of cycles no instruction of specific type issued
+system.cpu1.CUs0.ExecStage.num_cycles_with_instr_type_no_issue::LM 27 # Number of cycles no instruction of specific type issued
+system.cpu1.CUs0.ExecStage.spc::samples 4765 # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::mean 0.029591 # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::stdev 0.214321 # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::underflows 0 0.00% 0.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::0 4663 97.86% 97.86% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::1 65 1.36% 99.22% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::2 35 0.73% 99.96% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::3 2 0.04% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::4 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::5 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::6 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::overflows 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::min_value 0 # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::max_value 3 # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::total 4765 # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.num_transitions_active_to_idle 66 # number of CU transitions from active to idle
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::samples 66 # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::mean 61.575758 # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::stdev 253.572448 # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::underflows 0 0.00% 0.00% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::0-4 45 68.18% 68.18% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::5-9 10 15.15% 83.33% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::10-14 0 0.00% 83.33% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::15-19 1 1.52% 84.85% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::20-24 2 3.03% 87.88% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::25-29 1 1.52% 89.39% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::30-34 0 0.00% 89.39% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::35-39 0 0.00% 89.39% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::40-44 0 0.00% 89.39% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::45-49 0 0.00% 89.39% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::50-54 0 0.00% 89.39% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::55-59 0 0.00% 89.39% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::60-64 0 0.00% 89.39% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::65-69 0 0.00% 89.39% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::70-74 0 0.00% 89.39% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::75 0 0.00% 89.39% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::overflows 7 10.61% 100.00% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::min_value 1 # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::max_value 1685 # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::total 66 # duration of idle periods in cycles
+system.cpu1.CUs0.GlobalMemPipeline.load_vrf_bank_conflict_cycles 0 # total number of cycles GM data are delayed before updating the VRF
+system.cpu1.CUs0.LocalMemPipeline.load_vrf_bank_conflict_cycles 0 # total number of cycles LDS data are delayed before updating the VRF
+system.cpu1.CUs0.tlb_requests 769 # number of uncoalesced requests
+system.cpu1.CUs0.tlb_cycles -212991640500 # total number of cycles for all uncoalesced requests
+system.cpu1.CUs0.avg_translation_latency -276972224.317295 # Avg. translation latency for data translations
+system.cpu1.CUs0.TLB_hits_distribution::page_table 769 # TLB hits distribution (0 for page table, x for Lx-TLB
+system.cpu1.CUs0.TLB_hits_distribution::L1_TLB 0 # TLB hits distribution (0 for page table, x for Lx-TLB
+system.cpu1.CUs0.TLB_hits_distribution::L2_TLB 0 # TLB hits distribution (0 for page table, x for Lx-TLB
+system.cpu1.CUs0.TLB_hits_distribution::L3_TLB 0 # TLB hits distribution (0 for page table, x for Lx-TLB
+system.cpu1.CUs0.lds_bank_access_cnt 54 # Total number of LDS bank accesses
+system.cpu1.CUs0.lds_bank_conflicts::samples 6 # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::mean 8 # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::stdev 6.196773 # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::underflows 0 0.00% 0.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::0-1 2 33.33% 33.33% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::2-3 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::4-5 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::6-7 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::8-9 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::10-11 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::12-13 4 66.67% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::14-15 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::16-17 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::18-19 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::20-21 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::22-23 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::24-25 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::26-27 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::28-29 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::30-31 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::32-33 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::34-35 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::36-37 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::38-39 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::40-41 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::42-43 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::44-45 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::46-47 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::48-49 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::50-51 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::52-53 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::54-55 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::56-57 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::58-59 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::60-61 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::62-63 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::64 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::overflows 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::min_value 0 # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::max_value 12 # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::total 6 # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.page_divergence_dist::samples 17 # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::mean 1 # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::stdev 0 # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::underflows 0 0.00% 0.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::1-4 17 100.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::5-8 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::9-12 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::13-16 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::17-20 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::21-24 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::25-28 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::29-32 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::33-36 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::37-40 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::41-44 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::45-48 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::49-52 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::53-56 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::57-60 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::61-64 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::overflows 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::min_value 1 # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::max_value 1 # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::total 17 # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.global_mem_instr_cnt 17 # dynamic global memory instructions count
+system.cpu1.CUs0.local_mem_instr_cnt 6 # dynamic local memory intruction count
+system.cpu1.CUs0.wg_blocked_due_lds_alloc 0 # Workgroup blocked due to LDS capacity
+system.cpu1.CUs0.num_instr_executed 141 # number of instructions executed
+system.cpu1.CUs0.inst_exec_rate::samples 141 # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::mean 81.602837 # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::stdev 244.924445 # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::underflows 0 0.00% 0.00% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::0-1 1 0.71% 0.71% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::2-3 12 8.51% 9.22% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::4-5 57 40.43% 49.65% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::6-7 28 19.86% 69.50% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::8-9 2 1.42% 70.92% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::10 1 0.71% 71.63% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::overflows 40 28.37% 100.00% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::min_value 1 # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::max_value 1686 # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::total 141 # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.num_vec_ops_executed 6769 # number of vec ops executed (e.g. VSZ/inst)
+system.cpu1.CUs0.num_total_cycles 4765 # number of cycles the CU ran for
+system.cpu1.CUs0.vpc 1.420567 # Vector Operations per cycle (this CU only)
+system.cpu1.CUs0.ipc 0.029591 # Instructions per cycle (this CU only)
+system.cpu1.CUs0.warp_execution_dist::samples 141 # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::mean 48.007092 # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::stdev 23.719942 # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::underflows 0 0.00% 0.00% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::1-4 5 3.55% 3.55% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::5-8 0 0.00% 3.55% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::9-12 0 0.00% 3.55% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::13-16 36 25.53% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::17-20 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::21-24 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::25-28 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::29-32 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::33-36 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::37-40 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::41-44 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::45-48 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::49-52 8 5.67% 34.75% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::53-56 0 0.00% 34.75% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::57-60 0 0.00% 34.75% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::61-64 92 65.25% 100.00% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::overflows 0 0.00% 100.00% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::min_value 1 # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::max_value 64 # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::total 141 # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.gmem_lanes_execution_dist::samples 18 # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::mean 37.833333 # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::stdev 27.064737 # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::underflows 0 0.00% 0.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::1-4 1 5.56% 5.56% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::5-8 0 0.00% 5.56% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::9-12 0 0.00% 5.56% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::13-16 8 44.44% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::17-20 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::21-24 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::25-28 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::29-32 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::33-36 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::37-40 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::41-44 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::45-48 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::49-52 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::53-56 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::57-60 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::61-64 9 50.00% 100.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::overflows 0 0.00% 100.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::min_value 1 # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::max_value 64 # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::total 18 # number of active lanes per global memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::samples 6 # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::mean 19.500000 # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::stdev 22.322634 # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::underflows 0 0.00% 0.00% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::1-4 1 16.67% 16.67% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::5-8 0 0.00% 16.67% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::9-12 0 0.00% 16.67% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::13-16 4 66.67% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::17-20 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::21-24 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::25-28 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::29-32 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::33-36 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::37-40 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::41-44 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::45-48 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::49-52 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::53-56 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::57-60 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::61-64 1 16.67% 100.00% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::overflows 0 0.00% 100.00% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::min_value 1 # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::max_value 64 # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::total 6 # number of active lanes per local memory instruction
+system.cpu1.CUs0.num_alu_insts_executed 118 # Number of dynamic non-GM memory insts executed
+system.cpu1.CUs0.times_wg_blocked_due_vgpr_alloc 0 # Number of times WGs are blocked due to VGPR allocation per SIMD
+system.cpu1.CUs0.num_CAS_ops 0 # number of compare and swap operations
+system.cpu1.CUs0.num_failed_CAS_ops 0 # number of compare and swap operations that failed
+system.cpu1.CUs0.num_completed_wfs 4 # number of completed wavefronts
+system.cpu1.CUs1.wavefronts00.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts00.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts00.timesBlockedDueRAWDependencies 216 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::samples 39 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::mean 0.794872 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::stdev 0.863880 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::0-1 28 71.79% 71.79% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::2-3 11 28.21% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::total 39 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::samples 39 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::mean 0.589744 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::stdev 0.498310 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::0-1 39 100.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::total 39 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts01.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts01.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts01.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts02.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts02.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts02.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts03.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts03.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts03.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts04.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts04.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts04.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts05.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts05.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts05.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts06.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts06.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts06.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts07.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts07.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts07.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts08.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts08.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts08.timesBlockedDueRAWDependencies 195 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::samples 34 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::mean 0.852941 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::stdev 0.857493 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::0-1 24 70.59% 70.59% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::2-3 10 29.41% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::total 34 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::samples 34 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::mean 0.617647 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::stdev 0.493270 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::0-1 34 100.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::total 34 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts09.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts09.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts09.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts10.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts10.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts10.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts11.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts11.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts11.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts12.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts12.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts12.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts13.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts13.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts13.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts14.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts14.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts14.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts15.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts15.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts15.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts16.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts16.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts16.timesBlockedDueRAWDependencies 190 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::samples 34 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::mean 0.852941 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::stdev 0.857493 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::0-1 24 70.59% 70.59% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::2-3 10 29.41% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::total 34 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::samples 34 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::mean 0.617647 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::stdev 0.493270 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::0-1 34 100.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::total 34 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts17.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts17.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts17.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts18.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts18.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts18.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts19.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts19.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts19.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts20.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts20.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts20.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts21.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts21.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts21.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts22.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts22.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts22.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts23.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts23.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts23.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts24.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts24.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts24.timesBlockedDueRAWDependencies 176 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::samples 34 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::mean 0.852941 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::stdev 0.857493 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::0-1 24 70.59% 70.59% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::2-3 10 29.41% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::total 34 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::samples 34 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::mean 0.617647 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::stdev 0.493270 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::0-1 34 100.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::total 34 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts25.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts25.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts25.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts26.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts26.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts26.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts27.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts27.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts27.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts28.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts28.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts28.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts29.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts29.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts29.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts30.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts30.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts30.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts31.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts31.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts31.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::samples 43 # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::mean 5.813953 # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::stdev 2.683777 # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::underflows 0 0.00% 0.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::1 0 0.00% 0.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::2 8 18.60% 18.60% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::3 8 18.60% 37.21% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::4 1 2.33% 39.53% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::5 0 0.00% 39.53% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::6 1 2.33% 41.86% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::7 0 0.00% 41.86% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::8 25 58.14% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::9 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::10 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::11 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::12 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::13 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::14 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::15 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::16 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::17 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::18 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::19 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::20 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::21 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::22 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::23 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::24 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::25 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::26 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::27 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::28 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::29 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::30 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::31 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::32 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::overflows 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::min_value 2 # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::max_value 8 # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::total 43 # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.ExecStage.num_cycles_with_no_issue 4667 # number of cycles the CU issues nothing
+system.cpu1.CUs1.ExecStage.num_cycles_with_instr_issued 98 # number of cycles the CU issued at least one instruction
+system.cpu1.CUs1.ExecStage.num_cycles_with_instrtype_issue::ALU0 30 # Number of cycles at least one instruction of specific type issued
+system.cpu1.CUs1.ExecStage.num_cycles_with_instrtype_issue::ALU1 29 # Number of cycles at least one instruction of specific type issued
+system.cpu1.CUs1.ExecStage.num_cycles_with_instrtype_issue::ALU2 29 # Number of cycles at least one instruction of specific type issued
+system.cpu1.CUs1.ExecStage.num_cycles_with_instrtype_issue::ALU3 29 # Number of cycles at least one instruction of specific type issued
+system.cpu1.CUs1.ExecStage.num_cycles_with_instrtype_issue::GM 18 # Number of cycles at least one instruction of specific type issued
+system.cpu1.CUs1.ExecStage.num_cycles_with_instrtype_issue::LM 6 # Number of cycles at least one instruction of specific type issued
+system.cpu1.CUs1.ExecStage.num_cycles_with_instr_type_no_issue::ALU0 2052 # Number of cycles no instruction of specific type issued
+system.cpu1.CUs1.ExecStage.num_cycles_with_instr_type_no_issue::ALU1 327 # Number of cycles no instruction of specific type issued
+system.cpu1.CUs1.ExecStage.num_cycles_with_instr_type_no_issue::ALU2 265 # Number of cycles no instruction of specific type issued
+system.cpu1.CUs1.ExecStage.num_cycles_with_instr_type_no_issue::ALU3 285 # Number of cycles no instruction of specific type issued
+system.cpu1.CUs1.ExecStage.num_cycles_with_instr_type_no_issue::GM 341 # Number of cycles no instruction of specific type issued
+system.cpu1.CUs1.ExecStage.num_cycles_with_instr_type_no_issue::LM 32 # Number of cycles no instruction of specific type issued
+system.cpu1.CUs1.ExecStage.spc::samples 4765 # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::mean 0.029591 # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::stdev 0.218204 # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::underflows 0 0.00% 0.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::0 4667 97.94% 97.94% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::1 57 1.20% 99.14% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::2 39 0.82% 99.96% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::3 2 0.04% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::4 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::5 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::6 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::overflows 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::min_value 0 # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::max_value 3 # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::total 4765 # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.num_transitions_active_to_idle 68 # number of CU transitions from active to idle
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::samples 68 # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::mean 61 # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::stdev 257.808908 # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::underflows 0 0.00% 0.00% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::0-4 49 72.06% 72.06% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::5-9 8 11.76% 83.82% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::10-14 0 0.00% 83.82% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::15-19 2 2.94% 86.76% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::20-24 1 1.47% 88.24% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::25-29 1 1.47% 89.71% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::30-34 0 0.00% 89.71% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::35-39 0 0.00% 89.71% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::40-44 0 0.00% 89.71% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::45-49 0 0.00% 89.71% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::50-54 0 0.00% 89.71% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::55-59 0 0.00% 89.71% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::60-64 0 0.00% 89.71% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::65-69 0 0.00% 89.71% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::70-74 0 0.00% 89.71% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::75 0 0.00% 89.71% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::overflows 7 10.29% 100.00% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::min_value 1 # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::max_value 1764 # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::total 68 # duration of idle periods in cycles
+system.cpu1.CUs1.GlobalMemPipeline.load_vrf_bank_conflict_cycles 0 # total number of cycles GM data are delayed before updating the VRF
+system.cpu1.CUs1.LocalMemPipeline.load_vrf_bank_conflict_cycles 0 # total number of cycles LDS data are delayed before updating the VRF
+system.cpu1.CUs1.tlb_requests 769 # number of uncoalesced requests
+system.cpu1.CUs1.tlb_cycles -212991830500 # total number of cycles for all uncoalesced requests
+system.cpu1.CUs1.avg_translation_latency -276972471.391417 # Avg. translation latency for data translations
+system.cpu1.CUs1.TLB_hits_distribution::page_table 769 # TLB hits distribution (0 for page table, x for Lx-TLB
+system.cpu1.CUs1.TLB_hits_distribution::L1_TLB 0 # TLB hits distribution (0 for page table, x for Lx-TLB
+system.cpu1.CUs1.TLB_hits_distribution::L2_TLB 0 # TLB hits distribution (0 for page table, x for Lx-TLB
+system.cpu1.CUs1.TLB_hits_distribution::L3_TLB 0 # TLB hits distribution (0 for page table, x for Lx-TLB
+system.cpu1.CUs1.lds_bank_access_cnt 53 # Total number of LDS bank accesses
+system.cpu1.CUs1.lds_bank_conflicts::samples 6 # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::mean 7.833333 # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::stdev 6.080022 # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::underflows 0 0.00% 0.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::0-1 2 33.33% 33.33% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::2-3 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::4-5 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::6-7 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::8-9 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::10-11 1 16.67% 50.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::12-13 3 50.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::14-15 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::16-17 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::18-19 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::20-21 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::22-23 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::24-25 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::26-27 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::28-29 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::30-31 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::32-33 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::34-35 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::36-37 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::38-39 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::40-41 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::42-43 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::44-45 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::46-47 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::48-49 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::50-51 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::52-53 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::54-55 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::56-57 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::58-59 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::60-61 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::62-63 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::64 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::overflows 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::min_value 0 # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::max_value 12 # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::total 6 # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.page_divergence_dist::samples 17 # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::mean 1 # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::stdev 0 # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::underflows 0 0.00% 0.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::1-4 17 100.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::5-8 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::9-12 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::13-16 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::17-20 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::21-24 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::25-28 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::29-32 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::33-36 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::37-40 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::41-44 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::45-48 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::49-52 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::53-56 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::57-60 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::61-64 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::overflows 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::min_value 1 # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::max_value 1 # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::total 17 # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.global_mem_instr_cnt 17 # dynamic global memory instructions count
+system.cpu1.CUs1.local_mem_instr_cnt 6 # dynamic local memory intruction count
+system.cpu1.CUs1.wg_blocked_due_lds_alloc 0 # Workgroup blocked due to LDS capacity
+system.cpu1.CUs1.num_instr_executed 141 # number of instructions executed
+system.cpu1.CUs1.inst_exec_rate::samples 141 # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::mean 82.212766 # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::stdev 248.914352 # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::underflows 0 0.00% 0.00% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::0-1 1 0.71% 0.71% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::2-3 12 8.51% 9.22% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::4-5 53 37.59% 46.81% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::6-7 28 19.86% 66.67% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::8-9 5 3.55% 70.21% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::10 1 0.71% 70.92% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::overflows 41 29.08% 100.00% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::min_value 1 # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::max_value 1765 # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::total 141 # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.num_vec_ops_executed 6762 # number of vec ops executed (e.g. VSZ/inst)
+system.cpu1.CUs1.num_total_cycles 4765 # number of cycles the CU ran for
+system.cpu1.CUs1.vpc 1.419098 # Vector Operations per cycle (this CU only)
+system.cpu1.CUs1.ipc 0.029591 # Instructions per cycle (this CU only)
+system.cpu1.CUs1.warp_execution_dist::samples 141 # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::mean 47.957447 # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::stdev 23.818022 # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::underflows 0 0.00% 0.00% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::1-4 5 3.55% 3.55% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::5-8 0 0.00% 3.55% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::9-12 9 6.38% 9.93% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::13-16 27 19.15% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::17-20 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::21-24 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::25-28 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::29-32 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::33-36 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::37-40 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::41-44 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::45-48 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::49-52 8 5.67% 34.75% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::53-56 0 0.00% 34.75% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::57-60 0 0.00% 34.75% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::61-64 92 65.25% 100.00% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::overflows 0 0.00% 100.00% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::min_value 1 # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::max_value 64 # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::total 141 # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.gmem_lanes_execution_dist::samples 18 # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::mean 37.722222 # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::stdev 27.174394 # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::underflows 0 0.00% 0.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::1-4 1 5.56% 5.56% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::5-8 0 0.00% 5.56% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::9-12 2 11.11% 16.67% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::13-16 6 33.33% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::17-20 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::21-24 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::25-28 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::29-32 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::33-36 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::37-40 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::41-44 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::45-48 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::49-52 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::53-56 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::57-60 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::61-64 9 50.00% 100.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::overflows 0 0.00% 100.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::min_value 1 # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::max_value 64 # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::total 18 # number of active lanes per global memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::samples 6 # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::mean 19.333333 # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::stdev 22.384518 # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::underflows 0 0.00% 0.00% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::1-4 1 16.67% 16.67% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::5-8 0 0.00% 16.67% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::9-12 1 16.67% 33.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::13-16 3 50.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::17-20 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::21-24 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::25-28 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::29-32 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::33-36 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::37-40 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::41-44 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::45-48 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::49-52 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::53-56 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::57-60 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::61-64 1 16.67% 100.00% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::overflows 0 0.00% 100.00% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::min_value 1 # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::max_value 64 # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::total 6 # number of active lanes per local memory instruction
+system.cpu1.CUs1.num_alu_insts_executed 118 # Number of dynamic non-GM memory insts executed
+system.cpu1.CUs1.times_wg_blocked_due_vgpr_alloc 0 # Number of times WGs are blocked due to VGPR allocation per SIMD
+system.cpu1.CUs1.num_CAS_ops 0 # number of compare and swap operations
+system.cpu1.CUs1.num_failed_CAS_ops 0 # number of compare and swap operations that failed
+system.cpu1.CUs1.num_completed_wfs 4 # number of completed wavefronts
+system.cpu2.num_kernel_launched 1 # number of kernel launched
+system.dispatcher_coalescer.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.dispatcher_coalescer.clk_domain.clock 1000 # Clock period in ticks
+system.dispatcher_coalescer.uncoalesced_accesses 0 # Number of uncoalesced TLB accesses
+system.dispatcher_coalescer.coalesced_accesses 0 # Number of coalesced TLB accesses
+system.dispatcher_coalescer.queuing_cycles 0 # Number of cycles spent in queue
+system.dispatcher_coalescer.local_queuing_cycles 0 # Number of cycles spent in queue for all incoming reqs
+system.dispatcher_coalescer.local_latency nan # Avg. latency over all incoming pkts
+system.dispatcher_tlb.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.dispatcher_tlb.clk_domain.clock 1000 # Clock period in ticks
+system.dispatcher_tlb.local_TLB_accesses 0 # Number of TLB accesses
+system.dispatcher_tlb.local_TLB_hits 0 # Number of TLB hits
+system.dispatcher_tlb.local_TLB_misses 0 # Number of TLB misses
+system.dispatcher_tlb.local_TLB_miss_rate nan # TLB miss rate
+system.dispatcher_tlb.global_TLB_accesses 0 # Number of TLB accesses
+system.dispatcher_tlb.global_TLB_hits 0 # Number of TLB hits
+system.dispatcher_tlb.global_TLB_misses 0 # Number of TLB misses
+system.dispatcher_tlb.global_TLB_miss_rate nan # TLB miss rate
+system.dispatcher_tlb.access_cycles 0 # Cycles spent accessing this TLB level
+system.dispatcher_tlb.page_table_cycles 0 # Cycles spent accessing the page table
+system.dispatcher_tlb.unique_pages 0 # Number of unique pages touched
+system.dispatcher_tlb.local_cycles 0 # Number of cycles spent in queue for all incoming reqs
+system.dispatcher_tlb.local_latency nan # Avg. latency over incoming coalesced reqs
+system.dispatcher_tlb.avg_reuse_distance 0 # avg. reuse distance over all pages (in ticks)
+system.l1_coalescer0.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.l1_coalescer0.clk_domain.clock 1000 # Clock period in ticks
+system.l1_coalescer0.uncoalesced_accesses 778 # Number of uncoalesced TLB accesses
+system.l1_coalescer0.coalesced_accesses 0 # Number of coalesced TLB accesses
+system.l1_coalescer0.queuing_cycles 0 # Number of cycles spent in queue
+system.l1_coalescer0.local_queuing_cycles 0 # Number of cycles spent in queue for all incoming reqs
+system.l1_coalescer0.local_latency 0 # Avg. latency over all incoming pkts
+system.l1_coalescer1.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.l1_coalescer1.clk_domain.clock 1000 # Clock period in ticks
+system.l1_coalescer1.uncoalesced_accesses 769 # Number of uncoalesced TLB accesses
+system.l1_coalescer1.coalesced_accesses 0 # Number of coalesced TLB accesses
+system.l1_coalescer1.queuing_cycles 0 # Number of cycles spent in queue
+system.l1_coalescer1.local_queuing_cycles 0 # Number of cycles spent in queue for all incoming reqs
+system.l1_coalescer1.local_latency 0 # Avg. latency over all incoming pkts
+system.l1_tlb0.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.l1_tlb0.clk_domain.clock 1000 # Clock period in ticks
+system.l1_tlb0.local_TLB_accesses 778 # Number of TLB accesses
+system.l1_tlb0.local_TLB_hits 774 # Number of TLB hits
+system.l1_tlb0.local_TLB_misses 4 # Number of TLB misses
+system.l1_tlb0.local_TLB_miss_rate 0.514139 # TLB miss rate
+system.l1_tlb0.global_TLB_accesses 778 # Number of TLB accesses
+system.l1_tlb0.global_TLB_hits 774 # Number of TLB hits
+system.l1_tlb0.global_TLB_misses 4 # Number of TLB misses
+system.l1_tlb0.global_TLB_miss_rate 0.514139 # TLB miss rate
+system.l1_tlb0.access_cycles 0 # Cycles spent accessing this TLB level
+system.l1_tlb0.page_table_cycles 0 # Cycles spent accessing the page table
+system.l1_tlb0.unique_pages 4 # Number of unique pages touched
+system.l1_tlb0.local_cycles 0 # Number of cycles spent in queue for all incoming reqs
+system.l1_tlb0.local_latency 0 # Avg. latency over incoming coalesced reqs
+system.l1_tlb0.avg_reuse_distance 0 # avg. reuse distance over all pages (in ticks)
+system.l1_tlb1.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.l1_tlb1.clk_domain.clock 1000 # Clock period in ticks
+system.l1_tlb1.local_TLB_accesses 769 # Number of TLB accesses
+system.l1_tlb1.local_TLB_hits 766 # Number of TLB hits
+system.l1_tlb1.local_TLB_misses 3 # Number of TLB misses
+system.l1_tlb1.local_TLB_miss_rate 0.390117 # TLB miss rate
+system.l1_tlb1.global_TLB_accesses 769 # Number of TLB accesses
+system.l1_tlb1.global_TLB_hits 766 # Number of TLB hits
+system.l1_tlb1.global_TLB_misses 3 # Number of TLB misses
+system.l1_tlb1.global_TLB_miss_rate 0.390117 # TLB miss rate
+system.l1_tlb1.access_cycles 0 # Cycles spent accessing this TLB level
+system.l1_tlb1.page_table_cycles 0 # Cycles spent accessing the page table
+system.l1_tlb1.unique_pages 3 # Number of unique pages touched
+system.l1_tlb1.local_cycles 0 # Number of cycles spent in queue for all incoming reqs
+system.l1_tlb1.local_latency 0 # Avg. latency over incoming coalesced reqs
+system.l1_tlb1.avg_reuse_distance 0 # avg. reuse distance over all pages (in ticks)
+system.l2_coalescer.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.l2_coalescer.clk_domain.clock 1000 # Clock period in ticks
+system.l2_coalescer.uncoalesced_accesses 8 # Number of uncoalesced TLB accesses
+system.l2_coalescer.coalesced_accesses 1 # Number of coalesced TLB accesses
+system.l2_coalescer.queuing_cycles 8000 # Number of cycles spent in queue
+system.l2_coalescer.local_queuing_cycles 1000 # Number of cycles spent in queue for all incoming reqs
+system.l2_coalescer.local_latency 125 # Avg. latency over all incoming pkts
+system.l2_tlb.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.l2_tlb.clk_domain.clock 1000 # Clock period in ticks
+system.l2_tlb.local_TLB_accesses 8 # Number of TLB accesses
+system.l2_tlb.local_TLB_hits 3 # Number of TLB hits
+system.l2_tlb.local_TLB_misses 5 # Number of TLB misses
+system.l2_tlb.local_TLB_miss_rate 62.500000 # TLB miss rate
+system.l2_tlb.global_TLB_accesses 15 # Number of TLB accesses
+system.l2_tlb.global_TLB_hits 3 # Number of TLB hits
+system.l2_tlb.global_TLB_misses 12 # Number of TLB misses
+system.l2_tlb.global_TLB_miss_rate 80 # TLB miss rate
+system.l2_tlb.access_cycles 552008 # Cycles spent accessing this TLB level
+system.l2_tlb.page_table_cycles 0 # Cycles spent accessing the page table
+system.l2_tlb.unique_pages 5 # Number of unique pages touched
+system.l2_tlb.local_cycles 69001 # Number of cycles spent in queue for all incoming reqs
+system.l2_tlb.local_latency 8625.125000 # Avg. latency over incoming coalesced reqs
+system.l2_tlb.avg_reuse_distance 0 # avg. reuse distance over all pages (in ticks)
+system.l3_coalescer.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.l3_coalescer.clk_domain.clock 1000 # Clock period in ticks
+system.l3_coalescer.uncoalesced_accesses 5 # Number of uncoalesced TLB accesses
+system.l3_coalescer.coalesced_accesses 1 # Number of coalesced TLB accesses
+system.l3_coalescer.queuing_cycles 8000 # Number of cycles spent in queue
+system.l3_coalescer.local_queuing_cycles 1000 # Number of cycles spent in queue for all incoming reqs
+system.l3_coalescer.local_latency 200 # Avg. latency over all incoming pkts
+system.l3_tlb.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.l3_tlb.clk_domain.clock 1000 # Clock period in ticks
+system.l3_tlb.local_TLB_accesses 5 # Number of TLB accesses
+system.l3_tlb.local_TLB_hits 0 # Number of TLB hits
+system.l3_tlb.local_TLB_misses 5 # Number of TLB misses
+system.l3_tlb.local_TLB_miss_rate 100 # TLB miss rate
+system.l3_tlb.global_TLB_accesses 12 # Number of TLB accesses
+system.l3_tlb.global_TLB_hits 0 # Number of TLB hits
+system.l3_tlb.global_TLB_misses 12 # Number of TLB misses
+system.l3_tlb.global_TLB_miss_rate 100 # TLB miss rate
+system.l3_tlb.access_cycles 1200000 # Cycles spent accessing this TLB level
+system.l3_tlb.page_table_cycles 6000000 # Cycles spent accessing the page table
+system.l3_tlb.unique_pages 5 # Number of unique pages touched
+system.l3_tlb.local_cycles 150000 # Number of cycles spent in queue for all incoming reqs
+system.l3_tlb.local_latency 30000 # Avg. latency over incoming coalesced reqs
+system.l3_tlb.avg_reuse_distance 0 # avg. reuse distance over all pages (in ticks)
+system.piobus.trans_dist::WriteReq 94 # Transaction distribution
+system.piobus.trans_dist::WriteResp 94 # Transaction distribution
+system.piobus.pkt_count_system.ruby.cp_cntrl0.sequencer.mem-master-port::system.cpu2.pio 188 # Packet count per connected master and slave (bytes)
+system.piobus.pkt_count::total 188 # Packet count per connected master and slave (bytes)
+system.piobus.pkt_size_system.ruby.cp_cntrl0.sequencer.mem-master-port::system.cpu2.pio 748 # Cumulative packet size per connected master and slave (bytes)
+system.piobus.pkt_size::total 748 # Cumulative packet size per connected master and slave (bytes)
+system.piobus.reqLayer0.occupancy 234500 # Layer occupancy (ticks)
+system.piobus.reqLayer0.utilization 0.1 # Layer utilization (%)
+system.piobus.respLayer0.occupancy 94000 # Layer occupancy (ticks)
+system.piobus.respLayer0.utilization 0.0 # Layer utilization (%)
+system.ruby.outstanding_req_hist::bucket_size 1
+system.ruby.outstanding_req_hist::max_bucket 9
+system.ruby.outstanding_req_hist::samples 114203
+system.ruby.outstanding_req_hist::mean 1.000035
+system.ruby.outstanding_req_hist::gmean 1.000024
+system.ruby.outstanding_req_hist::stdev 0.005918
+system.ruby.outstanding_req_hist | 0 0.00% 0.00% | 114199 100.00% 100.00% | 4 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.outstanding_req_hist::total 114203
+system.ruby.latency_hist::bucket_size 128
+system.ruby.latency_hist::max_bucket 1279
+system.ruby.latency_hist::samples 114203
+system.ruby.latency_hist::mean 4.423518
+system.ruby.latency_hist::gmean 1.078765
+system.ruby.latency_hist::stdev 30.010569
+system.ruby.latency_hist | 112668 98.66% 98.66% | 1136 0.99% 99.65% | 372 0.33% 99.98% | 3 0.00% 99.98% | 8 0.01% 99.99% | 14 0.01% 100.00% | 2 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.latency_hist::total 114203
+system.ruby.hit_latency_hist::bucket_size 128
+system.ruby.hit_latency_hist::max_bucket 1279
+system.ruby.hit_latency_hist::samples 1535
+system.ruby.hit_latency_hist::mean 255.015635
+system.ruby.hit_latency_hist::gmean 251.519163
+system.ruby.hit_latency_hist::stdev 57.825523
+system.ruby.hit_latency_hist | 0 0.00% 0.00% | 1136 74.01% 74.01% | 372 24.23% 98.24% | 3 0.20% 98.44% | 8 0.52% 98.96% | 14 0.91% 99.87% | 2 0.13% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.hit_latency_hist::total 1535
+system.ruby.miss_latency_hist::bucket_size 2
+system.ruby.miss_latency_hist::max_bucket 19
+system.ruby.miss_latency_hist::samples 112668
+system.ruby.miss_latency_hist::mean 1.009426
+system.ruby.miss_latency_hist::gmean 1.001543
+system.ruby.miss_latency_hist::stdev 0.411800
+system.ruby.miss_latency_hist | 112609 99.95% 99.95% | 0 0.00% 99.95% | 0 0.00% 99.95% | 0 0.00% 99.95% | 0 0.00% 99.95% | 0 0.00% 99.95% | 0 0.00% 99.95% | 0 0.00% 99.95% | 0 0.00% 99.95% | 59 0.05% 100.00%
+system.ruby.miss_latency_hist::total 112668
+system.ruby.L1Cache.incomplete_times 112609
+system.ruby.L2Cache.incomplete_times 59
+system.ruby.cp_cntrl0.L1D0cache.demand_hits 0 # Number of cache demand hits
+system.ruby.cp_cntrl0.L1D0cache.demand_misses 506 # Number of cache demand misses
+system.ruby.cp_cntrl0.L1D0cache.demand_accesses 506 # Number of cache demand accesses
+system.ruby.cp_cntrl0.L1D0cache.num_data_array_reads 16155 # number of data array reads
+system.ruby.cp_cntrl0.L1D0cache.num_data_array_writes 11985 # number of data array writes
+system.ruby.cp_cntrl0.L1D0cache.num_tag_array_reads 27132 # number of tag array reads
+system.ruby.cp_cntrl0.L1D0cache.num_tag_array_writes 1584 # number of tag array writes
+system.ruby.cp_cntrl0.L1D1cache.demand_hits 0 # Number of cache demand hits
+system.ruby.cp_cntrl0.L1D1cache.demand_misses 0 # Number of cache demand misses
+system.ruby.cp_cntrl0.L1D1cache.demand_accesses 0 # Number of cache demand accesses
+system.ruby.cp_cntrl0.L1Icache.demand_hits 0 # Number of cache demand hits
+system.ruby.cp_cntrl0.L1Icache.demand_misses 1088 # Number of cache demand misses
+system.ruby.cp_cntrl0.L1Icache.demand_accesses 1088 # Number of cache demand accesses
+system.ruby.cp_cntrl0.L1Icache.num_data_array_reads 86007 # number of data array reads
+system.ruby.cp_cntrl0.L1Icache.num_data_array_writes 54 # number of data array writes
+system.ruby.cp_cntrl0.L1Icache.num_tag_array_reads 87684 # number of tag array reads
+system.ruby.cp_cntrl0.L1Icache.num_tag_array_writes 54 # number of tag array writes
+system.ruby.cp_cntrl0.L2cache.demand_hits 0 # Number of cache demand hits
+system.ruby.cp_cntrl0.L2cache.demand_misses 1535 # Number of cache demand misses
+system.ruby.cp_cntrl0.L2cache.demand_accesses 1535 # Number of cache demand accesses
+system.ruby.cp_cntrl0.L2cache.num_data_array_reads 120 # number of data array reads
+system.ruby.cp_cntrl0.L2cache.num_data_array_writes 11982 # number of data array writes
+system.ruby.cp_cntrl0.L2cache.num_tag_array_reads 12068 # number of tag array reads
+system.ruby.cp_cntrl0.L2cache.num_tag_array_writes 1658 # number of tag array writes
+system.ruby.dir_cntrl0.L3CacheMemory.demand_hits 0 # Number of cache demand hits
+system.ruby.dir_cntrl0.L3CacheMemory.demand_misses 0 # Number of cache demand misses
+system.ruby.dir_cntrl0.L3CacheMemory.demand_accesses 0 # Number of cache demand accesses
+system.ruby.dir_cntrl0.L3CacheMemory.num_data_array_writes 1560 # number of data array writes
+system.ruby.dir_cntrl0.L3CacheMemory.num_tag_array_reads 1560 # number of tag array reads
+system.ruby.dir_cntrl0.L3CacheMemory.num_tag_array_writes 1578 # number of tag array writes
+system.ruby.network.ext_links0.int_node.percent_links_utilized 1.075754
+system.ruby.network.ext_links0.int_node.msg_count.Control::0 1560
+system.ruby.network.ext_links0.int_node.msg_count.Data::0 18
+system.ruby.network.ext_links0.int_node.msg_count.Request_Control::0 1542
+system.ruby.network.ext_links0.int_node.msg_count.Response_Data::2 1546
+system.ruby.network.ext_links0.int_node.msg_count.Response_Control::2 1558
+system.ruby.network.ext_links0.int_node.msg_count.Writeback_Control::2 16
+system.ruby.network.ext_links0.int_node.msg_count.Unblock_Control::4 1541
+system.ruby.network.ext_links0.int_node.msg_bytes.Control::0 12480
+system.ruby.network.ext_links0.int_node.msg_bytes.Data::0 1296
+system.ruby.network.ext_links0.int_node.msg_bytes.Request_Control::0 12336
+system.ruby.network.ext_links0.int_node.msg_bytes.Response_Data::2 111312
+system.ruby.network.ext_links0.int_node.msg_bytes.Response_Control::2 12464
+system.ruby.network.ext_links0.int_node.msg_bytes.Writeback_Control::2 128
+system.ruby.network.ext_links0.int_node.msg_bytes.Unblock_Control::4 12328
+system.ruby.network.ext_links1.int_node.percent_links_utilized 1.347807
+system.ruby.network.ext_links1.int_node.msg_count.Control::0 25
+system.ruby.network.ext_links1.int_node.msg_count.Request_Control::0 1535
+system.ruby.network.ext_links1.int_node.msg_count.Response_Data::2 1537
+system.ruby.network.ext_links1.int_node.msg_count.Response_Control::2 23
+system.ruby.network.ext_links1.int_node.msg_count.Unblock_Control::4 1534
+system.ruby.network.ext_links1.int_node.msg_bytes.Control::0 200
+system.ruby.network.ext_links1.int_node.msg_bytes.Request_Control::0 12280
+system.ruby.network.ext_links1.int_node.msg_bytes.Response_Data::2 110664
+system.ruby.network.ext_links1.int_node.msg_bytes.Response_Control::2 184
+system.ruby.network.ext_links1.int_node.msg_bytes.Unblock_Control::4 12272
+system.ruby.tcp_cntrl0.L1cache.demand_hits 0 # Number of cache demand hits
+system.ruby.tcp_cntrl0.L1cache.demand_misses 0 # Number of cache demand misses
+system.ruby.tcp_cntrl0.L1cache.demand_accesses 0 # Number of cache demand accesses
+system.ruby.tcp_cntrl0.L1cache.num_data_array_reads 6 # number of data array reads
+system.ruby.tcp_cntrl0.L1cache.num_data_array_writes 11 # number of data array writes
+system.ruby.tcp_cntrl0.L1cache.num_tag_array_reads 1297 # number of tag array reads
+system.ruby.tcp_cntrl0.L1cache.num_tag_array_writes 11 # number of tag array writes
+system.ruby.tcp_cntrl0.L1cache.num_tag_array_stalls 5082 # number of stalls caused by tag array
+system.ruby.tcp_cntrl0.L1cache.num_data_array_stalls 6 # number of stalls caused by data array
+system.ruby.tcp_cntrl0.coalescer.gpu_tcp_ld_hits 0 # loads that hit in the TCP
+system.ruby.tcp_cntrl0.coalescer.gpu_tcp_ld_transfers 0 # TCP to TCP load transfers
+system.ruby.tcp_cntrl0.coalescer.gpu_tcc_ld_hits 0 # loads that hit in the TCC
+system.ruby.tcp_cntrl0.coalescer.gpu_ld_misses 5 # loads that miss in the GPU
+system.ruby.tcp_cntrl0.coalescer.gpu_tcp_st_hits 0 # stores that hit in the TCP
+system.ruby.tcp_cntrl0.coalescer.gpu_tcp_st_transfers 0 # TCP to TCP store transfers
+system.ruby.tcp_cntrl0.coalescer.gpu_tcc_st_hits 0 # stores that hit in the TCC
+system.ruby.tcp_cntrl0.coalescer.gpu_st_misses 9 # stores that miss in the GPU
+system.ruby.tcp_cntrl0.coalescer.cp_tcp_ld_hits 0 # loads that hit in the TCP
+system.ruby.tcp_cntrl0.coalescer.cp_tcp_ld_transfers 0 # TCP to TCP load transfers
+system.ruby.tcp_cntrl0.coalescer.cp_tcc_ld_hits 0 # loads that hit in the TCC
+system.ruby.tcp_cntrl0.coalescer.cp_ld_misses 0 # loads that miss in the GPU
+system.ruby.tcp_cntrl0.coalescer.cp_tcp_st_hits 0 # stores that hit in the TCP
+system.ruby.tcp_cntrl0.coalescer.cp_tcp_st_transfers 0 # TCP to TCP store transfers
+system.ruby.tcp_cntrl0.coalescer.cp_tcc_st_hits 0 # stores that hit in the TCC
+system.ruby.tcp_cntrl0.coalescer.cp_st_misses 0 # stores that miss in the GPU
+system.ruby.network.ext_links2.int_node.percent_links_utilized 0.115426
+system.ruby.network.ext_links2.int_node.msg_count.Control::0 1535
+system.ruby.network.ext_links2.int_node.msg_count.Data::0 18
+system.ruby.network.ext_links2.int_node.msg_count.Data::1 18
+system.ruby.network.ext_links2.int_node.msg_count.Request_Control::0 7
+system.ruby.network.ext_links2.int_node.msg_count.Request_Control::1 9
+system.ruby.network.ext_links2.int_node.msg_count.Response_Data::2 9
+system.ruby.network.ext_links2.int_node.msg_count.Response_Data::3 11
+system.ruby.network.ext_links2.int_node.msg_count.Response_Control::2 1535
+system.ruby.network.ext_links2.int_node.msg_count.Writeback_Control::2 16
+system.ruby.network.ext_links2.int_node.msg_count.Writeback_Control::3 16
+system.ruby.network.ext_links2.int_node.msg_count.Unblock_Control::4 7
+system.ruby.network.ext_links2.int_node.msg_bytes.Control::0 12280
+system.ruby.network.ext_links2.int_node.msg_bytes.Data::0 1296
+system.ruby.network.ext_links2.int_node.msg_bytes.Data::1 1296
+system.ruby.network.ext_links2.int_node.msg_bytes.Request_Control::0 56
+system.ruby.network.ext_links2.int_node.msg_bytes.Request_Control::1 72
+system.ruby.network.ext_links2.int_node.msg_bytes.Response_Data::2 648
+system.ruby.network.ext_links2.int_node.msg_bytes.Response_Data::3 792
+system.ruby.network.ext_links2.int_node.msg_bytes.Response_Control::2 12280
+system.ruby.network.ext_links2.int_node.msg_bytes.Writeback_Control::2 128
+system.ruby.network.ext_links2.int_node.msg_bytes.Writeback_Control::3 128
+system.ruby.network.ext_links2.int_node.msg_bytes.Unblock_Control::4 56
+system.ruby.tcp_cntrl1.L1cache.demand_hits 0 # Number of cache demand hits
+system.ruby.tcp_cntrl1.L1cache.demand_misses 0 # Number of cache demand misses
+system.ruby.tcp_cntrl1.L1cache.demand_accesses 0 # Number of cache demand accesses
+system.ruby.tcp_cntrl1.L1cache.num_data_array_reads 6 # number of data array reads
+system.ruby.tcp_cntrl1.L1cache.num_data_array_writes 11 # number of data array writes
+system.ruby.tcp_cntrl1.L1cache.num_tag_array_reads 1297 # number of tag array reads
+system.ruby.tcp_cntrl1.L1cache.num_tag_array_writes 11 # number of tag array writes
+system.ruby.tcp_cntrl1.L1cache.num_tag_array_stalls 5082 # number of stalls caused by tag array
+system.ruby.tcp_cntrl1.L1cache.num_data_array_stalls 6 # number of stalls caused by data array
+system.ruby.tcp_cntrl1.coalescer.gpu_tcp_ld_hits 0 # loads that hit in the TCP
+system.ruby.tcp_cntrl1.coalescer.gpu_tcp_ld_transfers 0 # TCP to TCP load transfers
+system.ruby.tcp_cntrl1.coalescer.gpu_tcc_ld_hits 0 # loads that hit in the TCC
+system.ruby.tcp_cntrl1.coalescer.gpu_ld_misses 5 # loads that miss in the GPU
+system.ruby.tcp_cntrl1.coalescer.gpu_tcp_st_hits 0 # stores that hit in the TCP
+system.ruby.tcp_cntrl1.coalescer.gpu_tcp_st_transfers 0 # TCP to TCP store transfers
+system.ruby.tcp_cntrl1.coalescer.gpu_tcc_st_hits 0 # stores that hit in the TCC
+system.ruby.tcp_cntrl1.coalescer.gpu_st_misses 9 # stores that miss in the GPU
+system.ruby.tcp_cntrl1.coalescer.cp_tcp_ld_hits 0 # loads that hit in the TCP
+system.ruby.tcp_cntrl1.coalescer.cp_tcp_ld_transfers 0 # TCP to TCP load transfers
+system.ruby.tcp_cntrl1.coalescer.cp_tcc_ld_hits 0 # loads that hit in the TCC
+system.ruby.tcp_cntrl1.coalescer.cp_ld_misses 0 # loads that miss in the GPU
+system.ruby.tcp_cntrl1.coalescer.cp_tcp_st_hits 0 # stores that hit in the TCP
+system.ruby.tcp_cntrl1.coalescer.cp_tcp_st_transfers 0 # TCP to TCP store transfers
+system.ruby.tcp_cntrl1.coalescer.cp_tcc_st_hits 0 # stores that hit in the TCC
+system.ruby.tcp_cntrl1.coalescer.cp_st_misses 0 # stores that miss in the GPU
+system.ruby.sqc_cntrl0.L1cache.demand_hits 0 # Number of cache demand hits
+system.ruby.sqc_cntrl0.L1cache.demand_misses 0 # Number of cache demand misses
+system.ruby.sqc_cntrl0.L1cache.demand_accesses 0 # Number of cache demand accesses
+system.ruby.sqc_cntrl0.L1cache.num_data_array_reads 86 # number of data array reads
+system.ruby.sqc_cntrl0.L1cache.num_tag_array_reads 91 # number of tag array reads
+system.ruby.sqc_cntrl0.L1cache.num_tag_array_writes 10 # number of tag array writes
+system.ruby.sqc_cntrl0.sequencer.load_waiting_on_load 97 # Number of times a load aliased with a pending load
+system.ruby.tcc_cntrl0.L2cache.demand_hits 0 # Number of cache demand hits
+system.ruby.tcc_cntrl0.L2cache.demand_misses 0 # Number of cache demand misses
+system.ruby.tcc_cntrl0.L2cache.demand_accesses 0 # Number of cache demand accesses
+system.ruby.tcc_cntrl0.L2cache.num_data_array_writes 9 # number of data array writes
+system.ruby.tcc_cntrl0.L2cache.num_tag_array_reads 1569 # number of tag array reads
+system.ruby.tcc_cntrl0.L2cache.num_tag_array_writes 1545 # number of tag array writes
+system.ruby.tcc_cntrl0.L2cache.num_tag_array_stalls 1 # number of stalls caused by tag array
+system.ruby.network.msg_count.Control 3120
+system.ruby.network.msg_count.Data 54
+system.ruby.network.msg_count.Request_Control 3093
+system.ruby.network.msg_count.Response_Data 3103
+system.ruby.network.msg_count.Response_Control 3116
+system.ruby.network.msg_count.Writeback_Control 48
+system.ruby.network.msg_count.Unblock_Control 3082
+system.ruby.network.msg_byte.Control 24960
+system.ruby.network.msg_byte.Data 3888
+system.ruby.network.msg_byte.Request_Control 24744
+system.ruby.network.msg_byte.Response_Data 223416
+system.ruby.network.msg_byte.Response_Control 24928
+system.ruby.network.msg_byte.Writeback_Control 384
+system.ruby.network.msg_byte.Unblock_Control 24656
+system.sqc_coalescer.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.sqc_coalescer.clk_domain.clock 1000 # Clock period in ticks
+system.sqc_coalescer.uncoalesced_accesses 86 # Number of uncoalesced TLB accesses
+system.sqc_coalescer.coalesced_accesses 48 # Number of coalesced TLB accesses
+system.sqc_coalescer.queuing_cycles 211000 # Number of cycles spent in queue
+system.sqc_coalescer.local_queuing_cycles 211000 # Number of cycles spent in queue for all incoming reqs
+system.sqc_coalescer.local_latency 2453.488372 # Avg. latency over all incoming pkts
+system.sqc_tlb.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.sqc_tlb.clk_domain.clock 1000 # Clock period in ticks
+system.sqc_tlb.local_TLB_accesses 48 # Number of TLB accesses
+system.sqc_tlb.local_TLB_hits 47 # Number of TLB hits
+system.sqc_tlb.local_TLB_misses 1 # Number of TLB misses
+system.sqc_tlb.local_TLB_miss_rate 2.083333 # TLB miss rate
+system.sqc_tlb.global_TLB_accesses 86 # Number of TLB accesses
+system.sqc_tlb.global_TLB_hits 78 # Number of TLB hits
+system.sqc_tlb.global_TLB_misses 8 # Number of TLB misses
+system.sqc_tlb.global_TLB_miss_rate 9.302326 # TLB miss rate
+system.sqc_tlb.access_cycles 86008 # Cycles spent accessing this TLB level
+system.sqc_tlb.page_table_cycles 0 # Cycles spent accessing the page table
+system.sqc_tlb.unique_pages 1 # Number of unique pages touched
+system.sqc_tlb.local_cycles 48001 # Number of cycles spent in queue for all incoming reqs
+system.sqc_tlb.local_latency 1000.020833 # Avg. latency over incoming coalesced reqs
+system.sqc_tlb.avg_reuse_distance 0 # avg. reuse distance over all pages (in ticks)
+system.ruby.network.ext_links0.int_node.throttle0.link_utilization 0.766700
+system.ruby.network.ext_links0.int_node.throttle0.msg_count.Data::0 18
+system.ruby.network.ext_links0.int_node.throttle0.msg_count.Request_Control::0 1542
+system.ruby.network.ext_links0.int_node.throttle0.msg_count.Response_Data::2 2
+system.ruby.network.ext_links0.int_node.throttle0.msg_count.Response_Control::2 1558
+system.ruby.network.ext_links0.int_node.throttle0.msg_count.Unblock_Control::4 1541
+system.ruby.network.ext_links0.int_node.throttle0.msg_bytes.Data::0 1296
+system.ruby.network.ext_links0.int_node.throttle0.msg_bytes.Request_Control::0 12336
+system.ruby.network.ext_links0.int_node.throttle0.msg_bytes.Response_Data::2 144
+system.ruby.network.ext_links0.int_node.throttle0.msg_bytes.Response_Control::2 12464
+system.ruby.network.ext_links0.int_node.throttle0.msg_bytes.Unblock_Control::4 12328
+system.ruby.network.ext_links0.int_node.throttle1.link_utilization 2.201021
+system.ruby.network.ext_links0.int_node.throttle1.msg_count.Control::0 25
+system.ruby.network.ext_links0.int_node.throttle1.msg_count.Response_Data::2 1535
+system.ruby.network.ext_links0.int_node.throttle1.msg_bytes.Control::0 200
+system.ruby.network.ext_links0.int_node.throttle1.msg_bytes.Response_Data::2 110520
+system.ruby.network.ext_links0.int_node.throttle2.link_utilization 0.259542
+system.ruby.network.ext_links0.int_node.throttle2.msg_count.Control::0 1535
+system.ruby.network.ext_links0.int_node.throttle2.msg_count.Response_Data::2 9
+system.ruby.network.ext_links0.int_node.throttle2.msg_count.Writeback_Control::2 16
+system.ruby.network.ext_links0.int_node.throttle2.msg_bytes.Control::0 12280
+system.ruby.network.ext_links0.int_node.throttle2.msg_bytes.Response_Data::2 648
+system.ruby.network.ext_links0.int_node.throttle2.msg_bytes.Writeback_Control::2 128
+system.ruby.network.ext_links1.int_node.throttle0.link_utilization 2.201021
+system.ruby.network.ext_links1.int_node.throttle0.msg_count.Control::0 25
+system.ruby.network.ext_links1.int_node.throttle0.msg_count.Response_Data::2 1535
+system.ruby.network.ext_links1.int_node.throttle0.msg_bytes.Control::0 200
+system.ruby.network.ext_links1.int_node.throttle0.msg_bytes.Response_Data::2 110520
+system.ruby.network.ext_links1.int_node.throttle1.link_utilization 0.494594
+system.ruby.network.ext_links1.int_node.throttle1.msg_count.Request_Control::0 1535
+system.ruby.network.ext_links1.int_node.throttle1.msg_count.Response_Data::2 2
+system.ruby.network.ext_links1.int_node.throttle1.msg_count.Response_Control::2 23
+system.ruby.network.ext_links1.int_node.throttle1.msg_count.Unblock_Control::4 1534
+system.ruby.network.ext_links1.int_node.throttle1.msg_bytes.Request_Control::0 12280
+system.ruby.network.ext_links1.int_node.throttle1.msg_bytes.Response_Data::2 144
+system.ruby.network.ext_links1.int_node.throttle1.msg_bytes.Response_Control::2 184
+system.ruby.network.ext_links1.int_node.throttle1.msg_bytes.Unblock_Control::4 12272
+system.ruby.network.ext_links2.int_node.throttle0.link_utilization 0.005566
+system.ruby.network.ext_links2.int_node.throttle0.msg_count.Response_Data::3 3
+system.ruby.network.ext_links2.int_node.throttle0.msg_count.Writeback_Control::3 8
+system.ruby.network.ext_links2.int_node.throttle0.msg_bytes.Response_Data::3 216
+system.ruby.network.ext_links2.int_node.throttle0.msg_bytes.Writeback_Control::3 64
+system.ruby.network.ext_links2.int_node.throttle1.link_utilization 0.005566
+system.ruby.network.ext_links2.int_node.throttle1.msg_count.Response_Data::3 3
+system.ruby.network.ext_links2.int_node.throttle1.msg_count.Writeback_Control::3 8
+system.ruby.network.ext_links2.int_node.throttle1.msg_bytes.Response_Data::3 216
+system.ruby.network.ext_links2.int_node.throttle1.msg_bytes.Writeback_Control::3 64
+system.ruby.network.ext_links2.int_node.throttle2.link_utilization 0.286737
+system.ruby.network.ext_links2.int_node.throttle2.msg_count.Control::0 1535
+system.ruby.network.ext_links2.int_node.throttle2.msg_count.Data::1 18
+system.ruby.network.ext_links2.int_node.throttle2.msg_count.Request_Control::1 9
+system.ruby.network.ext_links2.int_node.throttle2.msg_count.Response_Data::2 9
+system.ruby.network.ext_links2.int_node.throttle2.msg_count.Writeback_Control::2 16
+system.ruby.network.ext_links2.int_node.throttle2.msg_bytes.Control::0 12280
+system.ruby.network.ext_links2.int_node.throttle2.msg_bytes.Data::1 1296
+system.ruby.network.ext_links2.int_node.throttle2.msg_bytes.Request_Control::1 72
+system.ruby.network.ext_links2.int_node.throttle2.msg_bytes.Response_Data::2 648
+system.ruby.network.ext_links2.int_node.throttle2.msg_bytes.Writeback_Control::2 128
+system.ruby.network.ext_links2.int_node.throttle3.link_utilization 0.007156
+system.ruby.network.ext_links2.int_node.throttle3.msg_count.Response_Data::3 5
+system.ruby.network.ext_links2.int_node.throttle3.msg_bytes.Response_Data::3 360
+system.ruby.network.ext_links2.int_node.throttle4.link_utilization 0.272106
+system.ruby.network.ext_links2.int_node.throttle4.msg_count.Data::0 18
+system.ruby.network.ext_links2.int_node.throttle4.msg_count.Request_Control::0 7
+system.ruby.network.ext_links2.int_node.throttle4.msg_count.Response_Control::2 1535
+system.ruby.network.ext_links2.int_node.throttle4.msg_count.Unblock_Control::4 7
+system.ruby.network.ext_links2.int_node.throttle4.msg_bytes.Data::0 1296
+system.ruby.network.ext_links2.int_node.throttle4.msg_bytes.Request_Control::0 56
+system.ruby.network.ext_links2.int_node.throttle4.msg_bytes.Response_Control::2 12280
+system.ruby.network.ext_links2.int_node.throttle4.msg_bytes.Unblock_Control::4 56
+system.ruby.LD.latency_hist::bucket_size 128
+system.ruby.LD.latency_hist::max_bucket 1279
+system.ruby.LD.latency_hist::samples 16335
+system.ruby.LD.latency_hist::mean 3.784451
+system.ruby.LD.latency_hist::gmean 1.062267
+system.ruby.LD.latency_hist::stdev 27.056562
+system.ruby.LD.latency_hist | 16160 98.93% 98.93% | 90 0.55% 99.48% | 84 0.51% 99.99% | 0 0.00% 99.99% | 0 0.00% 99.99% | 1 0.01% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.LD.latency_hist::total 16335
+system.ruby.LD.hit_latency_hist::bucket_size 128
+system.ruby.LD.hit_latency_hist::max_bucket 1279
+system.ruby.LD.hit_latency_hist::samples 175
+system.ruby.LD.hit_latency_hist::mean 260.394286
+system.ruby.LD.hit_latency_hist::gmean 258.339713
+system.ruby.LD.hit_latency_hist::stdev 42.039376
+system.ruby.LD.hit_latency_hist | 0 0.00% 0.00% | 90 51.43% 51.43% | 84 48.00% 99.43% | 0 0.00% 99.43% | 0 0.00% 99.43% | 1 0.57% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.LD.hit_latency_hist::total 175
+system.ruby.LD.miss_latency_hist::bucket_size 2
+system.ruby.LD.miss_latency_hist::max_bucket 19
+system.ruby.LD.miss_latency_hist::samples 16160
+system.ruby.LD.miss_latency_hist::mean 1.005569
+system.ruby.LD.miss_latency_hist::gmean 1.000911
+system.ruby.LD.miss_latency_hist::stdev 0.316580
+system.ruby.LD.miss_latency_hist | 16155 99.97% 99.97% | 0 0.00% 99.97% | 0 0.00% 99.97% | 0 0.00% 99.97% | 0 0.00% 99.97% | 0 0.00% 99.97% | 0 0.00% 99.97% | 0 0.00% 99.97% | 0 0.00% 99.97% | 5 0.03% 100.00%
+system.ruby.LD.miss_latency_hist::total 16160
+system.ruby.ST.latency_hist::bucket_size 128
+system.ruby.ST.latency_hist::max_bucket 1279
+system.ruby.ST.latency_hist::samples 10412
+system.ruby.ST.latency_hist::mean 8.839992
+system.ruby.ST.latency_hist::gmean 1.186243
+system.ruby.ST.latency_hist::stdev 45.390081
+system.ruby.ST.latency_hist | 10090 96.91% 96.91% | 254 2.44% 99.35% | 62 0.60% 99.94% | 0 0.00% 99.94% | 1 0.01% 99.95% | 4 0.04% 99.99% | 1 0.01% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.ST.latency_hist::total 10412
+system.ruby.ST.hit_latency_hist::bucket_size 128
+system.ruby.ST.hit_latency_hist::max_bucket 1279
+system.ruby.ST.hit_latency_hist::samples 322
+system.ruby.ST.hit_latency_hist::mean 254.509317
+system.ruby.ST.hit_latency_hist::gmean 250.282441
+system.ruby.ST.hit_latency_hist::stdev 65.931487
+system.ruby.ST.hit_latency_hist | 0 0.00% 0.00% | 254 78.88% 78.88% | 62 19.25% 98.14% | 0 0.00% 98.14% | 1 0.31% 98.45% | 4 1.24% 99.69% | 1 0.31% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.ST.hit_latency_hist::total 322
+system.ruby.ST.miss_latency_hist::bucket_size 1
+system.ruby.ST.miss_latency_hist::max_bucket 9
+system.ruby.ST.miss_latency_hist::samples 10090
+system.ruby.ST.miss_latency_hist::mean 1
+system.ruby.ST.miss_latency_hist::gmean 1
+system.ruby.ST.miss_latency_hist | 0 0.00% 0.00% | 10090 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.ST.miss_latency_hist::total 10090
+system.ruby.IFETCH.latency_hist::bucket_size 128
+system.ruby.IFETCH.latency_hist::max_bucket 1279
+system.ruby.IFETCH.latency_hist::samples 87095
+system.ruby.IFETCH.latency_hist::mean 4.017395
+system.ruby.IFETCH.latency_hist::gmean 1.069735
+system.ruby.IFETCH.latency_hist::stdev 28.134930
+system.ruby.IFETCH.latency_hist | 86061 98.81% 98.81% | 790 0.91% 99.72% | 224 0.26% 99.98% | 3 0.00% 99.98% | 7 0.01% 99.99% | 9 0.01% 100.00% | 1 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.IFETCH.latency_hist::total 87095
+system.ruby.IFETCH.hit_latency_hist::bucket_size 128
+system.ruby.IFETCH.hit_latency_hist::max_bucket 1279
+system.ruby.IFETCH.hit_latency_hist::samples 1034
+system.ruby.IFETCH.hit_latency_hist::mean 254.218569
+system.ruby.IFETCH.hit_latency_hist::gmean 250.716467
+system.ruby.IFETCH.hit_latency_hist::stdev 57.514968
+system.ruby.IFETCH.hit_latency_hist | 0 0.00% 0.00% | 790 76.40% 76.40% | 224 21.66% 98.07% | 3 0.29% 98.36% | 7 0.68% 99.03% | 9 0.87% 99.90% | 1 0.10% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.IFETCH.hit_latency_hist::total 1034
+system.ruby.IFETCH.miss_latency_hist::bucket_size 2
+system.ruby.IFETCH.miss_latency_hist::max_bucket 19
+system.ruby.IFETCH.miss_latency_hist::samples 86061
+system.ruby.IFETCH.miss_latency_hist::mean 1.011294
+system.ruby.IFETCH.miss_latency_hist::gmean 1.001849
+system.ruby.IFETCH.miss_latency_hist::stdev 0.450747
+system.ruby.IFETCH.miss_latency_hist | 86007 99.94% 99.94% | 0 0.00% 99.94% | 0 0.00% 99.94% | 0 0.00% 99.94% | 0 0.00% 99.94% | 0 0.00% 99.94% | 0 0.00% 99.94% | 0 0.00% 99.94% | 0 0.00% 99.94% | 54 0.06% 100.00%
+system.ruby.IFETCH.miss_latency_hist::total 86061
+system.ruby.RMW_Read.latency_hist::bucket_size 32
+system.ruby.RMW_Read.latency_hist::max_bucket 319
+system.ruby.RMW_Read.latency_hist::samples 341
+system.ruby.RMW_Read.latency_hist::mean 4.114370
+system.ruby.RMW_Read.latency_hist::gmean 1.067644
+system.ruby.RMW_Read.latency_hist::stdev 28.783090
+system.ruby.RMW_Read.latency_hist | 337 98.83% 98.83% | 0 0.00% 98.83% | 0 0.00% 98.83% | 0 0.00% 98.83% | 0 0.00% 98.83% | 0 0.00% 98.83% | 0 0.00% 98.83% | 2 0.59% 99.41% | 0 0.00% 99.41% | 2 0.59% 100.00%
+system.ruby.RMW_Read.latency_hist::total 341
+system.ruby.RMW_Read.hit_latency_hist::bucket_size 32
+system.ruby.RMW_Read.hit_latency_hist::max_bucket 319
+system.ruby.RMW_Read.hit_latency_hist::samples 4
+system.ruby.RMW_Read.hit_latency_hist::mean 266.500000
+system.ruby.RMW_Read.hit_latency_hist::gmean 265.077347
+system.ruby.RMW_Read.hit_latency_hist::stdev 31.754265
+system.ruby.RMW_Read.hit_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 2 50.00% 50.00% | 0 0.00% 50.00% | 2 50.00% 100.00%
+system.ruby.RMW_Read.hit_latency_hist::total 4
+system.ruby.RMW_Read.miss_latency_hist::bucket_size 1
+system.ruby.RMW_Read.miss_latency_hist::max_bucket 9
+system.ruby.RMW_Read.miss_latency_hist::samples 337
+system.ruby.RMW_Read.miss_latency_hist::mean 1
+system.ruby.RMW_Read.miss_latency_hist::gmean 1
+system.ruby.RMW_Read.miss_latency_hist | 0 0.00% 0.00% | 337 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.RMW_Read.miss_latency_hist::total 337
+system.ruby.Locked_RMW_Read.latency_hist::bucket_size 1
+system.ruby.Locked_RMW_Read.latency_hist::max_bucket 9
+system.ruby.Locked_RMW_Read.latency_hist::samples 10
+system.ruby.Locked_RMW_Read.latency_hist::mean 1
+system.ruby.Locked_RMW_Read.latency_hist::gmean 1
+system.ruby.Locked_RMW_Read.latency_hist | 0 0.00% 0.00% | 10 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.Locked_RMW_Read.latency_hist::total 10
+system.ruby.Locked_RMW_Read.miss_latency_hist::bucket_size 1
+system.ruby.Locked_RMW_Read.miss_latency_hist::max_bucket 9
+system.ruby.Locked_RMW_Read.miss_latency_hist::samples 10
+system.ruby.Locked_RMW_Read.miss_latency_hist::mean 1
+system.ruby.Locked_RMW_Read.miss_latency_hist::gmean 1
+system.ruby.Locked_RMW_Read.miss_latency_hist | 0 0.00% 0.00% | 10 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.Locked_RMW_Read.miss_latency_hist::total 10
+system.ruby.Locked_RMW_Write.latency_hist::bucket_size 1
+system.ruby.Locked_RMW_Write.latency_hist::max_bucket 9
+system.ruby.Locked_RMW_Write.latency_hist::samples 10
+system.ruby.Locked_RMW_Write.latency_hist::mean 1
+system.ruby.Locked_RMW_Write.latency_hist::gmean 1
+system.ruby.Locked_RMW_Write.latency_hist | 0 0.00% 0.00% | 10 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.Locked_RMW_Write.latency_hist::total 10
+system.ruby.Locked_RMW_Write.miss_latency_hist::bucket_size 1
+system.ruby.Locked_RMW_Write.miss_latency_hist::max_bucket 9
+system.ruby.Locked_RMW_Write.miss_latency_hist::samples 10
+system.ruby.Locked_RMW_Write.miss_latency_hist::mean 1
+system.ruby.Locked_RMW_Write.miss_latency_hist::gmean 1
+system.ruby.Locked_RMW_Write.miss_latency_hist | 0 0.00% 0.00% | 10 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.Locked_RMW_Write.miss_latency_hist::total 10
+system.ruby.L1Cache.miss_mach_latency_hist::bucket_size 1
+system.ruby.L1Cache.miss_mach_latency_hist::max_bucket 9
+system.ruby.L1Cache.miss_mach_latency_hist::samples 112609
+system.ruby.L1Cache.miss_mach_latency_hist::mean 1
+system.ruby.L1Cache.miss_mach_latency_hist::gmean 1
+system.ruby.L1Cache.miss_mach_latency_hist | 0 0.00% 0.00% | 112609 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.L1Cache.miss_mach_latency_hist::total 112609
+system.ruby.L2Cache.miss_mach_latency_hist::bucket_size 2
+system.ruby.L2Cache.miss_mach_latency_hist::max_bucket 19
+system.ruby.L2Cache.miss_mach_latency_hist::samples 59
+system.ruby.L2Cache.miss_mach_latency_hist::mean 19
+system.ruby.L2Cache.miss_mach_latency_hist::gmean 19.000000
+system.ruby.L2Cache.miss_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 59 100.00% 100.00%
+system.ruby.L2Cache.miss_mach_latency_hist::total 59
+system.ruby.Directory.hit_mach_latency_hist::bucket_size 128
+system.ruby.Directory.hit_mach_latency_hist::max_bucket 1279
+system.ruby.Directory.hit_mach_latency_hist::samples 1535
+system.ruby.Directory.hit_mach_latency_hist::mean 255.015635
+system.ruby.Directory.hit_mach_latency_hist::gmean 251.519163
+system.ruby.Directory.hit_mach_latency_hist::stdev 57.825523
+system.ruby.Directory.hit_mach_latency_hist | 0 0.00% 0.00% | 1136 74.01% 74.01% | 372 24.23% 98.24% | 3 0.20% 98.44% | 8 0.52% 98.96% | 14 0.91% 99.87% | 2 0.13% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.Directory.hit_mach_latency_hist::total 1535
+system.ruby.LD.L1Cache.miss_type_mach_latency_hist::bucket_size 1
+system.ruby.LD.L1Cache.miss_type_mach_latency_hist::max_bucket 9
+system.ruby.LD.L1Cache.miss_type_mach_latency_hist::samples 16155
+system.ruby.LD.L1Cache.miss_type_mach_latency_hist::mean 1
+system.ruby.LD.L1Cache.miss_type_mach_latency_hist::gmean 1
+system.ruby.LD.L1Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 16155 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.LD.L1Cache.miss_type_mach_latency_hist::total 16155
+system.ruby.LD.L2Cache.miss_type_mach_latency_hist::bucket_size 2
+system.ruby.LD.L2Cache.miss_type_mach_latency_hist::max_bucket 19
+system.ruby.LD.L2Cache.miss_type_mach_latency_hist::samples 5
+system.ruby.LD.L2Cache.miss_type_mach_latency_hist::mean 19
+system.ruby.LD.L2Cache.miss_type_mach_latency_hist::gmean 19.000000
+system.ruby.LD.L2Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 5 100.00% 100.00%
+system.ruby.LD.L2Cache.miss_type_mach_latency_hist::total 5
+system.ruby.LD.Directory.hit_type_mach_latency_hist::bucket_size 128
+system.ruby.LD.Directory.hit_type_mach_latency_hist::max_bucket 1279
+system.ruby.LD.Directory.hit_type_mach_latency_hist::samples 175
+system.ruby.LD.Directory.hit_type_mach_latency_hist::mean 260.394286
+system.ruby.LD.Directory.hit_type_mach_latency_hist::gmean 258.339713
+system.ruby.LD.Directory.hit_type_mach_latency_hist::stdev 42.039376
+system.ruby.LD.Directory.hit_type_mach_latency_hist | 0 0.00% 0.00% | 90 51.43% 51.43% | 84 48.00% 99.43% | 0 0.00% 99.43% | 0 0.00% 99.43% | 1 0.57% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.LD.Directory.hit_type_mach_latency_hist::total 175
+system.ruby.ST.L1Cache.miss_type_mach_latency_hist::bucket_size 1
+system.ruby.ST.L1Cache.miss_type_mach_latency_hist::max_bucket 9
+system.ruby.ST.L1Cache.miss_type_mach_latency_hist::samples 10090
+system.ruby.ST.L1Cache.miss_type_mach_latency_hist::mean 1
+system.ruby.ST.L1Cache.miss_type_mach_latency_hist::gmean 1
+system.ruby.ST.L1Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 10090 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.ST.L1Cache.miss_type_mach_latency_hist::total 10090
+system.ruby.ST.Directory.hit_type_mach_latency_hist::bucket_size 128
+system.ruby.ST.Directory.hit_type_mach_latency_hist::max_bucket 1279
+system.ruby.ST.Directory.hit_type_mach_latency_hist::samples 322
+system.ruby.ST.Directory.hit_type_mach_latency_hist::mean 254.509317
+system.ruby.ST.Directory.hit_type_mach_latency_hist::gmean 250.282441
+system.ruby.ST.Directory.hit_type_mach_latency_hist::stdev 65.931487
+system.ruby.ST.Directory.hit_type_mach_latency_hist | 0 0.00% 0.00% | 254 78.88% 78.88% | 62 19.25% 98.14% | 0 0.00% 98.14% | 1 0.31% 98.45% | 4 1.24% 99.69% | 1 0.31% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.ST.Directory.hit_type_mach_latency_hist::total 322
+system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist::bucket_size 1
+system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist::max_bucket 9
+system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist::samples 86007
+system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist::mean 1
+system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist::gmean 1
+system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 86007 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist::total 86007
+system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist::bucket_size 2
+system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist::max_bucket 19
+system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist::samples 54
+system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist::mean 19
+system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist::gmean 19.000000
+system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 54 100.00% 100.00%
+system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist::total 54
+system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::bucket_size 128
+system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::max_bucket 1279
+system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::samples 1034
+system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::mean 254.218569
+system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::gmean 250.716467
+system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::stdev 57.514968
+system.ruby.IFETCH.Directory.hit_type_mach_latency_hist | 0 0.00% 0.00% | 790 76.40% 76.40% | 224 21.66% 98.07% | 3 0.29% 98.36% | 7 0.68% 99.03% | 9 0.87% 99.90% | 1 0.10% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::total 1034
+system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist::bucket_size 1
+system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist::max_bucket 9
+system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist::samples 337
+system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist::mean 1
+system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist::gmean 1
+system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 337 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist::total 337
+system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::bucket_size 32
+system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::max_bucket 319
+system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::samples 4
+system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::mean 266.500000
+system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::gmean 265.077347
+system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::stdev 31.754265
+system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 2 50.00% 50.00% | 0 0.00% 50.00% | 2 50.00% 100.00%
+system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::total 4
+system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist::bucket_size 1
+system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist::max_bucket 9
+system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist::samples 10
+system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist::mean 1
+system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist::gmean 1
+system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 10 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist::total 10
+system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist::bucket_size 1
+system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist::max_bucket 9
+system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist::samples 10
+system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist::mean 1
+system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist::gmean 1
+system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 10 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist::total 10
+system.ruby.CorePair_Controller.C0_Load_L1miss 180 0.00% 0.00%
+system.ruby.CorePair_Controller.C0_Load_L1hit 16155 0.00% 0.00%
+system.ruby.CorePair_Controller.Ifetch0_L1hit 86007 0.00% 0.00%
+system.ruby.CorePair_Controller.Ifetch0_L1miss 1088 0.00% 0.00%
+system.ruby.CorePair_Controller.C0_Store_L1miss 325 0.00% 0.00%
+system.ruby.CorePair_Controller.C0_Store_L1hit 10448 0.00% 0.00%
+system.ruby.CorePair_Controller.NB_AckS 1034 0.00% 0.00%
+system.ruby.CorePair_Controller.NB_AckM 326 0.00% 0.00%
+system.ruby.CorePair_Controller.NB_AckE 175 0.00% 0.00%
+system.ruby.CorePair_Controller.L1I_Repl 589 0.00% 0.00%
+system.ruby.CorePair_Controller.L1D0_Repl 24 0.00% 0.00%
+system.ruby.CorePair_Controller.L2_to_L1D0 5 0.00% 0.00%
+system.ruby.CorePair_Controller.L2_to_L1I 54 0.00% 0.00%
+system.ruby.CorePair_Controller.PrbInvData 18 0.00% 0.00%
+system.ruby.CorePair_Controller.PrbShrData 7 0.00% 0.00%
+system.ruby.CorePair_Controller.I.C0_Load_L1miss 175 0.00% 0.00%
+system.ruby.CorePair_Controller.I.Ifetch0_L1miss 1034 0.00% 0.00%
+system.ruby.CorePair_Controller.I.C0_Store_L1miss 325 0.00% 0.00%
+system.ruby.CorePair_Controller.I.PrbInvData 17 0.00% 0.00%
+system.ruby.CorePair_Controller.I.PrbShrData 5 0.00% 0.00%
+system.ruby.CorePair_Controller.S.Ifetch0_L1hit 86007 0.00% 0.00%
+system.ruby.CorePair_Controller.S.Ifetch0_L1miss 54 0.00% 0.00%
+system.ruby.CorePair_Controller.S.L1I_Repl 589 0.00% 0.00%
+system.ruby.CorePair_Controller.E0.C0_Load_L1miss 2 0.00% 0.00%
+system.ruby.CorePair_Controller.E0.C0_Load_L1hit 3356 0.00% 0.00%
+system.ruby.CorePair_Controller.E0.C0_Store_L1hit 46 0.00% 0.00%
+system.ruby.CorePair_Controller.E0.L1D0_Repl 16 0.00% 0.00%
+system.ruby.CorePair_Controller.E0.PrbShrData 1 0.00% 0.00%
+system.ruby.CorePair_Controller.O.C0_Load_L1hit 3 0.00% 0.00%
+system.ruby.CorePair_Controller.O.C0_Store_L1hit 1 0.00% 0.00%
+system.ruby.CorePair_Controller.M0.C0_Load_L1miss 3 0.00% 0.00%
+system.ruby.CorePair_Controller.M0.C0_Load_L1hit 12796 0.00% 0.00%
+system.ruby.CorePair_Controller.M0.C0_Store_L1hit 10401 0.00% 0.00%
+system.ruby.CorePair_Controller.M0.L1D0_Repl 8 0.00% 0.00%
+system.ruby.CorePair_Controller.M0.PrbInvData 1 0.00% 0.00%
+system.ruby.CorePair_Controller.M0.PrbShrData 1 0.00% 0.00%
+system.ruby.CorePair_Controller.I_M0.NB_AckM 325 0.00% 0.00%
+system.ruby.CorePair_Controller.I_E0S.NB_AckE 175 0.00% 0.00%
+system.ruby.CorePair_Controller.Si_F0.L2_to_L1I 54 0.00% 0.00%
+system.ruby.CorePair_Controller.O_M0.NB_AckM 1 0.00% 0.00%
+system.ruby.CorePair_Controller.S0.NB_AckS 1034 0.00% 0.00%
+system.ruby.CorePair_Controller.E0_F.L2_to_L1D0 2 0.00% 0.00%
+system.ruby.CorePair_Controller.M0_F.L2_to_L1D0 3 0.00% 0.00%
+system.ruby.Directory_Controller.RdBlkS 1034 0.00% 0.00%
+system.ruby.Directory_Controller.RdBlkM 326 0.00% 0.00%
+system.ruby.Directory_Controller.RdBlk 182 0.00% 0.00%
+system.ruby.Directory_Controller.WriteThrough 16 0.00% 0.00%
+system.ruby.Directory_Controller.Atomic 3 0.00% 0.00%
+system.ruby.Directory_Controller.CPUPrbResp 1560 0.00% 0.00%
+system.ruby.Directory_Controller.ProbeAcksComplete 1560 0.00% 0.00%
+system.ruby.Directory_Controller.MemData 1560 0.00% 0.00%
+system.ruby.Directory_Controller.CoreUnblock 1541 0.00% 0.00%
+system.ruby.Directory_Controller.UnblockWriteThrough 18 0.00% 0.00%
+system.ruby.Directory_Controller.U.RdBlkS 1034 0.00% 0.00%
+system.ruby.Directory_Controller.U.RdBlkM 326 0.00% 0.00%
+system.ruby.Directory_Controller.U.RdBlk 182 0.00% 0.00%
+system.ruby.Directory_Controller.U.WriteThrough 16 0.00% 0.00%
+system.ruby.Directory_Controller.U.Atomic 2 0.00% 0.00%
+system.ruby.Directory_Controller.BS_M.MemData 1034 0.00% 0.00%
+system.ruby.Directory_Controller.BM_M.MemData 326 0.00% 0.00%
+system.ruby.Directory_Controller.B_M.MemData 175 0.00% 0.00%
+system.ruby.Directory_Controller.BS_PM.CPUPrbResp 1034 0.00% 0.00%
+system.ruby.Directory_Controller.BS_PM.ProbeAcksComplete 1034 0.00% 0.00%
+system.ruby.Directory_Controller.BM_PM.Atomic 1 0.00% 0.00%
+system.ruby.Directory_Controller.BM_PM.CPUPrbResp 326 0.00% 0.00%
+system.ruby.Directory_Controller.BM_PM.ProbeAcksComplete 326 0.00% 0.00%
+system.ruby.Directory_Controller.BM_PM.MemData 18 0.00% 0.00%
+system.ruby.Directory_Controller.B_PM.CPUPrbResp 175 0.00% 0.00%
+system.ruby.Directory_Controller.B_PM.ProbeAcksComplete 175 0.00% 0.00%
+system.ruby.Directory_Controller.B_PM.MemData 7 0.00% 0.00%
+system.ruby.Directory_Controller.BM_Pm.CPUPrbResp 18 0.00% 0.00%
+system.ruby.Directory_Controller.BM_Pm.ProbeAcksComplete 18 0.00% 0.00%
+system.ruby.Directory_Controller.B_Pm.CPUPrbResp 7 0.00% 0.00%
+system.ruby.Directory_Controller.B_Pm.ProbeAcksComplete 7 0.00% 0.00%
+system.ruby.Directory_Controller.B.CoreUnblock 1541 0.00% 0.00%
+system.ruby.Directory_Controller.B.UnblockWriteThrough 18 0.00% 0.00%
+system.ruby.SQC_Controller.Fetch 86 0.00% 0.00%
+system.ruby.SQC_Controller.Data 5 0.00% 0.00%
+system.ruby.SQC_Controller.I.Fetch 5 0.00% 0.00%
+system.ruby.SQC_Controller.I.Data 5 0.00% 0.00%
+system.ruby.SQC_Controller.V.Fetch 81 0.00% 0.00%
+system.ruby.TCC_Controller.RdBlk 9 0.00% 0.00%
+system.ruby.TCC_Controller.WrVicBlk 16 0.00% 0.00%
+system.ruby.TCC_Controller.Atomic 2 0.00% 0.00%
+system.ruby.TCC_Controller.AtomicDone 1 0.00% 0.00%
+system.ruby.TCC_Controller.Data 9 0.00% 0.00%
+system.ruby.TCC_Controller.PrbInv 1535 0.00% 0.00%
+system.ruby.TCC_Controller.WBAck 16 0.00% 0.00%
+system.ruby.TCC_Controller.V.PrbInv 1 0.00% 0.00%
+system.ruby.TCC_Controller.I.RdBlk 7 0.00% 0.00%
+system.ruby.TCC_Controller.I.WrVicBlk 16 0.00% 0.00%
+system.ruby.TCC_Controller.I.Atomic 1 0.00% 0.00%
+system.ruby.TCC_Controller.I.PrbInv 1534 0.00% 0.00%
+system.ruby.TCC_Controller.I.WBAck 16 0.00% 0.00%
+system.ruby.TCC_Controller.IV.RdBlk 2 0.00% 0.00%
+system.ruby.TCC_Controller.IV.Data 7 0.00% 0.00%
+system.ruby.TCC_Controller.A.Atomic 1 0.00% 0.00%
+system.ruby.TCC_Controller.A.AtomicDone 1 0.00% 0.00%
+system.ruby.TCC_Controller.A.Data 2 0.00% 0.00%
+system.ruby.TCP_Controller.Load | 5 50.00% 50.00% | 5 50.00% 100.00%
+system.ruby.TCP_Controller.Load::total 10
+system.ruby.TCP_Controller.StoreThrough | 8 50.00% 50.00% | 8 50.00% 100.00%
+system.ruby.TCP_Controller.StoreThrough::total 16
+system.ruby.TCP_Controller.Atomic | 1 50.00% 50.00% | 1 50.00% 100.00%
+system.ruby.TCP_Controller.Atomic::total 2
+system.ruby.TCP_Controller.Flush | 768 50.00% 50.00% | 768 50.00% 100.00%
+system.ruby.TCP_Controller.Flush::total 1536
+system.ruby.TCP_Controller.Evict | 512 50.00% 50.00% | 512 50.00% 100.00%
+system.ruby.TCP_Controller.Evict::total 1024
+system.ruby.TCP_Controller.TCC_Ack | 3 50.00% 50.00% | 3 50.00% 100.00%
+system.ruby.TCP_Controller.TCC_Ack::total 6
+system.ruby.TCP_Controller.TCC_AckWB | 8 50.00% 50.00% | 8 50.00% 100.00%
+system.ruby.TCP_Controller.TCC_AckWB::total 16
+system.ruby.TCP_Controller.I.Load | 2 50.00% 50.00% | 2 50.00% 100.00%
+system.ruby.TCP_Controller.I.Load::total 4
+system.ruby.TCP_Controller.I.StoreThrough | 8 50.00% 50.00% | 8 50.00% 100.00%
+system.ruby.TCP_Controller.I.StoreThrough::total 16
+system.ruby.TCP_Controller.I.Atomic | 1 50.00% 50.00% | 1 50.00% 100.00%
+system.ruby.TCP_Controller.I.Atomic::total 2
+system.ruby.TCP_Controller.I.Flush | 766 50.00% 50.00% | 766 50.00% 100.00%
+system.ruby.TCP_Controller.I.Flush::total 1532
+system.ruby.TCP_Controller.I.Evict | 510 50.00% 50.00% | 510 50.00% 100.00%
+system.ruby.TCP_Controller.I.Evict::total 1020
+system.ruby.TCP_Controller.I.TCC_Ack | 2 50.00% 50.00% | 2 50.00% 100.00%
+system.ruby.TCP_Controller.I.TCC_Ack::total 4
+system.ruby.TCP_Controller.I.TCC_AckWB | 8 50.00% 50.00% | 8 50.00% 100.00%
+system.ruby.TCP_Controller.I.TCC_AckWB::total 16
+system.ruby.TCP_Controller.V.Load | 3 50.00% 50.00% | 3 50.00% 100.00%
+system.ruby.TCP_Controller.V.Load::total 6
+system.ruby.TCP_Controller.V.Flush | 2 50.00% 50.00% | 2 50.00% 100.00%
+system.ruby.TCP_Controller.V.Flush::total 4
+system.ruby.TCP_Controller.V.Evict | 2 50.00% 50.00% | 2 50.00% 100.00%
+system.ruby.TCP_Controller.V.Evict::total 4
+system.ruby.TCP_Controller.A.TCC_Ack | 1 50.00% 50.00% | 1 50.00% 100.00%
+system.ruby.TCP_Controller.A.TCC_Ack::total 2
+
+---------- End Simulation Statistics ----------
diff --git a/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Baseline/config.ini b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Baseline/config.ini
new file mode 100644
index 000000000..b3fabf81b
--- /dev/null
+++ b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Baseline/config.ini
@@ -0,0 +1,4089 @@
+[root]
+type=Root
+children=system
+eventq_index=0
+full_system=false
+sim_quantum=0
+time_sync_enable=false
+time_sync_period=100000000000
+time_sync_spin_threshold=100000000
+
+[system]
+type=System
+children=clk_domain cp_cntrl0 cpu0 cpu1 cpu2 dir_cntrl0 dispatcher_coalescer dispatcher_tlb dvfs_handler l1_coalescer0 l1_coalescer1 l1_tlb0 l1_tlb1 l2_coalescer l2_tlb l3_coalescer l3_tlb mem_ctrls piobus ruby sqc_cntrl0 sqc_coalescer sqc_tlb sys_port_proxy tcc_cntrl0 tcp_cntrl0 tcp_cntrl1 voltage_domain
+boot_osflags=a
+cache_line_size=64
+clk_domain=system.clk_domain
+eventq_index=0
+exit_on_work_items=false
+init_param=0
+kernel=
+kernel_addr_check=true
+load_addr_mask=1099511627775
+load_offset=0
+mem_mode=timing
+mem_ranges=0:536870911
+memories=system.mem_ctrls system.ruby.phys_mem
+mmap_using_noreserve=false
+multi_thread=false
+num_work_ids=16
+readfile=
+symbolfile=
+work_begin_ckpt_count=0
+work_begin_cpu_id_exit=-1
+work_begin_exit_count=0
+work_cpus_ckpt_count=0
+work_end_ckpt_count=0
+work_end_exit_count=0
+work_item_id=-1
+system_port=system.sys_port_proxy.slave[0]
+
+[system.clk_domain]
+type=SrcClockDomain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.voltage_domain
+
+[system.cp_cntrl0]
+type=CorePair_Controller
+children=L1D0cache L1D1cache L1Icache L2cache mandatoryQueue probeToCore requestFromCore responseFromCore responseToCore sequencer sequencer1 triggerQueue unblockFromCore
+L1D0cache=system.cp_cntrl0.L1D0cache
+L1D1cache=system.cp_cntrl0.L1D1cache
+L1Icache=system.cp_cntrl0.L1Icache
+L2cache=system.cp_cntrl0.L2cache
+buffer_size=0
+clk_domain=system.clk_domain
+cluster_id=0
+eventq_index=0
+issue_latency=120
+l2_hit_latency=18
+mandatoryQueue=system.cp_cntrl0.mandatoryQueue
+number_of_TBEs=256
+probeToCore=system.cp_cntrl0.probeToCore
+recycle_latency=10
+requestFromCore=system.cp_cntrl0.requestFromCore
+responseFromCore=system.cp_cntrl0.responseFromCore
+responseToCore=system.cp_cntrl0.responseToCore
+ruby_system=system.ruby
+send_evictions=true
+sequencer=system.cp_cntrl0.sequencer
+sequencer1=system.cp_cntrl0.sequencer1
+system=system
+transitions_per_cycle=32
+triggerQueue=system.cp_cntrl0.triggerQueue
+unblockFromCore=system.cp_cntrl0.unblockFromCore
+version=0
+
+[system.cp_cntrl0.L1D0cache]
+type=RubyCache
+children=replacement_policy
+assoc=2
+block_size=0
+dataAccessLatency=1
+dataArrayBanks=2
+eventq_index=0
+is_icache=false
+replacement_policy=system.cp_cntrl0.L1D0cache.replacement_policy
+resourceStalls=false
+ruby_system=system.ruby
+size=65536
+start_index_bit=6
+tagAccessLatency=1
+tagArrayBanks=2
+
+[system.cp_cntrl0.L1D0cache.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=2
+block_size=64
+eventq_index=0
+size=65536
+
+[system.cp_cntrl0.L1D1cache]
+type=RubyCache
+children=replacement_policy
+assoc=2
+block_size=0
+dataAccessLatency=1
+dataArrayBanks=2
+eventq_index=0
+is_icache=false
+replacement_policy=system.cp_cntrl0.L1D1cache.replacement_policy
+resourceStalls=false
+ruby_system=system.ruby
+size=65536
+start_index_bit=6
+tagAccessLatency=1
+tagArrayBanks=2
+
+[system.cp_cntrl0.L1D1cache.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=2
+block_size=64
+eventq_index=0
+size=65536
+
+[system.cp_cntrl0.L1Icache]
+type=RubyCache
+children=replacement_policy
+assoc=2
+block_size=0
+dataAccessLatency=1
+dataArrayBanks=2
+eventq_index=0
+is_icache=false
+replacement_policy=system.cp_cntrl0.L1Icache.replacement_policy
+resourceStalls=false
+ruby_system=system.ruby
+size=32768
+start_index_bit=6
+tagAccessLatency=1
+tagArrayBanks=2
+
+[system.cp_cntrl0.L1Icache.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=2
+block_size=64
+eventq_index=0
+size=32768
+
+[system.cp_cntrl0.L2cache]
+type=RubyCache
+children=replacement_policy
+assoc=8
+block_size=0
+dataAccessLatency=1
+dataArrayBanks=16
+eventq_index=0
+is_icache=false
+replacement_policy=system.cp_cntrl0.L2cache.replacement_policy
+resourceStalls=false
+ruby_system=system.ruby
+size=2097152
+start_index_bit=6
+tagAccessLatency=1
+tagArrayBanks=16
+
+[system.cp_cntrl0.L2cache.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=8
+block_size=64
+eventq_index=0
+size=2097152
+
+[system.cp_cntrl0.mandatoryQueue]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+
+[system.cp_cntrl0.probeToCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+slave=system.ruby.network.master[3]
+
+[system.cp_cntrl0.requestFromCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+master=system.ruby.network.slave[2]
+
+[system.cp_cntrl0.responseFromCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+master=system.ruby.network.slave[3]
+
+[system.cp_cntrl0.responseToCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+slave=system.ruby.network.master[4]
+
+[system.cp_cntrl0.sequencer]
+type=RubySequencer
+clk_domain=system.clk_domain
+coreid=0
+dcache=system.cp_cntrl0.L1D0cache
+dcache_hit_latency=1
+deadlock_threshold=500000
+eventq_index=0
+icache=system.cp_cntrl0.L1Icache
+icache_hit_latency=1
+is_cpu_sequencer=true
+max_outstanding_requests=16
+no_retry_on_stall=false
+ruby_system=system.ruby
+support_data_reqs=true
+support_inst_reqs=true
+system=system
+using_network_tester=false
+using_ruby_tester=false
+version=0
+master=system.cpu0.interrupts.pio system.cpu0.interrupts.int_slave
+mem_master_port=system.piobus.slave[0]
+slave=system.cpu0.icache_port system.cpu0.dcache_port system.cpu0.itb.walker.port system.cpu0.dtb.walker.port system.cpu0.interrupts.int_master
+
+[system.cp_cntrl0.sequencer1]
+type=RubySequencer
+clk_domain=system.clk_domain
+coreid=1
+dcache=system.cp_cntrl0.L1D1cache
+dcache_hit_latency=1
+deadlock_threshold=500000
+eventq_index=0
+icache=system.cp_cntrl0.L1Icache
+icache_hit_latency=1
+is_cpu_sequencer=true
+max_outstanding_requests=16
+no_retry_on_stall=false
+ruby_system=system.ruby
+support_data_reqs=true
+support_inst_reqs=true
+system=system
+using_network_tester=false
+using_ruby_tester=false
+version=1
+
+[system.cp_cntrl0.triggerQueue]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.cp_cntrl0.unblockFromCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+master=system.ruby.network.slave[4]
+
+[system.cpu0]
+type=TimingSimpleCPU
+children=apic_clk_domain clk_domain dtb interrupts isa itb tracer workload
+branchPred=Null
+checker=Null
+clk_domain=system.cpu0.clk_domain
+cpu_id=0
+do_checkpoint_insts=true
+do_quiesce=true
+do_statistics_insts=true
+dtb=system.cpu0.dtb
+eventq_index=0
+function_trace=false
+function_trace_start=0
+interrupts=system.cpu0.interrupts
+isa=system.cpu0.isa
+itb=system.cpu0.itb
+max_insts_all_threads=0
+max_insts_any_thread=0
+max_loads_all_threads=0
+max_loads_any_thread=0
+numThreads=1
+profile=0
+progress_interval=0
+simpoint_start_insts=
+socket_id=0
+switched_out=false
+system=system
+tracer=system.cpu0.tracer
+workload=system.cpu0.workload
+dcache_port=system.cp_cntrl0.sequencer.slave[1]
+icache_port=system.cp_cntrl0.sequencer.slave[0]
+
+[system.cpu0.apic_clk_domain]
+type=DerivedClockDomain
+clk_divider=16
+clk_domain=system.cpu0.clk_domain
+eventq_index=0
+
+[system.cpu0.clk_domain]
+type=SrcClockDomain
+clock=500
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.voltage_domain
+
+[system.cpu0.dtb]
+type=X86TLB
+children=walker
+eventq_index=0
+size=64
+walker=system.cpu0.dtb.walker
+
+[system.cpu0.dtb.walker]
+type=X86PagetableWalker
+clk_domain=system.cpu0.clk_domain
+eventq_index=0
+num_squash_per_cycle=4
+system=system
+port=system.cp_cntrl0.sequencer.slave[3]
+
+[system.cpu0.interrupts]
+type=X86LocalApic
+clk_domain=system.cpu0.apic_clk_domain
+eventq_index=0
+int_latency=1000
+pio_addr=2305843009213693952
+pio_latency=100000
+system=system
+int_master=system.cp_cntrl0.sequencer.slave[4]
+int_slave=system.cp_cntrl0.sequencer.master[1]
+pio=system.cp_cntrl0.sequencer.master[0]
+
+[system.cpu0.isa]
+type=X86ISA
+eventq_index=0
+
+[system.cpu0.itb]
+type=X86TLB
+children=walker
+eventq_index=0
+size=64
+walker=system.cpu0.itb.walker
+
+[system.cpu0.itb.walker]
+type=X86PagetableWalker
+clk_domain=system.cpu0.clk_domain
+eventq_index=0
+num_squash_per_cycle=4
+system=system
+port=system.cp_cntrl0.sequencer.slave[2]
+
+[system.cpu0.tracer]
+type=ExeTracer
+eventq_index=0
+
+[system.cpu0.workload]
+type=LiveProcess
+cmd=gpu-hello
+cwd=
+drivers=system.cpu2.cl_driver
+egid=100
+env=
+errout=cerr
+euid=100
+eventq_index=0
+executable=/dist/m5/regression/test-progs/gpu-hello/bin/x86/linux/gpu-hello
+gid=100
+input=cin
+kvmInSE=false
+max_stack_size=67108864
+output=cout
+pid=100
+ppid=99
+simpoint=0
+system=system
+uid=100
+useArchPT=false
+
+[system.cpu1]
+type=Shader
+children=CUs0 CUs1 clk_domain
+CUs=system.cpu1.CUs0 system.cpu1.CUs1
+clk_domain=system.cpu1.clk_domain
+cpu_pointer=system.cpu0
+eventq_index=0
+globalmem=65536
+impl_kern_boundary_sync=true
+n_wf=8
+separate_acquire_release=false
+timing=true
+translation=false
+
+[system.cpu1.CUs0]
+type=ComputeUnit
+children=ldsBus localDataStore vector_register_file0 vector_register_file1 vector_register_file2 vector_register_file3 wavefronts00 wavefronts01 wavefronts02 wavefronts03 wavefronts04 wavefronts05 wavefronts06 wavefronts07 wavefronts08 wavefronts09 wavefronts10 wavefronts11 wavefronts12 wavefronts13 wavefronts14 wavefronts15 wavefronts16 wavefronts17 wavefronts18 wavefronts19 wavefronts20 wavefronts21 wavefronts22 wavefronts23 wavefronts24 wavefronts25 wavefronts26 wavefronts27 wavefronts28 wavefronts29 wavefronts30 wavefronts31
+clk_domain=system.cpu1.clk_domain
+coalescer_to_vrf_bus_width=32
+countPages=false
+cu_id=0
+debugSegFault=false
+dpbypass_pipe_length=4
+eventq_index=0
+execPolicy=OLDEST-FIRST
+functionalTLB=true
+global_mem_queue_size=256
+issue_period=4
+localDataStore=system.cpu1.CUs0.localDataStore
+localMemBarrier=false
+local_mem_queue_size=256
+mem_req_latency=9
+mem_resp_latency=9
+n_wf=8
+num_SIMDs=4
+num_global_mem_pipes=1
+num_shared_mem_pipes=1
+perLaneTLB=false
+prefetch_depth=0
+prefetch_prev_type=PF_PHASE
+prefetch_stride=1
+spbypass_pipe_length=4
+system=system
+vector_register_file=system.cpu1.CUs0.vector_register_file0 system.cpu1.CUs0.vector_register_file1 system.cpu1.CUs0.vector_register_file2 system.cpu1.CUs0.vector_register_file3
+vrf_to_coalescer_bus_width=32
+wavefronts=system.cpu1.CUs0.wavefronts00 system.cpu1.CUs0.wavefronts01 system.cpu1.CUs0.wavefronts02 system.cpu1.CUs0.wavefronts03 system.cpu1.CUs0.wavefronts04 system.cpu1.CUs0.wavefronts05 system.cpu1.CUs0.wavefronts06 system.cpu1.CUs0.wavefronts07 system.cpu1.CUs0.wavefronts08 system.cpu1.CUs0.wavefronts09 system.cpu1.CUs0.wavefronts10 system.cpu1.CUs0.wavefronts11 system.cpu1.CUs0.wavefronts12 system.cpu1.CUs0.wavefronts13 system.cpu1.CUs0.wavefronts14 system.cpu1.CUs0.wavefronts15 system.cpu1.CUs0.wavefronts16 system.cpu1.CUs0.wavefronts17 system.cpu1.CUs0.wavefronts18 system.cpu1.CUs0.wavefronts19 system.cpu1.CUs0.wavefronts20 system.cpu1.CUs0.wavefronts21 system.cpu1.CUs0.wavefronts22 system.cpu1.CUs0.wavefronts23 system.cpu1.CUs0.wavefronts24 system.cpu1.CUs0.wavefronts25 system.cpu1.CUs0.wavefronts26 system.cpu1.CUs0.wavefronts27 system.cpu1.CUs0.wavefronts28 system.cpu1.CUs0.wavefronts29 system.cpu1.CUs0.wavefronts30 system.cpu1.CUs0.wavefronts31
+wfSize=64
+xactCasMode=false
+ldsPort=system.cpu1.CUs0.ldsBus.slave
+memory_port=system.tcp_cntrl0.coalescer.slave[0] system.tcp_cntrl0.coalescer.slave[1] system.tcp_cntrl0.coalescer.slave[2] system.tcp_cntrl0.coalescer.slave[3] system.tcp_cntrl0.coalescer.slave[4] system.tcp_cntrl0.coalescer.slave[5] system.tcp_cntrl0.coalescer.slave[6] system.tcp_cntrl0.coalescer.slave[7] system.tcp_cntrl0.coalescer.slave[8] system.tcp_cntrl0.coalescer.slave[9] system.tcp_cntrl0.coalescer.slave[10] system.tcp_cntrl0.coalescer.slave[11] system.tcp_cntrl0.coalescer.slave[12] system.tcp_cntrl0.coalescer.slave[13] system.tcp_cntrl0.coalescer.slave[14] system.tcp_cntrl0.coalescer.slave[15] system.tcp_cntrl0.coalescer.slave[16] system.tcp_cntrl0.coalescer.slave[17] system.tcp_cntrl0.coalescer.slave[18] system.tcp_cntrl0.coalescer.slave[19] system.tcp_cntrl0.coalescer.slave[20] system.tcp_cntrl0.coalescer.slave[21] system.tcp_cntrl0.coalescer.slave[22] system.tcp_cntrl0.coalescer.slave[23] system.tcp_cntrl0.coalescer.slave[24] system.tcp_cntrl0.coalescer.slave[25] system.tcp_cntrl0.coalescer.slave[26] system.tcp_cntrl0.coalescer.slave[27] system.tcp_cntrl0.coalescer.slave[28] system.tcp_cntrl0.coalescer.slave[29] system.tcp_cntrl0.coalescer.slave[30] system.tcp_cntrl0.coalescer.slave[31] system.tcp_cntrl0.coalescer.slave[32] system.tcp_cntrl0.coalescer.slave[33] system.tcp_cntrl0.coalescer.slave[34] system.tcp_cntrl0.coalescer.slave[35] system.tcp_cntrl0.coalescer.slave[36] system.tcp_cntrl0.coalescer.slave[37] system.tcp_cntrl0.coalescer.slave[38] system.tcp_cntrl0.coalescer.slave[39] system.tcp_cntrl0.coalescer.slave[40] system.tcp_cntrl0.coalescer.slave[41] system.tcp_cntrl0.coalescer.slave[42] system.tcp_cntrl0.coalescer.slave[43] system.tcp_cntrl0.coalescer.slave[44] system.tcp_cntrl0.coalescer.slave[45] system.tcp_cntrl0.coalescer.slave[46] system.tcp_cntrl0.coalescer.slave[47] system.tcp_cntrl0.coalescer.slave[48] system.tcp_cntrl0.coalescer.slave[49] system.tcp_cntrl0.coalescer.slave[50] system.tcp_cntrl0.coalescer.slave[51] 
system.tcp_cntrl0.coalescer.slave[52] system.tcp_cntrl0.coalescer.slave[53] system.tcp_cntrl0.coalescer.slave[54] system.tcp_cntrl0.coalescer.slave[55] system.tcp_cntrl0.coalescer.slave[56] system.tcp_cntrl0.coalescer.slave[57] system.tcp_cntrl0.coalescer.slave[58] system.tcp_cntrl0.coalescer.slave[59] system.tcp_cntrl0.coalescer.slave[60] system.tcp_cntrl0.coalescer.slave[61] system.tcp_cntrl0.coalescer.slave[62] system.tcp_cntrl0.coalescer.slave[63]
+sqc_port=system.sqc_cntrl0.sequencer.slave[0]
+sqc_tlb_port=system.sqc_coalescer.slave[0]
+translation_port=system.l1_coalescer0.slave[0]
+
+[system.cpu1.CUs0.ldsBus]
+type=Bridge
+clk_domain=system.cpu1.clk_domain
+delay=0
+eventq_index=0
+ranges=0:18446744073709551615
+req_size=16
+resp_size=16
+master=system.cpu1.CUs0.localDataStore.cuPort
+slave=system.cpu1.CUs0.ldsPort
+
+[system.cpu1.CUs0.localDataStore]
+type=LdsState
+bankConflictPenalty=1
+banks=32
+clk_domain=system.cpu1.clk_domain
+eventq_index=0
+range=0:65535
+size=65536
+cuPort=system.cpu1.CUs0.ldsBus.master
+
+[system.cpu1.CUs0.vector_register_file0]
+type=VectorRegisterFile
+eventq_index=0
+min_alloc=4
+num_regs_per_simd=2048
+simd_id=0
+
+[system.cpu1.CUs0.vector_register_file1]
+type=VectorRegisterFile
+eventq_index=0
+min_alloc=4
+num_regs_per_simd=2048
+simd_id=1
+
+[system.cpu1.CUs0.vector_register_file2]
+type=VectorRegisterFile
+eventq_index=0
+min_alloc=4
+num_regs_per_simd=2048
+simd_id=2
+
+[system.cpu1.CUs0.vector_register_file3]
+type=VectorRegisterFile
+eventq_index=0
+min_alloc=4
+num_regs_per_simd=2048
+simd_id=3
+
+[system.cpu1.CUs0.wavefronts00]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=0
+
+[system.cpu1.CUs0.wavefronts01]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=1
+
+[system.cpu1.CUs0.wavefronts02]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=2
+
+[system.cpu1.CUs0.wavefronts03]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=3
+
+[system.cpu1.CUs0.wavefronts04]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=4
+
+[system.cpu1.CUs0.wavefronts05]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=5
+
+[system.cpu1.CUs0.wavefronts06]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=6
+
+[system.cpu1.CUs0.wavefronts07]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=7
+
+[system.cpu1.CUs0.wavefronts08]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=0
+
+[system.cpu1.CUs0.wavefronts09]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=1
+
+[system.cpu1.CUs0.wavefronts10]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=2
+
+[system.cpu1.CUs0.wavefronts11]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=3
+
+[system.cpu1.CUs0.wavefronts12]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=4
+
+[system.cpu1.CUs0.wavefronts13]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=5
+
+[system.cpu1.CUs0.wavefronts14]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=6
+
+[system.cpu1.CUs0.wavefronts15]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=7
+
+[system.cpu1.CUs0.wavefronts16]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=0
+
+[system.cpu1.CUs0.wavefronts17]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=1
+
+[system.cpu1.CUs0.wavefronts18]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=2
+
+[system.cpu1.CUs0.wavefronts19]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=3
+
+[system.cpu1.CUs0.wavefronts20]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=4
+
+[system.cpu1.CUs0.wavefronts21]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=5
+
+[system.cpu1.CUs0.wavefronts22]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=6
+
+[system.cpu1.CUs0.wavefronts23]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=7
+
+[system.cpu1.CUs0.wavefronts24]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=0
+
+[system.cpu1.CUs0.wavefronts25]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=1
+
+[system.cpu1.CUs0.wavefronts26]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=2
+
+[system.cpu1.CUs0.wavefronts27]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=3
+
+[system.cpu1.CUs0.wavefronts28]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=4
+
+[system.cpu1.CUs0.wavefronts29]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=5
+
+[system.cpu1.CUs0.wavefronts30]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=6
+
+[system.cpu1.CUs0.wavefronts31]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=7
+
+[system.cpu1.CUs1]
+type=ComputeUnit
+children=ldsBus localDataStore vector_register_file0 vector_register_file1 vector_register_file2 vector_register_file3 wavefronts00 wavefronts01 wavefronts02 wavefronts03 wavefronts04 wavefronts05 wavefronts06 wavefronts07 wavefronts08 wavefronts09 wavefronts10 wavefronts11 wavefronts12 wavefronts13 wavefronts14 wavefronts15 wavefronts16 wavefronts17 wavefronts18 wavefronts19 wavefronts20 wavefronts21 wavefronts22 wavefronts23 wavefronts24 wavefronts25 wavefronts26 wavefronts27 wavefronts28 wavefronts29 wavefronts30 wavefronts31
+clk_domain=system.cpu1.clk_domain
+coalescer_to_vrf_bus_width=32
+countPages=false
+cu_id=1
+debugSegFault=false
+dpbypass_pipe_length=4
+eventq_index=0
+execPolicy=OLDEST-FIRST
+functionalTLB=true
+global_mem_queue_size=256
+issue_period=4
+localDataStore=system.cpu1.CUs1.localDataStore
+localMemBarrier=false
+local_mem_queue_size=256
+mem_req_latency=9
+mem_resp_latency=9
+n_wf=8
+num_SIMDs=4
+num_global_mem_pipes=1
+num_shared_mem_pipes=1
+perLaneTLB=false
+prefetch_depth=0
+prefetch_prev_type=PF_PHASE
+prefetch_stride=1
+spbypass_pipe_length=4
+system=system
+vector_register_file=system.cpu1.CUs1.vector_register_file0 system.cpu1.CUs1.vector_register_file1 system.cpu1.CUs1.vector_register_file2 system.cpu1.CUs1.vector_register_file3
+vrf_to_coalescer_bus_width=32
+wavefronts=system.cpu1.CUs1.wavefronts00 system.cpu1.CUs1.wavefronts01 system.cpu1.CUs1.wavefronts02 system.cpu1.CUs1.wavefronts03 system.cpu1.CUs1.wavefronts04 system.cpu1.CUs1.wavefronts05 system.cpu1.CUs1.wavefronts06 system.cpu1.CUs1.wavefronts07 system.cpu1.CUs1.wavefronts08 system.cpu1.CUs1.wavefronts09 system.cpu1.CUs1.wavefronts10 system.cpu1.CUs1.wavefronts11 system.cpu1.CUs1.wavefronts12 system.cpu1.CUs1.wavefronts13 system.cpu1.CUs1.wavefronts14 system.cpu1.CUs1.wavefronts15 system.cpu1.CUs1.wavefronts16 system.cpu1.CUs1.wavefronts17 system.cpu1.CUs1.wavefronts18 system.cpu1.CUs1.wavefronts19 system.cpu1.CUs1.wavefronts20 system.cpu1.CUs1.wavefronts21 system.cpu1.CUs1.wavefronts22 system.cpu1.CUs1.wavefronts23 system.cpu1.CUs1.wavefronts24 system.cpu1.CUs1.wavefronts25 system.cpu1.CUs1.wavefronts26 system.cpu1.CUs1.wavefronts27 system.cpu1.CUs1.wavefronts28 system.cpu1.CUs1.wavefronts29 system.cpu1.CUs1.wavefronts30 system.cpu1.CUs1.wavefronts31
+wfSize=64
+xactCasMode=false
+ldsPort=system.cpu1.CUs1.ldsBus.slave
+memory_port=system.tcp_cntrl1.coalescer.slave[0] system.tcp_cntrl1.coalescer.slave[1] system.tcp_cntrl1.coalescer.slave[2] system.tcp_cntrl1.coalescer.slave[3] system.tcp_cntrl1.coalescer.slave[4] system.tcp_cntrl1.coalescer.slave[5] system.tcp_cntrl1.coalescer.slave[6] system.tcp_cntrl1.coalescer.slave[7] system.tcp_cntrl1.coalescer.slave[8] system.tcp_cntrl1.coalescer.slave[9] system.tcp_cntrl1.coalescer.slave[10] system.tcp_cntrl1.coalescer.slave[11] system.tcp_cntrl1.coalescer.slave[12] system.tcp_cntrl1.coalescer.slave[13] system.tcp_cntrl1.coalescer.slave[14] system.tcp_cntrl1.coalescer.slave[15] system.tcp_cntrl1.coalescer.slave[16] system.tcp_cntrl1.coalescer.slave[17] system.tcp_cntrl1.coalescer.slave[18] system.tcp_cntrl1.coalescer.slave[19] system.tcp_cntrl1.coalescer.slave[20] system.tcp_cntrl1.coalescer.slave[21] system.tcp_cntrl1.coalescer.slave[22] system.tcp_cntrl1.coalescer.slave[23] system.tcp_cntrl1.coalescer.slave[24] system.tcp_cntrl1.coalescer.slave[25] system.tcp_cntrl1.coalescer.slave[26] system.tcp_cntrl1.coalescer.slave[27] system.tcp_cntrl1.coalescer.slave[28] system.tcp_cntrl1.coalescer.slave[29] system.tcp_cntrl1.coalescer.slave[30] system.tcp_cntrl1.coalescer.slave[31] system.tcp_cntrl1.coalescer.slave[32] system.tcp_cntrl1.coalescer.slave[33] system.tcp_cntrl1.coalescer.slave[34] system.tcp_cntrl1.coalescer.slave[35] system.tcp_cntrl1.coalescer.slave[36] system.tcp_cntrl1.coalescer.slave[37] system.tcp_cntrl1.coalescer.slave[38] system.tcp_cntrl1.coalescer.slave[39] system.tcp_cntrl1.coalescer.slave[40] system.tcp_cntrl1.coalescer.slave[41] system.tcp_cntrl1.coalescer.slave[42] system.tcp_cntrl1.coalescer.slave[43] system.tcp_cntrl1.coalescer.slave[44] system.tcp_cntrl1.coalescer.slave[45] system.tcp_cntrl1.coalescer.slave[46] system.tcp_cntrl1.coalescer.slave[47] system.tcp_cntrl1.coalescer.slave[48] system.tcp_cntrl1.coalescer.slave[49] system.tcp_cntrl1.coalescer.slave[50] system.tcp_cntrl1.coalescer.slave[51] 
system.tcp_cntrl1.coalescer.slave[52] system.tcp_cntrl1.coalescer.slave[53] system.tcp_cntrl1.coalescer.slave[54] system.tcp_cntrl1.coalescer.slave[55] system.tcp_cntrl1.coalescer.slave[56] system.tcp_cntrl1.coalescer.slave[57] system.tcp_cntrl1.coalescer.slave[58] system.tcp_cntrl1.coalescer.slave[59] system.tcp_cntrl1.coalescer.slave[60] system.tcp_cntrl1.coalescer.slave[61] system.tcp_cntrl1.coalescer.slave[62] system.tcp_cntrl1.coalescer.slave[63]
+sqc_port=system.sqc_cntrl0.sequencer.slave[1]
+sqc_tlb_port=system.sqc_coalescer.slave[1]
+translation_port=system.l1_coalescer1.slave[0]
+
+[system.cpu1.CUs1.ldsBus]
+type=Bridge
+clk_domain=system.cpu1.clk_domain
+delay=0
+eventq_index=0
+ranges=0:18446744073709551615
+req_size=16
+resp_size=16
+master=system.cpu1.CUs1.localDataStore.cuPort
+slave=system.cpu1.CUs1.ldsPort
+
+[system.cpu1.CUs1.localDataStore]
+type=LdsState
+bankConflictPenalty=1
+banks=32
+clk_domain=system.cpu1.clk_domain
+eventq_index=0
+range=0:65535
+size=65536
+cuPort=system.cpu1.CUs1.ldsBus.master
+
+[system.cpu1.CUs1.vector_register_file0]
+type=VectorRegisterFile
+eventq_index=0
+min_alloc=4
+num_regs_per_simd=2048
+simd_id=0
+
+[system.cpu1.CUs1.vector_register_file1]
+type=VectorRegisterFile
+eventq_index=0
+min_alloc=4
+num_regs_per_simd=2048
+simd_id=1
+
+[system.cpu1.CUs1.vector_register_file2]
+type=VectorRegisterFile
+eventq_index=0
+min_alloc=4
+num_regs_per_simd=2048
+simd_id=2
+
+[system.cpu1.CUs1.vector_register_file3]
+type=VectorRegisterFile
+eventq_index=0
+min_alloc=4
+num_regs_per_simd=2048
+simd_id=3
+
+[system.cpu1.CUs1.wavefronts00]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=0
+
+[system.cpu1.CUs1.wavefronts01]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=1
+
+[system.cpu1.CUs1.wavefronts02]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=2
+
+[system.cpu1.CUs1.wavefronts03]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=3
+
+[system.cpu1.CUs1.wavefronts04]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=4
+
+[system.cpu1.CUs1.wavefronts05]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=5
+
+[system.cpu1.CUs1.wavefronts06]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=6
+
+[system.cpu1.CUs1.wavefronts07]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=7
+
+[system.cpu1.CUs1.wavefronts08]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=0
+
+[system.cpu1.CUs1.wavefronts09]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=1
+
+[system.cpu1.CUs1.wavefronts10]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=2
+
+[system.cpu1.CUs1.wavefronts11]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=3
+
+[system.cpu1.CUs1.wavefronts12]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=4
+
+[system.cpu1.CUs1.wavefronts13]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=5
+
+[system.cpu1.CUs1.wavefronts14]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=6
+
+[system.cpu1.CUs1.wavefronts15]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=7
+
+[system.cpu1.CUs1.wavefronts16]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=0
+
+[system.cpu1.CUs1.wavefronts17]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=1
+
+[system.cpu1.CUs1.wavefronts18]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=2
+
+[system.cpu1.CUs1.wavefronts19]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=3
+
+[system.cpu1.CUs1.wavefronts20]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=4
+
+[system.cpu1.CUs1.wavefronts21]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=5
+
+[system.cpu1.CUs1.wavefronts22]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=6
+
+[system.cpu1.CUs1.wavefronts23]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=7
+
+[system.cpu1.CUs1.wavefronts24]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=0
+
+[system.cpu1.CUs1.wavefronts25]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=1
+
+[system.cpu1.CUs1.wavefronts26]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=2
+
+[system.cpu1.CUs1.wavefronts27]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=3
+
+[system.cpu1.CUs1.wavefronts28]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=4
+
+[system.cpu1.CUs1.wavefronts29]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=5
+
+[system.cpu1.CUs1.wavefronts30]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=6
+
+[system.cpu1.CUs1.wavefronts31]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=7
+
+[system.cpu1.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.cpu1.clk_domain.voltage_domain
+
+[system.cpu1.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.cpu2]
+type=GpuDispatcher
+children=cl_driver
+cl_driver=system.cpu2.cl_driver
+clk_domain=system.clk_domain
+cpu=system.cpu0
+eventq_index=0
+pio_addr=8589934592
+pio_latency=1000
+shader_pointer=system.cpu1
+system=system
+dma=system.piobus.slave[1]
+pio=system.piobus.master[0]
+translation_port=system.dispatcher_coalescer.slave[0]
+
+[system.cpu2.cl_driver]
+type=ClDriver
+codefile=/dist/m5/regression/test-progs/gpu-hello/bin/x86/linux/gpu-hello-kernel.asm
+eventq_index=0
+filename=hsa
+
+[system.dir_cntrl0]
+type=Directory_Controller
+children=L3CacheMemory L3triggerQueue ProbeFilterMemory directory probeToCore requestFromCores responseFromCores responseFromMemory responseToCore triggerQueue unblockFromCores
+CAB_TCC=false
+L3CacheMemory=system.dir_cntrl0.L3CacheMemory
+L3triggerQueue=system.dir_cntrl0.L3triggerQueue
+ProbeFilterMemory=system.dir_cntrl0.ProbeFilterMemory
+TCC_select_num_bits=0
+buffer_size=0
+clk_domain=system.clk_domain
+cluster_id=0
+directory=system.dir_cntrl0.directory
+eventq_index=0
+inclusiveDir=true
+l3_hit_latency=15
+noTCCdir=true
+number_of_TBEs=2560
+probeToCore=system.dir_cntrl0.probeToCore
+recycle_latency=10
+requestFromCores=system.dir_cntrl0.requestFromCores
+responseFromCores=system.dir_cntrl0.responseFromCores
+responseFromMemory=system.dir_cntrl0.responseFromMemory
+responseToCore=system.dir_cntrl0.responseToCore
+response_latency=30
+ruby_system=system.ruby
+system=system
+to_memory_controller_latency=1
+transitions_per_cycle=32
+triggerQueue=system.dir_cntrl0.triggerQueue
+unblockFromCores=system.dir_cntrl0.unblockFromCores
+useL3OnWT=false
+version=0
+memory=system.mem_ctrls.port
+
+[system.dir_cntrl0.L3CacheMemory]
+type=RubyCache
+children=replacement_policy
+assoc=16
+block_size=0
+dataAccessLatency=20
+dataArrayBanks=16.0
+eventq_index=0
+is_icache=false
+replacement_policy=system.dir_cntrl0.L3CacheMemory.replacement_policy
+resourceStalls=false
+ruby_system=system.ruby
+size=16777216
+start_index_bit=6
+tagAccessLatency=15
+tagArrayBanks=16.0
+
+[system.dir_cntrl0.L3CacheMemory.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=16
+block_size=64
+eventq_index=0
+size=16777216
+
+[system.dir_cntrl0.L3triggerQueue]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.dir_cntrl0.ProbeFilterMemory]
+type=RubyCache
+children=replacement_policy
+assoc=8
+block_size=64
+dataAccessLatency=1
+dataArrayBanks=256
+eventq_index=0
+is_icache=false
+replacement_policy=system.dir_cntrl0.ProbeFilterMemory.replacement_policy
+resourceStalls=true
+ruby_system=system.ruby
+size=1048576
+start_index_bit=6
+tagAccessLatency=8
+tagArrayBanks=8
+
+[system.dir_cntrl0.ProbeFilterMemory.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=8
+block_size=64
+eventq_index=0
+size=1048576
+
+[system.dir_cntrl0.directory]
+type=RubyDirectoryMemory
+eventq_index=0
+numa_high_bit=5
+size=536870912
+version=0
+
+[system.dir_cntrl0.probeToCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+master=system.ruby.network.slave[0]
+
+[system.dir_cntrl0.requestFromCores]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[0]
+
+[system.dir_cntrl0.responseFromCores]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+slave=system.ruby.network.master[1]
+
+[system.dir_cntrl0.responseFromMemory]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+
+[system.dir_cntrl0.responseToCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+master=system.ruby.network.slave[1]
+
+[system.dir_cntrl0.triggerQueue]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.dir_cntrl0.unblockFromCores]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+slave=system.ruby.network.master[2]
+
+[system.dispatcher_coalescer]
+type=TLBCoalescer
+children=clk_domain
+clk_domain=system.dispatcher_coalescer.clk_domain
+coalescingWindow=1
+disableCoalescing=false
+eventq_index=0
+probesPerCycle=2
+master=system.dispatcher_tlb.slave[0]
+slave=system.cpu2.translation_port
+
+[system.dispatcher_coalescer.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.dispatcher_coalescer.clk_domain.voltage_domain
+
+[system.dispatcher_coalescer.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.dispatcher_tlb]
+type=X86GPUTLB
+children=clk_domain
+accessDistance=false
+allocationPolicy=true
+assoc=32
+clk_domain=system.dispatcher_tlb.clk_domain
+eventq_index=0
+hitLatency=1
+maxOutstandingReqs=64
+missLatency1=5
+missLatency2=750
+size=32
+master=system.l2_coalescer.slave[1]
+slave=system.dispatcher_coalescer.master[0]
+
+[system.dispatcher_tlb.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.dispatcher_tlb.clk_domain.voltage_domain
+
+[system.dispatcher_tlb.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.dvfs_handler]
+type=DVFSHandler
+domains=
+enable=false
+eventq_index=0
+sys_clk_domain=system.clk_domain
+transition_latency=100000000
+
+[system.l1_coalescer0]
+type=TLBCoalescer
+children=clk_domain
+clk_domain=system.l1_coalescer0.clk_domain
+coalescingWindow=1
+disableCoalescing=false
+eventq_index=0
+probesPerCycle=2
+master=system.l1_tlb0.slave[0]
+slave=system.cpu1.CUs0.translation_port[0]
+
+[system.l1_coalescer0.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.l1_coalescer0.clk_domain.voltage_domain
+
+[system.l1_coalescer0.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.l1_coalescer1]
+type=TLBCoalescer
+children=clk_domain
+clk_domain=system.l1_coalescer1.clk_domain
+coalescingWindow=1
+disableCoalescing=false
+eventq_index=0
+probesPerCycle=2
+master=system.l1_tlb1.slave[0]
+slave=system.cpu1.CUs1.translation_port[0]
+
+[system.l1_coalescer1.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.l1_coalescer1.clk_domain.voltage_domain
+
+[system.l1_coalescer1.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.l1_tlb0]
+type=X86GPUTLB
+children=clk_domain
+accessDistance=false
+allocationPolicy=true
+assoc=32
+clk_domain=system.l1_tlb0.clk_domain
+eventq_index=0
+hitLatency=1
+maxOutstandingReqs=64
+missLatency1=5
+missLatency2=750
+size=32
+master=system.l2_coalescer.slave[2]
+slave=system.l1_coalescer0.master[0]
+
+[system.l1_tlb0.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.l1_tlb0.clk_domain.voltage_domain
+
+[system.l1_tlb0.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.l1_tlb1]
+type=X86GPUTLB
+children=clk_domain
+accessDistance=false
+allocationPolicy=true
+assoc=32
+clk_domain=system.l1_tlb1.clk_domain
+eventq_index=0
+hitLatency=1
+maxOutstandingReqs=64
+missLatency1=5
+missLatency2=750
+size=32
+master=system.l2_coalescer.slave[3]
+slave=system.l1_coalescer1.master[0]
+
+[system.l1_tlb1.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.l1_tlb1.clk_domain.voltage_domain
+
+[system.l1_tlb1.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.l2_coalescer]
+type=TLBCoalescer
+children=clk_domain
+clk_domain=system.l2_coalescer.clk_domain
+coalescingWindow=1
+disableCoalescing=false
+eventq_index=0
+probesPerCycle=2
+master=system.l2_tlb.slave[0]
+slave=system.sqc_tlb.master[0] system.dispatcher_tlb.master[0] system.l1_tlb0.master[0] system.l1_tlb1.master[0]
+
+[system.l2_coalescer.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.l2_coalescer.clk_domain.voltage_domain
+
+[system.l2_coalescer.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.l2_tlb]
+type=X86GPUTLB
+children=clk_domain
+accessDistance=false
+allocationPolicy=true
+assoc=32
+clk_domain=system.l2_tlb.clk_domain
+eventq_index=0
+hitLatency=69
+maxOutstandingReqs=64
+missLatency1=5
+missLatency2=750
+size=4096
+master=system.l3_coalescer.slave[0]
+slave=system.l2_coalescer.master[0]
+
+[system.l2_tlb.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.l2_tlb.clk_domain.voltage_domain
+
+[system.l2_tlb.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.l3_coalescer]
+type=TLBCoalescer
+children=clk_domain
+clk_domain=system.l3_coalescer.clk_domain
+coalescingWindow=1
+disableCoalescing=false
+eventq_index=0
+probesPerCycle=2
+master=system.l3_tlb.slave[0]
+slave=system.l2_tlb.master[0]
+
+[system.l3_coalescer.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.l3_coalescer.clk_domain.voltage_domain
+
+[system.l3_coalescer.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.l3_tlb]
+type=X86GPUTLB
+children=clk_domain
+accessDistance=false
+allocationPolicy=true
+assoc=32
+clk_domain=system.l3_tlb.clk_domain
+eventq_index=0
+hitLatency=150
+maxOutstandingReqs=64
+missLatency1=5
+missLatency2=750
+size=8192
+slave=system.l3_coalescer.master[0]
+
+[system.l3_tlb.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.l3_tlb.clk_domain.voltage_domain
+
+[system.l3_tlb.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.mem_ctrls]
+type=DRAMCtrl
+IDD0=0.075000
+IDD02=0.000000
+IDD2N=0.050000
+IDD2N2=0.000000
+IDD2P0=0.000000
+IDD2P02=0.000000
+IDD2P1=0.000000
+IDD2P12=0.000000
+IDD3N=0.057000
+IDD3N2=0.000000
+IDD3P0=0.000000
+IDD3P02=0.000000
+IDD3P1=0.000000
+IDD3P12=0.000000
+IDD4R=0.187000
+IDD4R2=0.000000
+IDD4W=0.165000
+IDD4W2=0.000000
+IDD5=0.220000
+IDD52=0.000000
+IDD6=0.000000
+IDD62=0.000000
+VDD=1.500000
+VDD2=0.000000
+activation_limit=4
+addr_mapping=RoRaBaCoCh
+bank_groups_per_rank=0
+banks_per_rank=8
+burst_length=8
+channels=1
+clk_domain=system.clk_domain
+conf_table_reported=true
+device_bus_width=8
+device_rowbuffer_size=1024
+device_size=536870912
+devices_per_rank=8
+dll=true
+eventq_index=0
+in_addr_map=true
+max_accesses_per_row=16
+mem_sched_policy=frfcfs
+min_writes_per_switch=16
+null=false
+page_policy=open_adaptive
+range=0:536870911
+ranks_per_channel=2
+read_buffer_size=32
+static_backend_latency=10000
+static_frontend_latency=10000
+tBURST=5000
+tCCD_L=0
+tCK=1250
+tCL=13750
+tCS=2500
+tRAS=35000
+tRCD=13750
+tREFI=7800000
+tRFC=260000
+tRP=13750
+tRRD=6000
+tRRD_L=0
+tRTP=7500
+tRTW=2500
+tWR=15000
+tWTR=7500
+tXAW=30000
+tXP=0
+tXPDLL=0
+tXS=0
+tXSDLL=0
+write_buffer_size=64
+write_high_thresh_perc=85
+write_low_thresh_perc=50
+port=system.dir_cntrl0.memory
+
+[system.piobus]
+type=NoncoherentXBar
+clk_domain=system.clk_domain
+eventq_index=0
+forward_latency=0
+frontend_latency=0
+response_latency=0
+use_default_range=false
+width=32
+master=system.cpu2.pio
+slave=system.cp_cntrl0.sequencer.mem_master_port system.cpu2.dma
+
+[system.ruby]
+type=RubySystem
+children=clk_domain network phys_mem
+access_backing_store=true
+all_instructions=false
+block_size_bytes=64
+clk_domain=system.ruby.clk_domain
+eventq_index=0
+hot_lines=false
+memory_size_bits=48
+num_of_sequencers=5
+number_of_virtual_networks=10
+phys_mem=system.ruby.phys_mem
+randomization=false
+
+[system.ruby.clk_domain]
+type=SrcClockDomain
+clock=500
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.voltage_domain
+
+[system.ruby.network]
+type=SimpleNetwork
+children=ext_links0 ext_links1 ext_links2 ext_links3 ext_links4 ext_links5 int_link_buffers00 int_link_buffers01 int_link_buffers02 int_link_buffers03 int_link_buffers04 int_link_buffers05 int_link_buffers06 int_link_buffers07 int_link_buffers08 int_link_buffers09 int_link_buffers10 int_link_buffers11 int_link_buffers12 int_link_buffers13 int_link_buffers14 int_link_buffers15 int_link_buffers16 int_link_buffers17 int_link_buffers18 int_link_buffers19 int_link_buffers20 int_link_buffers21 int_link_buffers22 int_link_buffers23 int_link_buffers24 int_link_buffers25 int_link_buffers26 int_link_buffers27 int_link_buffers28 int_link_buffers29 int_link_buffers30 int_link_buffers31 int_link_buffers32 int_link_buffers33 int_link_buffers34 int_link_buffers35 int_link_buffers36 int_link_buffers37 int_link_buffers38 int_link_buffers39 int_links0 int_links1
+adaptive_routing=false
+buffer_size=0
+clk_domain=system.ruby.clk_domain
+control_msg_size=8
+endpoint_bandwidth=1000
+eventq_index=0
+ext_links=system.ruby.network.ext_links0 system.ruby.network.ext_links1 system.ruby.network.ext_links2 system.ruby.network.ext_links3 system.ruby.network.ext_links4 system.ruby.network.ext_links5
+int_link_buffers=system.ruby.network.int_link_buffers00 system.ruby.network.int_link_buffers01 system.ruby.network.int_link_buffers02 system.ruby.network.int_link_buffers03 system.ruby.network.int_link_buffers04 system.ruby.network.int_link_buffers05 system.ruby.network.int_link_buffers06 system.ruby.network.int_link_buffers07 system.ruby.network.int_link_buffers08 system.ruby.network.int_link_buffers09 system.ruby.network.int_link_buffers10 system.ruby.network.int_link_buffers11 system.ruby.network.int_link_buffers12 system.ruby.network.int_link_buffers13 system.ruby.network.int_link_buffers14 system.ruby.network.int_link_buffers15 system.ruby.network.int_link_buffers16 system.ruby.network.int_link_buffers17 system.ruby.network.int_link_buffers18 system.ruby.network.int_link_buffers19 system.ruby.network.int_link_buffers20 system.ruby.network.int_link_buffers21 system.ruby.network.int_link_buffers22 system.ruby.network.int_link_buffers23 system.ruby.network.int_link_buffers24 system.ruby.network.int_link_buffers25 system.ruby.network.int_link_buffers26 system.ruby.network.int_link_buffers27 system.ruby.network.int_link_buffers28 system.ruby.network.int_link_buffers29 system.ruby.network.int_link_buffers30 system.ruby.network.int_link_buffers31 system.ruby.network.int_link_buffers32 system.ruby.network.int_link_buffers33 system.ruby.network.int_link_buffers34 system.ruby.network.int_link_buffers35 system.ruby.network.int_link_buffers36 system.ruby.network.int_link_buffers37 system.ruby.network.int_link_buffers38 system.ruby.network.int_link_buffers39
+int_links=system.ruby.network.int_links0 system.ruby.network.int_links1
+netifs=
+number_of_virtual_networks=10
+routers=system.ruby.network.ext_links0.int_node system.ruby.network.ext_links1.int_node system.ruby.network.ext_links2.int_node
+ruby_system=system.ruby
+topology=Crossbar
+master=system.dir_cntrl0.requestFromCores.slave system.dir_cntrl0.responseFromCores.slave system.dir_cntrl0.unblockFromCores.slave system.cp_cntrl0.probeToCore.slave system.cp_cntrl0.responseToCore.slave system.tcp_cntrl0.probeToTCP.slave system.tcp_cntrl0.responseToTCP.slave system.tcp_cntrl1.probeToTCP.slave system.tcp_cntrl1.responseToTCP.slave system.sqc_cntrl0.probeToSQC.slave system.sqc_cntrl0.responseToSQC.slave system.tcc_cntrl0.requestFromTCP.slave system.tcc_cntrl0.probeFromNB.slave system.tcc_cntrl0.responseFromNB.slave
+slave=system.dir_cntrl0.probeToCore.master system.dir_cntrl0.responseToCore.master system.cp_cntrl0.requestFromCore.master system.cp_cntrl0.responseFromCore.master system.cp_cntrl0.unblockFromCore.master system.tcp_cntrl0.requestFromTCP.master system.tcp_cntrl0.responseFromTCP.master system.tcp_cntrl0.unblockFromCore.master system.tcp_cntrl1.requestFromTCP.master system.tcp_cntrl1.responseFromTCP.master system.tcp_cntrl1.unblockFromCore.master system.sqc_cntrl0.requestFromSQC.master system.tcc_cntrl0.responseToCore.master system.tcc_cntrl0.requestToNB.master system.tcc_cntrl0.responseToNB.master system.tcc_cntrl0.unblockToNB.master
+
+[system.ruby.network.ext_links0]
+type=SimpleExtLink
+children=int_node
+bandwidth_factor=32
+eventq_index=0
+ext_node=system.dir_cntrl0
+int_node=system.ruby.network.ext_links0.int_node
+latency=1
+link_id=0
+weight=1
+
+[system.ruby.network.ext_links0.int_node]
+type=Switch
+children=port_buffers00 port_buffers01 port_buffers02 port_buffers03 port_buffers04 port_buffers05 port_buffers06 port_buffers07 port_buffers08 port_buffers09 port_buffers10 port_buffers11 port_buffers12 port_buffers13 port_buffers14 port_buffers15 port_buffers16 port_buffers17 port_buffers18 port_buffers19 port_buffers20 port_buffers21 port_buffers22 port_buffers23 port_buffers24 port_buffers25 port_buffers26 port_buffers27 port_buffers28 port_buffers29 port_buffers30 port_buffers31 port_buffers32 port_buffers33 port_buffers34 port_buffers35 port_buffers36 port_buffers37 port_buffers38 port_buffers39 port_buffers40 port_buffers41 port_buffers42 port_buffers43 port_buffers44 port_buffers45 port_buffers46 port_buffers47 port_buffers48 port_buffers49 port_buffers50 port_buffers51 port_buffers52 port_buffers53 port_buffers54 port_buffers55 port_buffers56 port_buffers57 port_buffers58 port_buffers59 port_buffers60 port_buffers61 port_buffers62 port_buffers63 port_buffers64 port_buffers65 port_buffers66 port_buffers67 port_buffers68 port_buffers69 port_buffers70 port_buffers71 port_buffers72 port_buffers73 port_buffers74 port_buffers75 port_buffers76 port_buffers77 port_buffers78 port_buffers79
+clk_domain=system.ruby.clk_domain
+eventq_index=0
+port_buffers=system.ruby.network.ext_links0.int_node.port_buffers00 system.ruby.network.ext_links0.int_node.port_buffers01 system.ruby.network.ext_links0.int_node.port_buffers02 system.ruby.network.ext_links0.int_node.port_buffers03 system.ruby.network.ext_links0.int_node.port_buffers04 system.ruby.network.ext_links0.int_node.port_buffers05 system.ruby.network.ext_links0.int_node.port_buffers06 system.ruby.network.ext_links0.int_node.port_buffers07 system.ruby.network.ext_links0.int_node.port_buffers08 system.ruby.network.ext_links0.int_node.port_buffers09 system.ruby.network.ext_links0.int_node.port_buffers10 system.ruby.network.ext_links0.int_node.port_buffers11 system.ruby.network.ext_links0.int_node.port_buffers12 system.ruby.network.ext_links0.int_node.port_buffers13 system.ruby.network.ext_links0.int_node.port_buffers14 system.ruby.network.ext_links0.int_node.port_buffers15 system.ruby.network.ext_links0.int_node.port_buffers16 system.ruby.network.ext_links0.int_node.port_buffers17 system.ruby.network.ext_links0.int_node.port_buffers18 system.ruby.network.ext_links0.int_node.port_buffers19 system.ruby.network.ext_links0.int_node.port_buffers20 system.ruby.network.ext_links0.int_node.port_buffers21 system.ruby.network.ext_links0.int_node.port_buffers22 system.ruby.network.ext_links0.int_node.port_buffers23 system.ruby.network.ext_links0.int_node.port_buffers24 system.ruby.network.ext_links0.int_node.port_buffers25 system.ruby.network.ext_links0.int_node.port_buffers26 system.ruby.network.ext_links0.int_node.port_buffers27 system.ruby.network.ext_links0.int_node.port_buffers28 system.ruby.network.ext_links0.int_node.port_buffers29 system.ruby.network.ext_links0.int_node.port_buffers30 system.ruby.network.ext_links0.int_node.port_buffers31 system.ruby.network.ext_links0.int_node.port_buffers32 system.ruby.network.ext_links0.int_node.port_buffers33 system.ruby.network.ext_links0.int_node.port_buffers34 system.ruby.network.ext_links0.int_node.port_buffers35 system.ruby.network.ext_links0.int_node.port_buffers36 system.ruby.network.ext_links0.int_node.port_buffers37 system.ruby.network.ext_links0.int_node.port_buffers38 system.ruby.network.ext_links0.int_node.port_buffers39 system.ruby.network.ext_links0.int_node.port_buffers40 system.ruby.network.ext_links0.int_node.port_buffers41 system.ruby.network.ext_links0.int_node.port_buffers42 system.ruby.network.ext_links0.int_node.port_buffers43 system.ruby.network.ext_links0.int_node.port_buffers44 system.ruby.network.ext_links0.int_node.port_buffers45 system.ruby.network.ext_links0.int_node.port_buffers46 system.ruby.network.ext_links0.int_node.port_buffers47 system.ruby.network.ext_links0.int_node.port_buffers48 system.ruby.network.ext_links0.int_node.port_buffers49 system.ruby.network.ext_links0.int_node.port_buffers50 system.ruby.network.ext_links0.int_node.port_buffers51 system.ruby.network.ext_links0.int_node.port_buffers52 system.ruby.network.ext_links0.int_node.port_buffers53 system.ruby.network.ext_links0.int_node.port_buffers54 system.ruby.network.ext_links0.int_node.port_buffers55 system.ruby.network.ext_links0.int_node.port_buffers56 system.ruby.network.ext_links0.int_node.port_buffers57 system.ruby.network.ext_links0.int_node.port_buffers58 system.ruby.network.ext_links0.int_node.port_buffers59 system.ruby.network.ext_links0.int_node.port_buffers60 system.ruby.network.ext_links0.int_node.port_buffers61 system.ruby.network.ext_links0.int_node.port_buffers62 system.ruby.network.ext_links0.int_node.port_buffers63 system.ruby.network.ext_links0.int_node.port_buffers64 system.ruby.network.ext_links0.int_node.port_buffers65 system.ruby.network.ext_links0.int_node.port_buffers66 system.ruby.network.ext_links0.int_node.port_buffers67 system.ruby.network.ext_links0.int_node.port_buffers68 system.ruby.network.ext_links0.int_node.port_buffers69 system.ruby.network.ext_links0.int_node.port_buffers70 system.ruby.network.ext_links0.int_node.port_buffers71 system.ruby.network.ext_links0.int_node.port_buffers72 system.ruby.network.ext_links0.int_node.port_buffers73 system.ruby.network.ext_links0.int_node.port_buffers74 system.ruby.network.ext_links0.int_node.port_buffers75 system.ruby.network.ext_links0.int_node.port_buffers76 system.ruby.network.ext_links0.int_node.port_buffers77 system.ruby.network.ext_links0.int_node.port_buffers78 system.ruby.network.ext_links0.int_node.port_buffers79
+router_id=0
+virt_nets=10
+
+[system.ruby.network.ext_links0.int_node.port_buffers00]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers01]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers02]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers03]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers04]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers05]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers06]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers07]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers08]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers09]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers10]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers11]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers12]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers13]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers14]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers15]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers16]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers17]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers18]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers19]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers20]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers21]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers22]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers23]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers24]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers25]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers26]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers27]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers28]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers29]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers30]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers31]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers32]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers33]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers34]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers35]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers36]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers37]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers38]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers39]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers40]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers41]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers42]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers43]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers44]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers45]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers46]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers47]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers48]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers49]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers50]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers51]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers52]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers53]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers54]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers55]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers56]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers57]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers58]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers59]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers60]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers61]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers62]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers63]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers64]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers65]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers66]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers67]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers68]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers69]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers70]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers71]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers72]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers73]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers74]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers75]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers76]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers77]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers78]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers79]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1]
+type=SimpleExtLink
+children=int_node
+bandwidth_factor=32
+eventq_index=0
+ext_node=system.cp_cntrl0
+int_node=system.ruby.network.ext_links1.int_node
+latency=1
+link_id=1
+weight=1
+
+[system.ruby.network.ext_links1.int_node]
+type=Switch
+children=port_buffers00 port_buffers01 port_buffers02 port_buffers03 port_buffers04 port_buffers05 port_buffers06 port_buffers07 port_buffers08 port_buffers09 port_buffers10 port_buffers11 port_buffers12 port_buffers13 port_buffers14 port_buffers15 port_buffers16 port_buffers17 port_buffers18 port_buffers19 port_buffers20 port_buffers21 port_buffers22 port_buffers23 port_buffers24 port_buffers25 port_buffers26 port_buffers27 port_buffers28 port_buffers29 port_buffers30 port_buffers31 port_buffers32 port_buffers33 port_buffers34 port_buffers35 port_buffers36 port_buffers37 port_buffers38 port_buffers39 port_buffers40 port_buffers41 port_buffers42 port_buffers43 port_buffers44 port_buffers45 port_buffers46 port_buffers47 port_buffers48 port_buffers49 port_buffers50 port_buffers51 port_buffers52 port_buffers53 port_buffers54 port_buffers55 port_buffers56 port_buffers57 port_buffers58 port_buffers59 port_buffers60 port_buffers61 port_buffers62 port_buffers63 port_buffers64 port_buffers65 port_buffers66 port_buffers67 port_buffers68 port_buffers69
+clk_domain=system.ruby.clk_domain
+eventq_index=0
+port_buffers=system.ruby.network.ext_links1.int_node.port_buffers00 system.ruby.network.ext_links1.int_node.port_buffers01 system.ruby.network.ext_links1.int_node.port_buffers02 system.ruby.network.ext_links1.int_node.port_buffers03 system.ruby.network.ext_links1.int_node.port_buffers04 system.ruby.network.ext_links1.int_node.port_buffers05 system.ruby.network.ext_links1.int_node.port_buffers06 system.ruby.network.ext_links1.int_node.port_buffers07 system.ruby.network.ext_links1.int_node.port_buffers08 system.ruby.network.ext_links1.int_node.port_buffers09 system.ruby.network.ext_links1.int_node.port_buffers10 system.ruby.network.ext_links1.int_node.port_buffers11 system.ruby.network.ext_links1.int_node.port_buffers12 system.ruby.network.ext_links1.int_node.port_buffers13 system.ruby.network.ext_links1.int_node.port_buffers14 system.ruby.network.ext_links1.int_node.port_buffers15 system.ruby.network.ext_links1.int_node.port_buffers16 system.ruby.network.ext_links1.int_node.port_buffers17 system.ruby.network.ext_links1.int_node.port_buffers18 system.ruby.network.ext_links1.int_node.port_buffers19 system.ruby.network.ext_links1.int_node.port_buffers20 system.ruby.network.ext_links1.int_node.port_buffers21 system.ruby.network.ext_links1.int_node.port_buffers22 system.ruby.network.ext_links1.int_node.port_buffers23 system.ruby.network.ext_links1.int_node.port_buffers24 system.ruby.network.ext_links1.int_node.port_buffers25 system.ruby.network.ext_links1.int_node.port_buffers26 system.ruby.network.ext_links1.int_node.port_buffers27 system.ruby.network.ext_links1.int_node.port_buffers28 system.ruby.network.ext_links1.int_node.port_buffers29 system.ruby.network.ext_links1.int_node.port_buffers30 system.ruby.network.ext_links1.int_node.port_buffers31 system.ruby.network.ext_links1.int_node.port_buffers32 system.ruby.network.ext_links1.int_node.port_buffers33 system.ruby.network.ext_links1.int_node.port_buffers34 system.ruby.network.ext_links1.int_node.port_buffers35 system.ruby.network.ext_links1.int_node.port_buffers36 system.ruby.network.ext_links1.int_node.port_buffers37 system.ruby.network.ext_links1.int_node.port_buffers38 system.ruby.network.ext_links1.int_node.port_buffers39 system.ruby.network.ext_links1.int_node.port_buffers40 system.ruby.network.ext_links1.int_node.port_buffers41 system.ruby.network.ext_links1.int_node.port_buffers42 system.ruby.network.ext_links1.int_node.port_buffers43 system.ruby.network.ext_links1.int_node.port_buffers44 system.ruby.network.ext_links1.int_node.port_buffers45 system.ruby.network.ext_links1.int_node.port_buffers46 system.ruby.network.ext_links1.int_node.port_buffers47 system.ruby.network.ext_links1.int_node.port_buffers48 system.ruby.network.ext_links1.int_node.port_buffers49 system.ruby.network.ext_links1.int_node.port_buffers50 system.ruby.network.ext_links1.int_node.port_buffers51 system.ruby.network.ext_links1.int_node.port_buffers52 system.ruby.network.ext_links1.int_node.port_buffers53 system.ruby.network.ext_links1.int_node.port_buffers54 system.ruby.network.ext_links1.int_node.port_buffers55 system.ruby.network.ext_links1.int_node.port_buffers56 system.ruby.network.ext_links1.int_node.port_buffers57 system.ruby.network.ext_links1.int_node.port_buffers58 system.ruby.network.ext_links1.int_node.port_buffers59 system.ruby.network.ext_links1.int_node.port_buffers60 system.ruby.network.ext_links1.int_node.port_buffers61 system.ruby.network.ext_links1.int_node.port_buffers62 system.ruby.network.ext_links1.int_node.port_buffers63 system.ruby.network.ext_links1.int_node.port_buffers64 system.ruby.network.ext_links1.int_node.port_buffers65 system.ruby.network.ext_links1.int_node.port_buffers66 system.ruby.network.ext_links1.int_node.port_buffers67 system.ruby.network.ext_links1.int_node.port_buffers68 system.ruby.network.ext_links1.int_node.port_buffers69
+router_id=1
+virt_nets=10
+
+[system.ruby.network.ext_links1.int_node.port_buffers00]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers01]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers02]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers03]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers04]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers05]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers06]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers07]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers08]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers09]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers10]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers11]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers12]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers13]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers14]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers15]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers16]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers17]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers18]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers19]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers20]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers21]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers22]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers23]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers24]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers25]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers26]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers27]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers28]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers29]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers30]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers31]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers32]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers33]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers34]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers35]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers36]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers37]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers38]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers39]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers40]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers41]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers42]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers43]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers44]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers45]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers46]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers47]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers48]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers49]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers50]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers51]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers52]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers53]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers54]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers55]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers56]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers57]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers58]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers59]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers60]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers61]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers62]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers63]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers64]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers65]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers66]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers67]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers68]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1.int_node.port_buffers69]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2]
+type=SimpleExtLink
+children=int_node
+bandwidth_factor=32
+eventq_index=0
+ext_node=system.tcp_cntrl0
+int_node=system.ruby.network.ext_links2.int_node
+latency=1
+link_id=2
+weight=1
+
+[system.ruby.network.ext_links2.int_node]
+type=Switch
+children=port_buffers00 port_buffers01 port_buffers02 port_buffers03 port_buffers04 port_buffers05 port_buffers06 port_buffers07 port_buffers08 port_buffers09 port_buffers10 port_buffers11 port_buffers12 port_buffers13 port_buffers14 port_buffers15 port_buffers16 port_buffers17 port_buffers18 port_buffers19 port_buffers20 port_buffers21 port_buffers22 port_buffers23 port_buffers24 port_buffers25 port_buffers26 port_buffers27 port_buffers28 port_buffers29 port_buffers30 port_buffers31 port_buffers32 port_buffers33 port_buffers34 port_buffers35 port_buffers36 port_buffers37 port_buffers38 port_buffers39 port_buffers40 port_buffers41 port_buffers42 port_buffers43 port_buffers44 port_buffers45 port_buffers46 port_buffers47 port_buffers48 port_buffers49 port_buffers50 port_buffers51 port_buffers52 port_buffers53 port_buffers54 port_buffers55 port_buffers56 port_buffers57 port_buffers58 port_buffers59 port_buffers60 port_buffers61 port_buffers62 port_buffers63 port_buffers64 port_buffers65 port_buffers66 port_buffers67 port_buffers68 port_buffers69
+clk_domain=system.ruby.clk_domain
+eventq_index=0
+port_buffers=system.ruby.network.ext_links2.int_node.port_buffers00 system.ruby.network.ext_links2.int_node.port_buffers01 system.ruby.network.ext_links2.int_node.port_buffers02 system.ruby.network.ext_links2.int_node.port_buffers03 system.ruby.network.ext_links2.int_node.port_buffers04 system.ruby.network.ext_links2.int_node.port_buffers05 system.ruby.network.ext_links2.int_node.port_buffers06 system.ruby.network.ext_links2.int_node.port_buffers07 system.ruby.network.ext_links2.int_node.port_buffers08 system.ruby.network.ext_links2.int_node.port_buffers09 system.ruby.network.ext_links2.int_node.port_buffers10 system.ruby.network.ext_links2.int_node.port_buffers11 system.ruby.network.ext_links2.int_node.port_buffers12 system.ruby.network.ext_links2.int_node.port_buffers13 system.ruby.network.ext_links2.int_node.port_buffers14 system.ruby.network.ext_links2.int_node.port_buffers15 system.ruby.network.ext_links2.int_node.port_buffers16 system.ruby.network.ext_links2.int_node.port_buffers17 system.ruby.network.ext_links2.int_node.port_buffers18 system.ruby.network.ext_links2.int_node.port_buffers19 system.ruby.network.ext_links2.int_node.port_buffers20 system.ruby.network.ext_links2.int_node.port_buffers21 system.ruby.network.ext_links2.int_node.port_buffers22 system.ruby.network.ext_links2.int_node.port_buffers23 system.ruby.network.ext_links2.int_node.port_buffers24 system.ruby.network.ext_links2.int_node.port_buffers25 system.ruby.network.ext_links2.int_node.port_buffers26 system.ruby.network.ext_links2.int_node.port_buffers27 system.ruby.network.ext_links2.int_node.port_buffers28 system.ruby.network.ext_links2.int_node.port_buffers29 system.ruby.network.ext_links2.int_node.port_buffers30 system.ruby.network.ext_links2.int_node.port_buffers31 system.ruby.network.ext_links2.int_node.port_buffers32 system.ruby.network.ext_links2.int_node.port_buffers33 system.ruby.network.ext_links2.int_node.port_buffers34 system.ruby.network.ext_links2.int_node.port_buffers35 
system.ruby.network.ext_links2.int_node.port_buffers36 system.ruby.network.ext_links2.int_node.port_buffers37 system.ruby.network.ext_links2.int_node.port_buffers38 system.ruby.network.ext_links2.int_node.port_buffers39 system.ruby.network.ext_links2.int_node.port_buffers40 system.ruby.network.ext_links2.int_node.port_buffers41 system.ruby.network.ext_links2.int_node.port_buffers42 system.ruby.network.ext_links2.int_node.port_buffers43 system.ruby.network.ext_links2.int_node.port_buffers44 system.ruby.network.ext_links2.int_node.port_buffers45 system.ruby.network.ext_links2.int_node.port_buffers46 system.ruby.network.ext_links2.int_node.port_buffers47 system.ruby.network.ext_links2.int_node.port_buffers48 system.ruby.network.ext_links2.int_node.port_buffers49 system.ruby.network.ext_links2.int_node.port_buffers50 system.ruby.network.ext_links2.int_node.port_buffers51 system.ruby.network.ext_links2.int_node.port_buffers52 system.ruby.network.ext_links2.int_node.port_buffers53 system.ruby.network.ext_links2.int_node.port_buffers54 system.ruby.network.ext_links2.int_node.port_buffers55 system.ruby.network.ext_links2.int_node.port_buffers56 system.ruby.network.ext_links2.int_node.port_buffers57 system.ruby.network.ext_links2.int_node.port_buffers58 system.ruby.network.ext_links2.int_node.port_buffers59 system.ruby.network.ext_links2.int_node.port_buffers60 system.ruby.network.ext_links2.int_node.port_buffers61 system.ruby.network.ext_links2.int_node.port_buffers62 system.ruby.network.ext_links2.int_node.port_buffers63 system.ruby.network.ext_links2.int_node.port_buffers64 system.ruby.network.ext_links2.int_node.port_buffers65 system.ruby.network.ext_links2.int_node.port_buffers66 system.ruby.network.ext_links2.int_node.port_buffers67 system.ruby.network.ext_links2.int_node.port_buffers68 system.ruby.network.ext_links2.int_node.port_buffers69
+router_id=2
+virt_nets=10
+
+[system.ruby.network.ext_links2.int_node.port_buffers00]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers01]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers02]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers03]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers04]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers05]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers06]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers07]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers08]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers09]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers10]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers11]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers12]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers13]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers14]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers15]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers16]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers17]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers18]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers19]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers20]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers21]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers22]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers23]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers24]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers25]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers26]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers27]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers28]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers29]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers30]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers31]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers32]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers33]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers34]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers35]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers36]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers37]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers38]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers39]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers40]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers41]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers42]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers43]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers44]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers45]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers46]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers47]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers48]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers49]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers50]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers51]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers52]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers53]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers54]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers55]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers56]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers57]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers58]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers59]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers60]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers61]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers62]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers63]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers64]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers65]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers66]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers67]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers68]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers69]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links3]
+type=SimpleExtLink
+bandwidth_factor=32
+eventq_index=0
+ext_node=system.tcp_cntrl1
+int_node=system.ruby.network.ext_links2.int_node
+latency=1
+link_id=3
+weight=1
+
+[system.ruby.network.ext_links4]
+type=SimpleExtLink
+bandwidth_factor=32
+eventq_index=0
+ext_node=system.sqc_cntrl0
+int_node=system.ruby.network.ext_links2.int_node
+latency=1
+link_id=4
+weight=1
+
+[system.ruby.network.ext_links5]
+type=SimpleExtLink
+bandwidth_factor=32
+eventq_index=0
+ext_node=system.tcc_cntrl0
+int_node=system.ruby.network.ext_links2.int_node
+latency=1
+link_id=5
+weight=1
+
+[system.ruby.network.int_link_buffers00]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers01]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers02]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers03]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers04]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers05]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers06]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers07]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers08]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers09]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers10]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers11]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers12]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers13]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers14]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers15]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers16]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers17]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers18]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers19]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers20]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers21]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers22]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers23]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers24]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers25]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers26]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers27]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers28]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers29]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers30]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers31]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers32]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers33]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers34]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers35]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers36]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers37]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers38]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers39]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_links0]
+type=SimpleIntLink
+bandwidth_factor=32
+eventq_index=0
+latency=1
+link_id=0
+node_a=system.ruby.network.ext_links0.int_node
+node_b=system.ruby.network.ext_links1.int_node
+weight=1
+
+[system.ruby.network.int_links1]
+type=SimpleIntLink
+bandwidth_factor=32
+eventq_index=0
+latency=1
+link_id=1
+node_a=system.ruby.network.ext_links0.int_node
+node_b=system.ruby.network.ext_links2.int_node
+weight=1
+
+[system.ruby.phys_mem]
+type=SimpleMemory
+bandwidth=73.000000
+clk_domain=system.ruby.clk_domain
+conf_table_reported=true
+eventq_index=0
+in_addr_map=false
+latency=30000
+latency_var=0
+null=false
+range=0:536870911
+
+[system.sqc_cntrl0]
+type=SQC_Controller
+children=L1cache mandatoryQueue probeToSQC requestFromSQC responseToSQC sequencer
+L1cache=system.sqc_cntrl0.L1cache
+TCC_select_num_bits=0
+buffer_size=0
+clk_domain=system.clk_domain
+cluster_id=0
+eventq_index=0
+issue_latency=80
+l2_hit_latency=18
+mandatoryQueue=system.sqc_cntrl0.mandatoryQueue
+number_of_TBEs=256
+probeToSQC=system.sqc_cntrl0.probeToSQC
+recycle_latency=10
+requestFromSQC=system.sqc_cntrl0.requestFromSQC
+responseToSQC=system.sqc_cntrl0.responseToSQC
+ruby_system=system.ruby
+sequencer=system.sqc_cntrl0.sequencer
+system=system
+transitions_per_cycle=32
+version=0
+
+[system.sqc_cntrl0.L1cache]
+type=RubyCache
+children=replacement_policy
+assoc=8
+block_size=0
+dataAccessLatency=1
+dataArrayBanks=8
+eventq_index=0
+is_icache=false
+replacement_policy=system.sqc_cntrl0.L1cache.replacement_policy
+resourceStalls=false
+ruby_system=system.ruby
+size=32768
+start_index_bit=6
+tagAccessLatency=1
+tagArrayBanks=8
+
+[system.sqc_cntrl0.L1cache.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=8
+block_size=64
+eventq_index=0
+size=32768
+
+[system.sqc_cntrl0.mandatoryQueue]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+
+[system.sqc_cntrl0.probeToSQC]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[9]
+
+[system.sqc_cntrl0.requestFromSQC]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[11]
+
+[system.sqc_cntrl0.responseToSQC]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[10]
+
+[system.sqc_cntrl0.sequencer]
+type=RubySequencer
+clk_domain=system.clk_domain
+coreid=99
+dcache=system.sqc_cntrl0.L1cache
+dcache_hit_latency=1
+deadlock_threshold=500000
+eventq_index=0
+icache=system.sqc_cntrl0.L1cache
+icache_hit_latency=1
+is_cpu_sequencer=false
+max_outstanding_requests=16
+no_retry_on_stall=false
+ruby_system=system.ruby
+support_data_reqs=false
+support_inst_reqs=true
+system=system
+using_network_tester=false
+using_ruby_tester=false
+version=6
+slave=system.cpu1.CUs0.sqc_port system.cpu1.CUs1.sqc_port
+
+[system.sqc_coalescer]
+type=TLBCoalescer
+children=clk_domain
+clk_domain=system.sqc_coalescer.clk_domain
+coalescingWindow=1
+disableCoalescing=false
+eventq_index=0
+probesPerCycle=2
+master=system.sqc_tlb.slave[0]
+slave=system.cpu1.CUs0.sqc_tlb_port system.cpu1.CUs1.sqc_tlb_port
+
+[system.sqc_coalescer.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.sqc_coalescer.clk_domain.voltage_domain
+
+[system.sqc_coalescer.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.sqc_tlb]
+type=X86GPUTLB
+children=clk_domain
+accessDistance=false
+allocationPolicy=true
+assoc=32
+clk_domain=system.sqc_tlb.clk_domain
+eventq_index=0
+hitLatency=1
+maxOutstandingReqs=64
+missLatency1=5
+missLatency2=750
+size=32
+master=system.l2_coalescer.slave[0]
+slave=system.sqc_coalescer.master[0]
+
+[system.sqc_tlb.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.sqc_tlb.clk_domain.voltage_domain
+
+[system.sqc_tlb.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.sys_port_proxy]
+type=RubyPortProxy
+clk_domain=system.clk_domain
+eventq_index=0
+is_cpu_sequencer=true
+no_retry_on_stall=false
+ruby_system=system.ruby
+support_data_reqs=true
+support_inst_reqs=true
+system=system
+using_ruby_tester=false
+version=0
+slave=system.system_port
+
+[system.tcc_cntrl0]
+type=TCC_Controller
+children=L2cache probeFromNB requestFromTCP requestToNB responseFromNB responseToCore responseToNB triggerQueue unblockToNB
+L2cache=system.tcc_cntrl0.L2cache
+WB=false
+buffer_size=0
+clk_domain=system.clk_domain
+cluster_id=0
+eventq_index=0
+l2_request_latency=120
+l2_response_latency=16
+number_of_TBEs=5120
+probeFromNB=system.tcc_cntrl0.probeFromNB
+recycle_latency=10
+requestFromTCP=system.tcc_cntrl0.requestFromTCP
+requestToNB=system.tcc_cntrl0.requestToNB
+responseFromNB=system.tcc_cntrl0.responseFromNB
+responseToCore=system.tcc_cntrl0.responseToCore
+responseToNB=system.tcc_cntrl0.responseToNB
+ruby_system=system.ruby
+system=system
+transitions_per_cycle=32
+triggerQueue=system.tcc_cntrl0.triggerQueue
+unblockToNB=system.tcc_cntrl0.unblockToNB
+version=0
+
+[system.tcc_cntrl0.L2cache]
+type=RubyCache
+children=replacement_policy
+assoc=16
+block_size=0
+dataAccessLatency=8
+dataArrayBanks=256
+eventq_index=0
+is_icache=false
+replacement_policy=system.tcc_cntrl0.L2cache.replacement_policy
+resourceStalls=true
+ruby_system=system.ruby
+size=2097152
+start_index_bit=6
+tagAccessLatency=2
+tagArrayBanks=256
+
+[system.tcc_cntrl0.L2cache.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=16
+block_size=64
+eventq_index=0
+size=2097152
+
+[system.tcc_cntrl0.probeFromNB]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+slave=system.ruby.network.master[12]
+
+[system.tcc_cntrl0.requestFromTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[11]
+
+[system.tcc_cntrl0.requestToNB]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[13]
+
+[system.tcc_cntrl0.responseFromNB]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+slave=system.ruby.network.master[13]
+
+[system.tcc_cntrl0.responseToCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[12]
+
+[system.tcc_cntrl0.responseToNB]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+master=system.ruby.network.slave[14]
+
+[system.tcc_cntrl0.triggerQueue]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.tcc_cntrl0.unblockToNB]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+master=system.ruby.network.slave[15]
+
+[system.tcp_cntrl0]
+type=TCP_Controller
+children=L1cache coalescer mandatoryQueue probeToTCP requestFromTCP responseFromTCP responseToTCP sequencer unblockFromCore
+L1cache=system.tcp_cntrl0.L1cache
+TCC_select_num_bits=0
+WB=false
+buffer_size=0
+clk_domain=system.clk_domain
+cluster_id=0
+coalescer=system.tcp_cntrl0.coalescer
+disableL1=false
+eventq_index=0
+issue_latency=1
+l2_hit_latency=18
+mandatoryQueue=system.tcp_cntrl0.mandatoryQueue
+number_of_TBEs=2560
+probeToTCP=system.tcp_cntrl0.probeToTCP
+recycle_latency=10
+requestFromTCP=system.tcp_cntrl0.requestFromTCP
+responseFromTCP=system.tcp_cntrl0.responseFromTCP
+responseToTCP=system.tcp_cntrl0.responseToTCP
+ruby_system=system.ruby
+sequencer=system.tcp_cntrl0.sequencer
+system=system
+transitions_per_cycle=32
+unblockFromCore=system.tcp_cntrl0.unblockFromCore
+use_seq_not_coal=false
+version=0
+
+[system.tcp_cntrl0.L1cache]
+type=RubyCache
+children=replacement_policy
+assoc=16
+block_size=0
+dataAccessLatency=4
+dataArrayBanks=16
+eventq_index=0
+is_icache=false
+replacement_policy=system.tcp_cntrl0.L1cache.replacement_policy
+resourceStalls=true
+ruby_system=system.ruby
+size=16384
+start_index_bit=6
+tagAccessLatency=1
+tagArrayBanks=16
+
+[system.tcp_cntrl0.L1cache.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=16
+block_size=64
+eventq_index=0
+size=16384
+
+[system.tcp_cntrl0.coalescer]
+type=VIPERCoalescer
+assume_rfo=false
+clk_domain=system.clk_domain
+coreid=99
+dcache=system.tcp_cntrl0.L1cache
+dcache_hit_latency=1
+deadlock_threshold=500000
+eventq_index=0
+icache=system.tcp_cntrl0.L1cache
+icache_hit_latency=1
+is_cpu_sequencer=false
+max_inv_per_cycle=32
+max_outstanding_requests=2560
+max_wb_per_cycle=32
+no_retry_on_stall=false
+ruby_system=system.ruby
+support_data_reqs=true
+support_inst_reqs=false
+system=system
+using_network_tester=false
+using_ruby_tester=false
+version=2
+slave=system.cpu1.CUs0.memory_port[0] system.cpu1.CUs0.memory_port[1] system.cpu1.CUs0.memory_port[2] system.cpu1.CUs0.memory_port[3] system.cpu1.CUs0.memory_port[4] system.cpu1.CUs0.memory_port[5] system.cpu1.CUs0.memory_port[6] system.cpu1.CUs0.memory_port[7] system.cpu1.CUs0.memory_port[8] system.cpu1.CUs0.memory_port[9] system.cpu1.CUs0.memory_port[10] system.cpu1.CUs0.memory_port[11] system.cpu1.CUs0.memory_port[12] system.cpu1.CUs0.memory_port[13] system.cpu1.CUs0.memory_port[14] system.cpu1.CUs0.memory_port[15] system.cpu1.CUs0.memory_port[16] system.cpu1.CUs0.memory_port[17] system.cpu1.CUs0.memory_port[18] system.cpu1.CUs0.memory_port[19] system.cpu1.CUs0.memory_port[20] system.cpu1.CUs0.memory_port[21] system.cpu1.CUs0.memory_port[22] system.cpu1.CUs0.memory_port[23] system.cpu1.CUs0.memory_port[24] system.cpu1.CUs0.memory_port[25] system.cpu1.CUs0.memory_port[26] system.cpu1.CUs0.memory_port[27] system.cpu1.CUs0.memory_port[28] system.cpu1.CUs0.memory_port[29] system.cpu1.CUs0.memory_port[30] system.cpu1.CUs0.memory_port[31] system.cpu1.CUs0.memory_port[32] system.cpu1.CUs0.memory_port[33] system.cpu1.CUs0.memory_port[34] system.cpu1.CUs0.memory_port[35] system.cpu1.CUs0.memory_port[36] system.cpu1.CUs0.memory_port[37] system.cpu1.CUs0.memory_port[38] system.cpu1.CUs0.memory_port[39] system.cpu1.CUs0.memory_port[40] system.cpu1.CUs0.memory_port[41] system.cpu1.CUs0.memory_port[42] system.cpu1.CUs0.memory_port[43] system.cpu1.CUs0.memory_port[44] system.cpu1.CUs0.memory_port[45] system.cpu1.CUs0.memory_port[46] system.cpu1.CUs0.memory_port[47] system.cpu1.CUs0.memory_port[48] system.cpu1.CUs0.memory_port[49] system.cpu1.CUs0.memory_port[50] system.cpu1.CUs0.memory_port[51] system.cpu1.CUs0.memory_port[52] system.cpu1.CUs0.memory_port[53] system.cpu1.CUs0.memory_port[54] system.cpu1.CUs0.memory_port[55] system.cpu1.CUs0.memory_port[56] system.cpu1.CUs0.memory_port[57] system.cpu1.CUs0.memory_port[58] system.cpu1.CUs0.memory_port[59] 
system.cpu1.CUs0.memory_port[60] system.cpu1.CUs0.memory_port[61] system.cpu1.CUs0.memory_port[62] system.cpu1.CUs0.memory_port[63]
+
+[system.tcp_cntrl0.mandatoryQueue]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+
+[system.tcp_cntrl0.probeToTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[5]
+
+[system.tcp_cntrl0.requestFromTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[5]
+
+[system.tcp_cntrl0.responseFromTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[6]
+
+[system.tcp_cntrl0.responseToTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[6]
+
+[system.tcp_cntrl0.sequencer]
+type=RubySequencer
+clk_domain=system.clk_domain
+coreid=99
+dcache=system.tcp_cntrl0.L1cache
+dcache_hit_latency=1
+deadlock_threshold=500000
+eventq_index=0
+icache=system.tcp_cntrl0.L1cache
+icache_hit_latency=1
+is_cpu_sequencer=true
+max_outstanding_requests=16
+no_retry_on_stall=false
+ruby_system=system.ruby
+support_data_reqs=true
+support_inst_reqs=true
+system=system
+using_network_tester=false
+using_ruby_tester=false
+version=3
+
+[system.tcp_cntrl0.unblockFromCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+master=system.ruby.network.slave[7]
+
+[system.tcp_cntrl1]
+type=TCP_Controller
+children=L1cache coalescer mandatoryQueue probeToTCP requestFromTCP responseFromTCP responseToTCP sequencer unblockFromCore
+L1cache=system.tcp_cntrl1.L1cache
+TCC_select_num_bits=0
+WB=false
+buffer_size=0
+clk_domain=system.clk_domain
+cluster_id=0
+coalescer=system.tcp_cntrl1.coalescer
+disableL1=false
+eventq_index=0
+issue_latency=1
+l2_hit_latency=18
+mandatoryQueue=system.tcp_cntrl1.mandatoryQueue
+number_of_TBEs=2560
+probeToTCP=system.tcp_cntrl1.probeToTCP
+recycle_latency=10
+requestFromTCP=system.tcp_cntrl1.requestFromTCP
+responseFromTCP=system.tcp_cntrl1.responseFromTCP
+responseToTCP=system.tcp_cntrl1.responseToTCP
+ruby_system=system.ruby
+sequencer=system.tcp_cntrl1.sequencer
+system=system
+transitions_per_cycle=32
+unblockFromCore=system.tcp_cntrl1.unblockFromCore
+use_seq_not_coal=false
+version=1
+
+[system.tcp_cntrl1.L1cache]
+type=RubyCache
+children=replacement_policy
+assoc=16
+block_size=0
+dataAccessLatency=4
+dataArrayBanks=16
+eventq_index=0
+is_icache=false
+replacement_policy=system.tcp_cntrl1.L1cache.replacement_policy
+resourceStalls=true
+ruby_system=system.ruby
+size=16384
+start_index_bit=6
+tagAccessLatency=1
+tagArrayBanks=16
+
+[system.tcp_cntrl1.L1cache.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=16
+block_size=64
+eventq_index=0
+size=16384
+
+[system.tcp_cntrl1.coalescer]
+type=VIPERCoalescer
+assume_rfo=false
+clk_domain=system.clk_domain
+coreid=99
+dcache=system.tcp_cntrl1.L1cache
+dcache_hit_latency=1
+deadlock_threshold=500000
+eventq_index=0
+icache=system.tcp_cntrl1.L1cache
+icache_hit_latency=1
+is_cpu_sequencer=false
+max_inv_per_cycle=32
+max_outstanding_requests=2560
+max_wb_per_cycle=32
+no_retry_on_stall=false
+ruby_system=system.ruby
+support_data_reqs=true
+support_inst_reqs=false
+system=system
+using_network_tester=false
+using_ruby_tester=false
+version=4
+slave=system.cpu1.CUs1.memory_port[0] system.cpu1.CUs1.memory_port[1] system.cpu1.CUs1.memory_port[2] system.cpu1.CUs1.memory_port[3] system.cpu1.CUs1.memory_port[4] system.cpu1.CUs1.memory_port[5] system.cpu1.CUs1.memory_port[6] system.cpu1.CUs1.memory_port[7] system.cpu1.CUs1.memory_port[8] system.cpu1.CUs1.memory_port[9] system.cpu1.CUs1.memory_port[10] system.cpu1.CUs1.memory_port[11] system.cpu1.CUs1.memory_port[12] system.cpu1.CUs1.memory_port[13] system.cpu1.CUs1.memory_port[14] system.cpu1.CUs1.memory_port[15] system.cpu1.CUs1.memory_port[16] system.cpu1.CUs1.memory_port[17] system.cpu1.CUs1.memory_port[18] system.cpu1.CUs1.memory_port[19] system.cpu1.CUs1.memory_port[20] system.cpu1.CUs1.memory_port[21] system.cpu1.CUs1.memory_port[22] system.cpu1.CUs1.memory_port[23] system.cpu1.CUs1.memory_port[24] system.cpu1.CUs1.memory_port[25] system.cpu1.CUs1.memory_port[26] system.cpu1.CUs1.memory_port[27] system.cpu1.CUs1.memory_port[28] system.cpu1.CUs1.memory_port[29] system.cpu1.CUs1.memory_port[30] system.cpu1.CUs1.memory_port[31] system.cpu1.CUs1.memory_port[32] system.cpu1.CUs1.memory_port[33] system.cpu1.CUs1.memory_port[34] system.cpu1.CUs1.memory_port[35] system.cpu1.CUs1.memory_port[36] system.cpu1.CUs1.memory_port[37] system.cpu1.CUs1.memory_port[38] system.cpu1.CUs1.memory_port[39] system.cpu1.CUs1.memory_port[40] system.cpu1.CUs1.memory_port[41] system.cpu1.CUs1.memory_port[42] system.cpu1.CUs1.memory_port[43] system.cpu1.CUs1.memory_port[44] system.cpu1.CUs1.memory_port[45] system.cpu1.CUs1.memory_port[46] system.cpu1.CUs1.memory_port[47] system.cpu1.CUs1.memory_port[48] system.cpu1.CUs1.memory_port[49] system.cpu1.CUs1.memory_port[50] system.cpu1.CUs1.memory_port[51] system.cpu1.CUs1.memory_port[52] system.cpu1.CUs1.memory_port[53] system.cpu1.CUs1.memory_port[54] system.cpu1.CUs1.memory_port[55] system.cpu1.CUs1.memory_port[56] system.cpu1.CUs1.memory_port[57] system.cpu1.CUs1.memory_port[58] system.cpu1.CUs1.memory_port[59] 
system.cpu1.CUs1.memory_port[60] system.cpu1.CUs1.memory_port[61] system.cpu1.CUs1.memory_port[62] system.cpu1.CUs1.memory_port[63]
+
+[system.tcp_cntrl1.mandatoryQueue]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+
+[system.tcp_cntrl1.probeToTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[7]
+
+[system.tcp_cntrl1.requestFromTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[8]
+
+[system.tcp_cntrl1.responseFromTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[9]
+
+[system.tcp_cntrl1.responseToTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[8]
+
+[system.tcp_cntrl1.sequencer]
+type=RubySequencer
+clk_domain=system.clk_domain
+coreid=99
+dcache=system.tcp_cntrl1.L1cache
+dcache_hit_latency=1
+deadlock_threshold=500000
+eventq_index=0
+icache=system.tcp_cntrl1.L1cache
+icache_hit_latency=1
+is_cpu_sequencer=true
+max_outstanding_requests=16
+no_retry_on_stall=false
+ruby_system=system.ruby
+support_data_reqs=true
+support_inst_reqs=true
+system=system
+using_network_tester=false
+using_ruby_tester=false
+version=5
+
+[system.tcp_cntrl1.unblockFromCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+master=system.ruby.network.slave[10]
+
+[system.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
diff --git a/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Baseline/simerr b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Baseline/simerr
new file mode 100755
index 000000000..1e2b8911e
--- /dev/null
+++ b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Baseline/simerr
@@ -0,0 +1,5 @@
+warn: system.ruby.network adopting orphan SimObject param 'int_links'
+warn: system.ruby.network adopting orphan SimObject param 'ext_links'
+warn: DRAM device capacity (8192 Mbytes) does not match the address range assigned (512 Mbytes)
+warn: Sockets disabled, not accepting gdb connections
+warn: Replacement policy updates recently became the responsibility of SLICC state machines. Make sure to setMRU() near callbacks in .sm files!
diff --git a/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Baseline/simout b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Baseline/simout
new file mode 100755
index 000000000..8e68d38e1
--- /dev/null
+++ b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Baseline/simout
@@ -0,0 +1,21 @@
+gem5 Simulator System. http://gem5.org
+gem5 is copyrighted software; use the --copyright option for details.
+
+gem5 compiled Jan 19 2016 13:39:50
+gem5 started Jan 19 2016 13:40:22
+gem5 executing on zizzer, pid 50252
+command line: build/HSAIL_X86/gem5.opt -d build/HSAIL_X86/tests/opt/quick/se/04.gpu/x86/linux/gpu-ruby-GPU_VIPER_Baseline -re /z/atgutier/gem5/gem5-commit/tests/run.py build/HSAIL_X86/tests/opt/quick/se/04.gpu/x86/linux/gpu-ruby-GPU_VIPER_Baseline
+
+Using GPU kernel code file(s) /dist/m5/regression/test-progs/gpu-hello/bin/x86/linux/gpu-hello-kernel.asm
+Global frequency set at 1000000000000 ticks per second
+Forcing maxCoalescedReqs to 32 (TLB assoc.)
+Forcing maxCoalescedReqs to 32 (TLB assoc.)
+Forcing maxCoalescedReqs to 32 (TLB assoc.)
+Forcing maxCoalescedReqs to 32 (TLB assoc.)
+Forcing maxCoalescedReqs to 32 (TLB assoc.)
+Forcing maxCoalescedReqs to 32 (TLB assoc.)
+info: Entering event queue @ 0. Starting simulation...
+keys = 0x7b2bc0, &keys = 0x798998, keys[0] = 23
+the gpu says:
+elloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloe
+Exiting @ tick 548459500 because target called exit()
diff --git a/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Baseline/stats.txt b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Baseline/stats.txt
new file mode 100644
index 000000000..281a367a9
--- /dev/null
+++ b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Baseline/stats.txt
@@ -0,0 +1,3200 @@
+
+---------- Begin Simulation Statistics ----------
+sim_seconds 0.000548 # Number of seconds simulated
+sim_ticks 548459500 # Number of ticks simulated
+final_tick 548459500 # Number of ticks from beginning of simulation (restored from checkpoints and never reset)
+sim_freq 1000000000000 # Frequency of simulated ticks
+host_inst_rate 76623 # Simulator instruction rate (inst/s)
+host_op_rate 157567 # Simulator op (including micro ops) rate (op/s)
+host_tick_rate 627550839 # Simulator tick rate (ticks/s)
+host_mem_usage 1298164 # Number of bytes of host memory used
+host_seconds 0.87 # Real time elapsed on the host
+sim_insts 66963 # Number of instructions simulated
+sim_ops 137705 # Number of ops (including micro ops) simulated
+system.voltage_domain.voltage 1 # Voltage in Volts
+system.clk_domain.clock 1000 # Clock period in ticks
+system.mem_ctrls.bytes_read::dir_cntrl0 99840 # Number of bytes read from this memory
+system.mem_ctrls.bytes_read::total 99840 # Number of bytes read from this memory
+system.mem_ctrls.num_reads::dir_cntrl0 1560 # Number of read requests responded to by this memory
+system.mem_ctrls.num_reads::total 1560 # Number of read requests responded to by this memory
+system.mem_ctrls.bw_read::dir_cntrl0 182037142 # Total read bandwidth from this memory (bytes/s)
+system.mem_ctrls.bw_read::total 182037142 # Total read bandwidth from this memory (bytes/s)
+system.mem_ctrls.bw_total::dir_cntrl0 182037142 # Total bandwidth to/from this memory (bytes/s)
+system.mem_ctrls.bw_total::total 182037142 # Total bandwidth to/from this memory (bytes/s)
+system.mem_ctrls.readReqs 1560 # Number of read requests accepted
+system.mem_ctrls.writeReqs 0 # Number of write requests accepted
+system.mem_ctrls.readBursts 1560 # Number of DRAM read bursts, including those serviced by the write queue
+system.mem_ctrls.writeBursts 0 # Number of DRAM write bursts, including those merged in the write queue
+system.mem_ctrls.bytesReadDRAM 99840 # Total number of bytes read from DRAM
+system.mem_ctrls.bytesReadWrQ 0 # Total number of bytes read from write queue
+system.mem_ctrls.bytesWritten 0 # Total number of bytes written to DRAM
+system.mem_ctrls.bytesReadSys 99840 # Total read bytes from the system interface side
+system.mem_ctrls.bytesWrittenSys 0 # Total written bytes from the system interface side
+system.mem_ctrls.servicedByWrQ 0 # Number of DRAM read bursts serviced by the write queue
+system.mem_ctrls.mergedWrBursts 0 # Number of DRAM write bursts merged with an existing one
+system.mem_ctrls.neitherReadNorWriteReqs 0 # Number of requests that are neither read nor write
+system.mem_ctrls.perBankRdBursts::0 122 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::1 192 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::2 93 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::3 44 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::4 61 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::5 79 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::6 52 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::7 42 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::8 54 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::9 56 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::10 182 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::11 90 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::12 223 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::13 125 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::14 51 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::15 94 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::0 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::1 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::2 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::3 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::4 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::5 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::6 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::7 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::8 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::9 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::10 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::11 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::12 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::13 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::14 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::15 0 # Per bank write bursts
+system.mem_ctrls.numRdRetry 0 # Number of times read queue was full causing retry
+system.mem_ctrls.numWrRetry 0 # Number of times write queue was full causing retry
+system.mem_ctrls.totGap 548231000 # Total gap between requests
+system.mem_ctrls.readPktSize::0 0 # Read request sizes (log2)
+system.mem_ctrls.readPktSize::1 0 # Read request sizes (log2)
+system.mem_ctrls.readPktSize::2 0 # Read request sizes (log2)
+system.mem_ctrls.readPktSize::3 0 # Read request sizes (log2)
+system.mem_ctrls.readPktSize::4 0 # Read request sizes (log2)
+system.mem_ctrls.readPktSize::5 0 # Read request sizes (log2)
+system.mem_ctrls.readPktSize::6 1560 # Read request sizes (log2)
+system.mem_ctrls.writePktSize::0 0 # Write request sizes (log2)
+system.mem_ctrls.writePktSize::1 0 # Write request sizes (log2)
+system.mem_ctrls.writePktSize::2 0 # Write request sizes (log2)
+system.mem_ctrls.writePktSize::3 0 # Write request sizes (log2)
+system.mem_ctrls.writePktSize::4 0 # Write request sizes (log2)
+system.mem_ctrls.writePktSize::5 0 # Write request sizes (log2)
+system.mem_ctrls.writePktSize::6 0 # Write request sizes (log2)
+system.mem_ctrls.rdQLenPdf::0 1545 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::1 3 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::2 2 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::3 4 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::4 5 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::5 1 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::6 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::7 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::8 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::9 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::10 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::11 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::12 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::13 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::14 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::15 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::16 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::17 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::18 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::19 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::20 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::21 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::22 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::23 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::24 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::25 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::26 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::27 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::28 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::29 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::30 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::31 0 # What read queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::0 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::1 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::2 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::3 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::4 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::5 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::6 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::7 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::8 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::9 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::10 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::11 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::12 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::13 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::14 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::15 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::16 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::17 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::18 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::19 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::20 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::21 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::22 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::23 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::24 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::25 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::26 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::27 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::28 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::29 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::30 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::31 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::32 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::33 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::34 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::35 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::36 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::37 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::38 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::39 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::40 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::41 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::42 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::43 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::44 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::45 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::46 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::47 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::48 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::49 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::50 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::51 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::52 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::53 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::54 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::55 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::56 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::57 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::58 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::59 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::60 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::61 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::62 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::63 0 # What write queue length does an incoming req see
+system.mem_ctrls.bytesPerActivate::samples 467 # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::mean 212.008565 # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::gmean 148.026325 # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::stdev 209.604491 # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::0-127 171 36.62% 36.62% # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::128-255 154 32.98% 69.59% # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::256-383 64 13.70% 83.30% # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::384-511 31 6.64% 89.94% # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::512-639 16 3.43% 93.36% # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::640-767 12 2.57% 95.93% # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::768-895 7 1.50% 97.43% # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::896-1023 3 0.64% 98.07% # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::1024-1151 9 1.93% 100.00% # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::total 467 # Bytes accessed per row activation
+system.mem_ctrls.totQLat 15697750 # Total ticks spent queuing
+system.mem_ctrls.totMemAccLat 44947750 # Total ticks spent from burst creation until serviced by the DRAM
+system.mem_ctrls.totBusLat 7800000 # Total ticks spent in databus transfers
+system.mem_ctrls.avgQLat 10062.66 # Average queueing delay per DRAM burst
+system.mem_ctrls.avgBusLat 5000.00 # Average bus latency per DRAM burst
+system.mem_ctrls.avgMemAccLat 28812.66 # Average memory access latency per DRAM burst
+system.mem_ctrls.avgRdBW 182.04 # Average DRAM read bandwidth in MiByte/s
+system.mem_ctrls.avgWrBW 0.00 # Average achieved write bandwidth in MiByte/s
+system.mem_ctrls.avgRdBWSys 182.04 # Average system read bandwidth in MiByte/s
+system.mem_ctrls.avgWrBWSys 0.00 # Average system write bandwidth in MiByte/s
+system.mem_ctrls.peakBW 12800.00 # Theoretical peak bandwidth in MiByte/s
+system.mem_ctrls.busUtil 1.42 # Data bus utilization in percentage
+system.mem_ctrls.busUtilRead 1.42 # Data bus utilization in percentage for reads
+system.mem_ctrls.busUtilWrite 0.00 # Data bus utilization in percentage for writes
+system.mem_ctrls.avgRdQLen 1.01 # Average read queue length when enqueuing
+system.mem_ctrls.avgWrQLen 0.00 # Average write queue length when enqueuing
+system.mem_ctrls.readRowHits 1088 # Number of row buffer hits during reads
+system.mem_ctrls.writeRowHits 0 # Number of row buffer hits during writes
+system.mem_ctrls.readRowHitRate 69.74 # Row buffer hit rate for reads
+system.mem_ctrls.writeRowHitRate nan # Row buffer hit rate for writes
+system.mem_ctrls.avgGap 351430.13 # Average gap between requests
+system.mem_ctrls.pageHitRate 69.74 # Row buffer hit rate, read and write combined
+system.mem_ctrls_0.actEnergy 1323000 # Energy for activate commands per rank (pJ)
+system.mem_ctrls_0.preEnergy 721875 # Energy for precharge commands per rank (pJ)
+system.mem_ctrls_0.readEnergy 5335200 # Energy for read commands per rank (pJ)
+system.mem_ctrls_0.writeEnergy 0 # Energy for write commands per rank (pJ)
+system.mem_ctrls_0.refreshEnergy 35599200 # Energy for refresh commands per rank (pJ)
+system.mem_ctrls_0.actBackEnergy 300176820 # Energy for active background per rank (pJ)
+system.mem_ctrls_0.preBackEnergy 63865500 # Energy for precharge background per rank (pJ)
+system.mem_ctrls_0.totalEnergy 407021595 # Total energy per rank (pJ)
+system.mem_ctrls_0.averagePower 746.421165 # Core power per rank (mW)
+system.mem_ctrls_0.memoryStateTime::IDLE 107390750 # Time in different power states
+system.mem_ctrls_0.memoryStateTime::REF 18200000 # Time in different power states
+system.mem_ctrls_0.memoryStateTime::PRE_PDN 0 # Time in different power states
+system.mem_ctrls_0.memoryStateTime::ACT 422764250 # Time in different power states
+system.mem_ctrls_0.memoryStateTime::ACT_PDN 0 # Time in different power states
+system.mem_ctrls_1.actEnergy 2207520 # Energy for activate commands per rank (pJ)
+system.mem_ctrls_1.preEnergy 1204500 # Energy for precharge commands per rank (pJ)
+system.mem_ctrls_1.readEnergy 6731400 # Energy for read commands per rank (pJ)
+system.mem_ctrls_1.writeEnergy 0 # Energy for write commands per rank (pJ)
+system.mem_ctrls_1.refreshEnergy 35599200 # Energy for refresh commands per rank (pJ)
+system.mem_ctrls_1.actBackEnergy 328972365 # Energy for active background per rank (pJ)
+system.mem_ctrls_1.preBackEnergy 38606250 # Energy for precharge background per rank (pJ)
+system.mem_ctrls_1.totalEnergy 413321235 # Total energy per rank (pJ)
+system.mem_ctrls_1.averagePower 757.973831 # Core power per rank (mW)
+system.mem_ctrls_1.memoryStateTime::IDLE 62414250 # Time in different power states
+system.mem_ctrls_1.memoryStateTime::REF 18200000 # Time in different power states
+system.mem_ctrls_1.memoryStateTime::PRE_PDN 0 # Time in different power states
+system.mem_ctrls_1.memoryStateTime::ACT 464697000 # Time in different power states
+system.mem_ctrls_1.memoryStateTime::ACT_PDN 0 # Time in different power states
+system.ruby.clk_domain.clock 500 # Clock period in ticks
+system.ruby.phys_mem.bytes_read::cpu0.inst 696760 # Number of bytes read from this memory
+system.ruby.phys_mem.bytes_read::cpu0.data 119832 # Number of bytes read from this memory
+system.ruby.phys_mem.bytes_read::cpu1.CUs0.ComputeUnit 3280 # Number of bytes read from this memory
+system.ruby.phys_mem.bytes_read::cpu1.CUs1.ComputeUnit 3280 # Number of bytes read from this memory
+system.ruby.phys_mem.bytes_read::total 823152 # Number of bytes read from this memory
+system.ruby.phys_mem.bytes_inst_read::cpu0.inst 696760 # Number of instructions bytes read from this memory
+system.ruby.phys_mem.bytes_inst_read::cpu1.CUs0.ComputeUnit 2000 # Number of instructions bytes read from this memory
+system.ruby.phys_mem.bytes_inst_read::cpu1.CUs1.ComputeUnit 2000 # Number of instructions bytes read from this memory
+system.ruby.phys_mem.bytes_inst_read::total 700760 # Number of instructions bytes read from this memory
+system.ruby.phys_mem.bytes_written::cpu0.data 72767 # Number of bytes written to this memory
+system.ruby.phys_mem.bytes_written::cpu1.CUs0.ComputeUnit 256 # Number of bytes written to this memory
+system.ruby.phys_mem.bytes_written::cpu1.CUs1.ComputeUnit 256 # Number of bytes written to this memory
+system.ruby.phys_mem.bytes_written::total 73279 # Number of bytes written to this memory
+system.ruby.phys_mem.num_reads::cpu0.inst 87095 # Number of read requests responded to by this memory
+system.ruby.phys_mem.num_reads::cpu0.data 16686 # Number of read requests responded to by this memory
+system.ruby.phys_mem.num_reads::cpu1.CUs0.ComputeUnit 555 # Number of read requests responded to by this memory
+system.ruby.phys_mem.num_reads::cpu1.CUs1.ComputeUnit 555 # Number of read requests responded to by this memory
+system.ruby.phys_mem.num_reads::total 104891 # Number of read requests responded to by this memory
+system.ruby.phys_mem.num_writes::cpu0.data 10422 # Number of write requests responded to by this memory
+system.ruby.phys_mem.num_writes::cpu1.CUs0.ComputeUnit 256 # Number of write requests responded to by this memory
+system.ruby.phys_mem.num_writes::cpu1.CUs1.ComputeUnit 256 # Number of write requests responded to by this memory
+system.ruby.phys_mem.num_writes::total 10934 # Number of write requests responded to by this memory
+system.ruby.phys_mem.bw_read::cpu0.inst 1270394623 # Total read bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_read::cpu0.data 218488330 # Total read bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_read::cpu1.CUs0.ComputeUnit 5980387 # Total read bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_read::cpu1.CUs1.ComputeUnit 5980387 # Total read bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_read::total 1500843727 # Total read bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_inst_read::cpu0.inst 1270394623 # Instruction read bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_inst_read::cpu1.CUs0.ComputeUnit 3646577 # Instruction read bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_inst_read::cpu1.CUs1.ComputeUnit 3646577 # Instruction read bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_inst_read::total 1277687778 # Instruction read bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_write::cpu0.data 132675248 # Write bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_write::cpu1.CUs0.ComputeUnit 466762 # Write bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_write::cpu1.CUs1.ComputeUnit 466762 # Write bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_write::total 133608771 # Write bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_total::cpu0.inst 1270394623 # Total bandwidth to/from this memory (bytes/s)
+system.ruby.phys_mem.bw_total::cpu0.data 351163577 # Total bandwidth to/from this memory (bytes/s)
+system.ruby.phys_mem.bw_total::cpu1.CUs0.ComputeUnit 6447149 # Total bandwidth to/from this memory (bytes/s)
+system.ruby.phys_mem.bw_total::cpu1.CUs1.ComputeUnit 6447149 # Total bandwidth to/from this memory (bytes/s)
+system.ruby.phys_mem.bw_total::total 1634452498 # Total bandwidth to/from this memory (bytes/s)
+system.ruby.outstanding_req_hist::bucket_size 1
+system.ruby.outstanding_req_hist::max_bucket 9
+system.ruby.outstanding_req_hist::samples 114203
+system.ruby.outstanding_req_hist::mean 1.000035
+system.ruby.outstanding_req_hist::gmean 1.000024
+system.ruby.outstanding_req_hist::stdev 0.005918
+system.ruby.outstanding_req_hist | 0 0.00% 0.00% | 114199 100.00% 100.00% | 4 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.outstanding_req_hist::total 114203
+system.ruby.latency_hist::bucket_size 64
+system.ruby.latency_hist::max_bucket 639
+system.ruby.latency_hist::samples 114203
+system.ruby.latency_hist::mean 3.766924
+system.ruby.latency_hist::gmean 1.075767
+system.ruby.latency_hist::stdev 23.927354
+system.ruby.latency_hist | 112668 98.66% 98.66% | 0 0.00% 98.66% | 0 0.00% 98.66% | 1489 1.30% 99.96% | 10 0.01% 99.97% | 13 0.01% 99.98% | 16 0.01% 99.99% | 7 0.01% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.latency_hist::total 114203
+system.ruby.hit_latency_hist::bucket_size 64
+system.ruby.hit_latency_hist::max_bucket 639
+system.ruby.hit_latency_hist::samples 1535
+system.ruby.hit_latency_hist::mean 206.165472
+system.ruby.hit_latency_hist::gmean 204.491657
+system.ruby.hit_latency_hist::stdev 32.551053
+system.ruby.hit_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 1489 97.00% 97.00% | 10 0.65% 97.65% | 13 0.85% 98.50% | 16 1.04% 99.54% | 7 0.46% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.hit_latency_hist::total 1535
+system.ruby.miss_latency_hist::bucket_size 2
+system.ruby.miss_latency_hist::max_bucket 19
+system.ruby.miss_latency_hist::samples 112668
+system.ruby.miss_latency_hist::mean 1.009426
+system.ruby.miss_latency_hist::gmean 1.001543
+system.ruby.miss_latency_hist::stdev 0.411800
+system.ruby.miss_latency_hist | 112609 99.95% 99.95% | 0 0.00% 99.95% | 0 0.00% 99.95% | 0 0.00% 99.95% | 0 0.00% 99.95% | 0 0.00% 99.95% | 0 0.00% 99.95% | 0 0.00% 99.95% | 0 0.00% 99.95% | 59 0.05% 100.00%
+system.ruby.miss_latency_hist::total 112668
+system.ruby.L1Cache.incomplete_times 112609
+system.ruby.L2Cache.incomplete_times 59
+system.cp_cntrl0.L1D0cache.demand_hits 0 # Number of cache demand hits
+system.cp_cntrl0.L1D0cache.demand_misses 506 # Number of cache demand misses
+system.cp_cntrl0.L1D0cache.demand_accesses 506 # Number of cache demand accesses
+system.cp_cntrl0.L1D0cache.num_data_array_reads 16155 # number of data array reads
+system.cp_cntrl0.L1D0cache.num_data_array_writes 11985 # number of data array writes
+system.cp_cntrl0.L1D0cache.num_tag_array_reads 27132 # number of tag array reads
+system.cp_cntrl0.L1D0cache.num_tag_array_writes 1584 # number of tag array writes
+system.cp_cntrl0.L1D1cache.demand_hits 0 # Number of cache demand hits
+system.cp_cntrl0.L1D1cache.demand_misses 0 # Number of cache demand misses
+system.cp_cntrl0.L1D1cache.demand_accesses 0 # Number of cache demand accesses
+system.cp_cntrl0.L1Icache.demand_hits 0 # Number of cache demand hits
+system.cp_cntrl0.L1Icache.demand_misses 1088 # Number of cache demand misses
+system.cp_cntrl0.L1Icache.demand_accesses 1088 # Number of cache demand accesses
+system.cp_cntrl0.L1Icache.num_data_array_reads 86007 # number of data array reads
+system.cp_cntrl0.L1Icache.num_data_array_writes 54 # number of data array writes
+system.cp_cntrl0.L1Icache.num_tag_array_reads 87684 # number of tag array reads
+system.cp_cntrl0.L1Icache.num_tag_array_writes 54 # number of tag array writes
+system.cp_cntrl0.L2cache.demand_hits 0 # Number of cache demand hits
+system.cp_cntrl0.L2cache.demand_misses 1535 # Number of cache demand misses
+system.cp_cntrl0.L2cache.demand_accesses 1535 # Number of cache demand accesses
+system.cp_cntrl0.L2cache.num_data_array_reads 120 # number of data array reads
+system.cp_cntrl0.L2cache.num_data_array_writes 11982 # number of data array writes
+system.cp_cntrl0.L2cache.num_tag_array_reads 12046 # number of tag array reads
+system.cp_cntrl0.L2cache.num_tag_array_writes 1641 # number of tag array writes
+system.cpu0.clk_domain.clock 500 # Clock period in ticks
+system.cpu0.apic_clk_domain.clock 8000 # Clock period in ticks
+system.cpu0.workload.num_syscalls 21 # Number of system calls
+system.cpu0.numCycles 1096919 # number of cpu cycles simulated
+system.cpu0.numWorkItemsStarted 0 # number of work items this cpu started
+system.cpu0.numWorkItemsCompleted 0 # number of work items this cpu completed
+system.cpu0.committedInsts 66963 # Number of instructions committed
+system.cpu0.committedOps 137705 # Number of ops (including micro ops) committed
+system.cpu0.num_int_alu_accesses 136380 # Number of integer alu accesses
+system.cpu0.num_fp_alu_accesses 1279 # Number of float alu accesses
+system.cpu0.num_func_calls 3196 # number of times a function call or return occured
+system.cpu0.num_conditional_control_insts 12151 # number of instructions that are conditional controls
+system.cpu0.num_int_insts 136380 # number of integer instructions
+system.cpu0.num_fp_insts 1279 # number of float instructions
+system.cpu0.num_int_register_reads 257490 # number of times the integer registers were read
+system.cpu0.num_int_register_writes 110039 # number of times the integer registers were written
+system.cpu0.num_fp_register_reads 1981 # number of times the floating registers were read
+system.cpu0.num_fp_register_writes 981 # number of times the floating registers were written
+system.cpu0.num_cc_register_reads 78262 # number of times the CC registers were read
+system.cpu0.num_cc_register_writes 42183 # number of times the CC registers were written
+system.cpu0.num_mem_refs 27198 # number of memory refs
+system.cpu0.num_load_insts 16684 # Number of load instructions
+system.cpu0.num_store_insts 10514 # Number of store instructions
+system.cpu0.num_idle_cycles 7577.003986 # Number of idle cycles
+system.cpu0.num_busy_cycles 1089341.996014 # Number of busy cycles
+system.cpu0.not_idle_fraction 0.993092 # Percentage of non-idle cycles
+system.cpu0.idle_fraction 0.006908 # Percentage of idle cycles
+system.cpu0.Branches 16199 # Number of branches fetched
+system.cpu0.op_class::No_OpClass 615 0.45% 0.45% # Class of executed instruction
+system.cpu0.op_class::IntAlu 108791 79.00% 79.45% # Class of executed instruction
+system.cpu0.op_class::IntMult 13 0.01% 79.46% # Class of executed instruction
+system.cpu0.op_class::IntDiv 138 0.10% 79.56% # Class of executed instruction
+system.cpu0.op_class::FloatAdd 950 0.69% 80.25% # Class of executed instruction
+system.cpu0.op_class::FloatCmp 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::FloatCvt 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::FloatMult 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::FloatDiv 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::FloatSqrt 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdAdd 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdAddAcc 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdAlu 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdCmp 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdCvt 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdMisc 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdMult 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdMultAcc 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdShift 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdShiftAcc 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdSqrt 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdFloatAdd 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdFloatAlu 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdFloatCmp 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdFloatCvt 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdFloatDiv 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdFloatMisc 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdFloatMult 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdFloatMultAcc 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdFloatSqrt 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::MemRead 16684 12.12% 92.36% # Class of executed instruction
+system.cpu0.op_class::MemWrite 10514 7.64% 100.00% # Class of executed instruction
+system.cpu0.op_class::IprAccess 0 0.00% 100.00% # Class of executed instruction
+system.cpu0.op_class::InstPrefetch 0 0.00% 100.00% # Class of executed instruction
+system.cpu0.op_class::total 137705 # Class of executed instruction
+system.cpu1.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.cpu1.clk_domain.clock 1000 # Clock period in ticks
+system.cpu1.CUs0.wavefronts00.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts00.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts00.timesBlockedDueRAWDependencies 372 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::samples 39 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::mean 0.794872 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::stdev 0.863880 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::0-1 28 71.79% 71.79% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::2-3 11 28.21% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::total 39 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::samples 39 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::mean 0.589744 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::stdev 0.498310 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::0-1 39 100.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::total 39 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts01.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts01.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts01.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts02.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts02.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts02.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts03.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts03.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts03.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts04.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts04.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts04.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts05.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts05.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts05.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts06.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts06.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts06.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts07.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts07.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts07.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts08.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts08.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts08.timesBlockedDueRAWDependencies 353 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::samples 34 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::mean 0.852941 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::stdev 0.857493 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::0-1 24 70.59% 70.59% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::2-3 10 29.41% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::total 34 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::samples 34 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::mean 0.617647 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::stdev 0.493270 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::0-1 34 100.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::total 34 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts09.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts09.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts09.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts10.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts10.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts10.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts11.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts11.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts11.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts12.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts12.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts12.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts13.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts13.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts13.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts14.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts14.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts14.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts15.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts15.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts15.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts16.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts16.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts16.timesBlockedDueRAWDependencies 344 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::samples 34 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::mean 0.852941 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::stdev 0.857493 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::0-1 24 70.59% 70.59% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::2-3 10 29.41% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::total 34 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::samples 34 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::mean 0.617647 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::stdev 0.493270 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::0-1 34 100.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::total 34 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts17.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts17.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts17.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts18.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts18.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts18.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts19.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts19.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts19.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts20.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts20.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts20.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts21.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts21.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts21.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts22.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts22.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts22.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts23.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts23.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts23.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts24.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts24.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts24.timesBlockedDueRAWDependencies 329 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::samples 34 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::mean 0.852941 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::stdev 0.857493 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::0-1 24 70.59% 70.59% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::2-3 10 29.41% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::total 34 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::samples 34 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::mean 0.617647 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::stdev 0.493270 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::0-1 34 100.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::total 34 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts25.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts25.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts25.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts26.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts26.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts26.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts27.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts27.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts27.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts28.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts28.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts28.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts29.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts29.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts29.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts30.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts30.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts30.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts31.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts31.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts31.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::samples 43 # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::mean 5.813953 # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::stdev 2.683777 # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::underflows 0 0.00% 0.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::1 0 0.00% 0.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::2 8 18.60% 18.60% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::3 8 18.60% 37.21% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::4 1 2.33% 39.53% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::5 0 0.00% 39.53% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::6 1 2.33% 41.86% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::7 0 0.00% 41.86% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::8 25 58.14% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::9 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::10 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::11 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::12 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::13 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::14 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::15 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::16 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::17 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::18 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::19 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::20 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::21 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::22 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::23 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::24 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::25 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::26 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::27 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::28 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::29 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::30 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::31 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::32 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::overflows 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::min_value 2 # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::max_value 8 # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::total 43 # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.ExecStage.num_cycles_with_no_issue 4357 # number of cycles the CU issues nothing
+system.cpu1.CUs0.ExecStage.num_cycles_with_instr_issued 133 # number of cycles the CU issued at least one instruction
+system.cpu1.CUs0.ExecStage.num_cycles_with_instrtype_issue::ALU0 30 # Number of cycles at least one instruction of specific type issued
+system.cpu1.CUs0.ExecStage.num_cycles_with_instrtype_issue::ALU1 29 # Number of cycles at least one instruction of specific type issued
+system.cpu1.CUs0.ExecStage.num_cycles_with_instrtype_issue::ALU2 29 # Number of cycles at least one instruction of specific type issued
+system.cpu1.CUs0.ExecStage.num_cycles_with_instrtype_issue::ALU3 29 # Number of cycles at least one instruction of specific type issued
+system.cpu1.CUs0.ExecStage.num_cycles_with_instrtype_issue::GM 18 # Number of cycles at least one instruction of specific type issued
+system.cpu1.CUs0.ExecStage.num_cycles_with_instrtype_issue::LM 6 # Number of cycles at least one instruction of specific type issued
+system.cpu1.CUs0.ExecStage.num_cycles_with_instr_type_no_issue::ALU0 1547 # Number of cycles no instruction of specific type issued
+system.cpu1.CUs0.ExecStage.num_cycles_with_instr_type_no_issue::ALU1 483 # Number of cycles no instruction of specific type issued
+system.cpu1.CUs0.ExecStage.num_cycles_with_instr_type_no_issue::ALU2 439 # Number of cycles no instruction of specific type issued
+system.cpu1.CUs0.ExecStage.num_cycles_with_instr_type_no_issue::ALU3 403 # Number of cycles no instruction of specific type issued
+system.cpu1.CUs0.ExecStage.num_cycles_with_instr_type_no_issue::GM 436 # Number of cycles no instruction of specific type issued
+system.cpu1.CUs0.ExecStage.num_cycles_with_instr_type_no_issue::LM 26 # Number of cycles no instruction of specific type issued
+system.cpu1.CUs0.ExecStage.spc::samples 4490 # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::mean 0.031403 # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::stdev 0.185563 # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::underflows 0 0.00% 0.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::0 4357 97.04% 97.04% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::1 126 2.81% 99.84% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::2 6 0.13% 99.98% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::3 1 0.02% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::4 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::5 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::6 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::overflows 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::min_value 0 # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::max_value 3 # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::total 4490 # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.num_transitions_active_to_idle 68 # number of CU transitions from active to idle
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::samples 68 # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::mean 59.558824 # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::stdev 213.072854 # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::underflows 0 0.00% 0.00% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::0-4 48 70.59% 70.59% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::5-9 8 11.76% 82.35% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::10-14 1 1.47% 83.82% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::15-19 1 1.47% 85.29% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::20-24 2 2.94% 88.24% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::25-29 1 1.47% 89.71% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::30-34 0 0.00% 89.71% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::35-39 0 0.00% 89.71% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::40-44 0 0.00% 89.71% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::45-49 0 0.00% 89.71% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::50-54 0 0.00% 89.71% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::55-59 0 0.00% 89.71% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::60-64 0 0.00% 89.71% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::65-69 0 0.00% 89.71% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::70-74 0 0.00% 89.71% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::75 0 0.00% 89.71% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::overflows 7 10.29% 100.00% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::min_value 1 # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::max_value 1300 # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::total 68 # duration of idle periods in cycles
+system.cpu1.CUs0.GlobalMemPipeline.load_vrf_bank_conflict_cycles 0 # total number of cycles GM data are delayed before updating the VRF
+system.cpu1.CUs0.LocalMemPipeline.load_vrf_bank_conflict_cycles 0 # total number of cycles LDS data are delayed before updating the VRF
+system.cpu1.CUs0.tlb_requests 769 # number of uncoalesced requests
+system.cpu1.CUs0.tlb_cycles -373675448000 # total number of cycles for all uncoalesced requests
+system.cpu1.CUs0.avg_translation_latency -485923859.557867 # Avg. translation latency for data translations
+system.cpu1.CUs0.TLB_hits_distribution::page_table 769 # TLB hits distribution (0 for page table, x for Lx-TLB
+system.cpu1.CUs0.TLB_hits_distribution::L1_TLB 0 # TLB hits distribution (0 for page table, x for Lx-TLB
+system.cpu1.CUs0.TLB_hits_distribution::L2_TLB 0 # TLB hits distribution (0 for page table, x for Lx-TLB
+system.cpu1.CUs0.TLB_hits_distribution::L3_TLB 0 # TLB hits distribution (0 for page table, x for Lx-TLB
+system.cpu1.CUs0.lds_bank_access_cnt 54 # Total number of LDS bank accesses
+system.cpu1.CUs0.lds_bank_conflicts::samples 6 # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::mean 8 # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::stdev 6.196773 # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::underflows 0 0.00% 0.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::0-1 2 33.33% 33.33% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::2-3 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::4-5 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::6-7 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::8-9 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::10-11 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::12-13 4 66.67% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::14-15 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::16-17 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::18-19 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::20-21 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::22-23 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::24-25 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::26-27 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::28-29 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::30-31 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::32-33 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::34-35 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::36-37 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::38-39 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::40-41 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::42-43 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::44-45 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::46-47 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::48-49 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::50-51 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::52-53 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::54-55 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::56-57 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::58-59 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::60-61 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::62-63 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::64 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::overflows 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::min_value 0 # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::max_value 12 # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::total 6 # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.page_divergence_dist::samples 17 # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::mean 1 # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::stdev 0 # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::underflows 0 0.00% 0.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::1-4 17 100.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::5-8 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::9-12 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::13-16 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::17-20 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::21-24 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::25-28 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::29-32 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::33-36 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::37-40 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::41-44 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::45-48 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::49-52 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::53-56 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::57-60 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::61-64 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::overflows 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::min_value 1 # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::max_value 1 # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::total 17 # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.global_mem_instr_cnt 17 # dynamic global memory instructions count
+system.cpu1.CUs0.local_mem_instr_cnt 6 # dynamic local memory intruction count
+system.cpu1.CUs0.wg_blocked_due_lds_alloc 0 # Workgroup blocked due to LDS capacity
+system.cpu1.CUs0.num_instr_executed 141 # number of instructions executed
+system.cpu1.CUs0.inst_exec_rate::samples 141 # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::mean 94.900709 # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::stdev 247.493154 # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::underflows 0 0.00% 0.00% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::0-1 1 0.71% 0.71% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::2-3 12 8.51% 9.22% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::4-5 53 37.59% 46.81% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::6-7 31 21.99% 68.79% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::8-9 3 2.13% 70.92% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::10 1 0.71% 71.63% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::overflows 40 28.37% 100.00% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::min_value 1 # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::max_value 1303 # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::total 141 # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.num_vec_ops_executed 6769 # number of vec ops executed (e.g. VSZ/inst)
+system.cpu1.CUs0.num_total_cycles 4490 # number of cycles the CU ran for
+system.cpu1.CUs0.vpc 1.507572 # Vector Operations per cycle (this CU only)
+system.cpu1.CUs0.ipc 0.031403 # Instructions per cycle (this CU only)
+system.cpu1.CUs0.warp_execution_dist::samples 141 # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::mean 48.007092 # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::stdev 23.719942 # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::underflows 0 0.00% 0.00% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::1-4 5 3.55% 3.55% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::5-8 0 0.00% 3.55% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::9-12 0 0.00% 3.55% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::13-16 36 25.53% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::17-20 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::21-24 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::25-28 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::29-32 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::33-36 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::37-40 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::41-44 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::45-48 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::49-52 8 5.67% 34.75% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::53-56 0 0.00% 34.75% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::57-60 0 0.00% 34.75% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::61-64 92 65.25% 100.00% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::overflows 0 0.00% 100.00% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::min_value 1 # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::max_value 64 # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::total 141 # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.gmem_lanes_execution_dist::samples 18 # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::mean 37.833333 # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::stdev 27.064737 # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::underflows 0 0.00% 0.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::1-4 1 5.56% 5.56% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::5-8 0 0.00% 5.56% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::9-12 0 0.00% 5.56% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::13-16 8 44.44% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::17-20 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::21-24 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::25-28 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::29-32 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::33-36 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::37-40 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::41-44 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::45-48 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::49-52 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::53-56 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::57-60 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::61-64 9 50.00% 100.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::overflows 0 0.00% 100.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::min_value 1 # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::max_value 64 # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::total 18 # number of active lanes per global memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::samples 6 # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::mean 19.500000 # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::stdev 22.322634 # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::underflows 0 0.00% 0.00% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::1-4 1 16.67% 16.67% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::5-8 0 0.00% 16.67% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::9-12 0 0.00% 16.67% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::13-16 4 66.67% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::17-20 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::21-24 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::25-28 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::29-32 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::33-36 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::37-40 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::41-44 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::45-48 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::49-52 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::53-56 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::57-60 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::61-64 1 16.67% 100.00% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::overflows 0 0.00% 100.00% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::min_value 1 # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::max_value 64 # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::total 6 # number of active lanes per local memory instruction
+system.cpu1.CUs0.num_alu_insts_executed 118 # Number of dynamic non-GM memory insts executed
+system.cpu1.CUs0.times_wg_blocked_due_vgpr_alloc 0 # Number of times WGs are blocked due to VGPR allocation per SIMD
+system.cpu1.CUs0.num_CAS_ops 0 # number of compare and swap operations
+system.cpu1.CUs0.num_failed_CAS_ops 0 # number of compare and swap operations that failed
+system.cpu1.CUs0.num_completed_wfs 4 # number of completed wavefronts
+system.cpu1.CUs1.wavefronts00.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts00.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts00.timesBlockedDueRAWDependencies 377 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::samples 39 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::mean 0.794872 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::stdev 0.863880 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::0-1 28 71.79% 71.79% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::2-3 11 28.21% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::total 39 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::samples 39 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::mean 0.589744 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::stdev 0.498310 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::0-1 39 100.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::total 39 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts01.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts01.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts01.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts02.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts02.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts02.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts03.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts03.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts03.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts04.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts04.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts04.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts05.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts05.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts05.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts06.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts06.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts06.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts07.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts07.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts07.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts08.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts08.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts08.timesBlockedDueRAWDependencies 355 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::samples 34 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::mean 0.852941 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::stdev 0.857493 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::0-1 24 70.59% 70.59% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::2-3 10 29.41% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::total 34 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::samples 34 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::mean 0.617647 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::stdev 0.493270 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::0-1 34 100.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::total 34 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts09.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts09.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts09.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts10.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts10.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts10.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts11.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts11.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts11.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts12.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts12.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts12.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts13.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts13.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts13.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts14.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts14.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts14.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts15.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts15.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts15.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts16.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts16.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts16.timesBlockedDueRAWDependencies 352 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::samples 34 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::mean 0.852941 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::stdev 0.857493 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::0-1 24 70.59% 70.59% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::2-3 10 29.41% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::total 34 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::samples 34 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::mean 0.617647 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::stdev 0.493270 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::0-1 34 100.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::total 34 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts17.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts17.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts17.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts18.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts18.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts18.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts19.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts19.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts19.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts20.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts20.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts20.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts21.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts21.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts21.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts22.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts22.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts22.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts23.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts23.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts23.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts24.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts24.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts24.timesBlockedDueRAWDependencies 337 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::samples 34 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::mean 0.852941 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::stdev 0.857493 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::0-1 24 70.59% 70.59% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::2-3 10 29.41% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::total 34 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::samples 34 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::mean 0.617647 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::stdev 0.493270 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::0-1 34 100.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::total 34 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts25.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts25.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts25.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts26.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts26.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts26.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts27.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts27.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts27.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts28.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts28.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts28.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts29.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts29.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts29.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts30.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts30.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts30.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts31.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts31.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts31.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::samples 43 # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::mean 5.813953 # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::stdev 2.683777 # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::underflows 0 0.00% 0.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::1 0 0.00% 0.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::2 8 18.60% 18.60% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::3 8 18.60% 37.21% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::4 1 2.33% 39.53% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::5 0 0.00% 39.53% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::6 1 2.33% 41.86% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::7 0 0.00% 41.86% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::8 25 58.14% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::9 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::10 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::11 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::12 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::13 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::14 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::15 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::16 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::17 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::18 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::19 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::20 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::21 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::22 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::23 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::24 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::25 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::26 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::27 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::28 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::29 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::30 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::31 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::32 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::overflows 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::min_value 2 # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::max_value 8 # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::total 43 # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.ExecStage.num_cycles_with_no_issue 4359 # number of cycles the CU issues nothing
+system.cpu1.CUs1.ExecStage.num_cycles_with_instr_issued 131 # number of cycles the CU issued at least one instruction
+system.cpu1.CUs1.ExecStage.num_cycles_with_instrtype_issue::ALU0 30 # Number of cycles at least one instruction of specific type issued
+system.cpu1.CUs1.ExecStage.num_cycles_with_instrtype_issue::ALU1 29 # Number of cycles at least one instruction of specific type issued
+system.cpu1.CUs1.ExecStage.num_cycles_with_instrtype_issue::ALU2 29 # Number of cycles at least one instruction of specific type issued
+system.cpu1.CUs1.ExecStage.num_cycles_with_instrtype_issue::ALU3 29 # Number of cycles at least one instruction of specific type issued
+system.cpu1.CUs1.ExecStage.num_cycles_with_instrtype_issue::GM 18 # Number of cycles at least one instruction of specific type issued
+system.cpu1.CUs1.ExecStage.num_cycles_with_instrtype_issue::LM 6 # Number of cycles at least one instruction of specific type issued
+system.cpu1.CUs1.ExecStage.num_cycles_with_instr_type_no_issue::ALU0 1552 # Number of cycles no instruction of specific type issued
+system.cpu1.CUs1.ExecStage.num_cycles_with_instr_type_no_issue::ALU1 447 # Number of cycles no instruction of specific type issued
+system.cpu1.CUs1.ExecStage.num_cycles_with_instr_type_no_issue::ALU2 464 # Number of cycles no instruction of specific type issued
+system.cpu1.CUs1.ExecStage.num_cycles_with_instr_type_no_issue::ALU3 464 # Number of cycles no instruction of specific type issued
+system.cpu1.CUs1.ExecStage.num_cycles_with_instr_type_no_issue::GM 426 # Number of cycles no instruction of specific type issued
+system.cpu1.CUs1.ExecStage.num_cycles_with_instr_type_no_issue::LM 33 # Number of cycles no instruction of specific type issued
+system.cpu1.CUs1.ExecStage.spc::samples 4490 # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::mean 0.031403 # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::stdev 0.189130 # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::underflows 0 0.00% 0.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::0 4359 97.08% 97.08% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::1 123 2.74% 99.82% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::2 6 0.13% 99.96% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::3 2 0.04% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::4 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::5 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::6 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::overflows 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::min_value 0 # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::max_value 3 # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::total 4490 # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.num_transitions_active_to_idle 74 # number of CU transitions from active to idle
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::samples 74 # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::mean 55.324324 # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::stdev 207.911408 # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::underflows 0 0.00% 0.00% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::0-4 56 75.68% 75.68% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::5-9 7 9.46% 85.14% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::10-14 0 0.00% 85.14% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::15-19 2 2.70% 87.84% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::20-24 1 1.35% 89.19% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::25-29 1 1.35% 90.54% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::30-34 0 0.00% 90.54% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::35-39 0 0.00% 90.54% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::40-44 0 0.00% 90.54% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::45-49 0 0.00% 90.54% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::50-54 0 0.00% 90.54% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::55-59 0 0.00% 90.54% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::60-64 0 0.00% 90.54% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::65-69 0 0.00% 90.54% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::70-74 0 0.00% 90.54% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::75 0 0.00% 90.54% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::overflows 7 9.46% 100.00% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::min_value 1 # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::max_value 1304 # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::total 74 # duration of idle periods in cycles
+system.cpu1.CUs1.GlobalMemPipeline.load_vrf_bank_conflict_cycles 0 # total number of cycles GM data are delayed before updating the VRF
+system.cpu1.CUs1.LocalMemPipeline.load_vrf_bank_conflict_cycles 0 # total number of cycles LDS data are delayed before updating the VRF
+system.cpu1.CUs1.tlb_requests 769 # number of uncoalesced requests
+system.cpu1.CUs1.tlb_cycles -373672588000 # total number of cycles for all uncoalesced requests
+system.cpu1.CUs1.avg_translation_latency -485920140.442133 # Avg. translation latency for data translations
+system.cpu1.CUs1.TLB_hits_distribution::page_table 769 # TLB hits distribution (0 for page table, x for Lx-TLB
+system.cpu1.CUs1.TLB_hits_distribution::L1_TLB 0 # TLB hits distribution (0 for page table, x for Lx-TLB
+system.cpu1.CUs1.TLB_hits_distribution::L2_TLB 0 # TLB hits distribution (0 for page table, x for Lx-TLB
+system.cpu1.CUs1.TLB_hits_distribution::L3_TLB 0 # TLB hits distribution (0 for page table, x for Lx-TLB
+system.cpu1.CUs1.lds_bank_access_cnt 53 # Total number of LDS bank accesses
+system.cpu1.CUs1.lds_bank_conflicts::samples 6 # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::mean 7.833333 # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::stdev 6.080022 # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::underflows 0 0.00% 0.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::0-1 2 33.33% 33.33% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::2-3 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::4-5 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::6-7 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::8-9 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::10-11 1 16.67% 50.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::12-13 3 50.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::14-15 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::16-17 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::18-19 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::20-21 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::22-23 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::24-25 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::26-27 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::28-29 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::30-31 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::32-33 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::34-35 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::36-37 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::38-39 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::40-41 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::42-43 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::44-45 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::46-47 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::48-49 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::50-51 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::52-53 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::54-55 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::56-57 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::58-59 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::60-61 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::62-63 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::64 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::overflows 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::min_value 0 # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::max_value 12 # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::total 6 # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.page_divergence_dist::samples 17 # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::mean 1 # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::stdev 0 # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::underflows 0 0.00% 0.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::1-4 17 100.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::5-8 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::9-12 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::13-16 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::17-20 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::21-24 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::25-28 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::29-32 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::33-36 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::37-40 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::41-44 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::45-48 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::49-52 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::53-56 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::57-60 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::61-64 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::overflows 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::min_value 1 # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::max_value 1 # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::total 17 # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.global_mem_instr_cnt 17 # dynamic global memory instructions count
+system.cpu1.CUs1.local_mem_instr_cnt 6 # dynamic local memory intruction count
+system.cpu1.CUs1.wg_blocked_due_lds_alloc 0 # Workgroup blocked due to LDS capacity
+system.cpu1.CUs1.num_instr_executed 141 # number of instructions executed
+system.cpu1.CUs1.inst_exec_rate::samples 141 # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::mean 95.106383 # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::stdev 249.293307 # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::underflows 0 0.00% 0.00% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::0-1 1 0.71% 0.71% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::2-3 12 8.51% 9.22% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::4-5 53 37.59% 46.81% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::6-7 29 20.57% 67.38% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::8-9 5 3.55% 70.92% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::10 1 0.71% 71.63% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::overflows 40 28.37% 100.00% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::min_value 1 # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::max_value 1307 # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::total 141 # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.num_vec_ops_executed 6762 # number of vec ops executed (e.g. VSZ/inst)
+system.cpu1.CUs1.num_total_cycles 4490 # number of cycles the CU ran for
+system.cpu1.CUs1.vpc 1.506013 # Vector Operations per cycle (this CU only)
+system.cpu1.CUs1.ipc 0.031403 # Instructions per cycle (this CU only)
+system.cpu1.CUs1.warp_execution_dist::samples 141 # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::mean 47.957447 # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::stdev 23.818022 # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::underflows 0 0.00% 0.00% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::1-4 5 3.55% 3.55% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::5-8 0 0.00% 3.55% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::9-12 9 6.38% 9.93% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::13-16 27 19.15% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::17-20 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::21-24 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::25-28 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::29-32 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::33-36 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::37-40 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::41-44 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::45-48 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::49-52 8 5.67% 34.75% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::53-56 0 0.00% 34.75% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::57-60 0 0.00% 34.75% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::61-64 92 65.25% 100.00% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::overflows 0 0.00% 100.00% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::min_value 1 # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::max_value 64 # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::total 141 # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.gmem_lanes_execution_dist::samples 18 # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::mean 37.722222 # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::stdev 27.174394 # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::underflows 0 0.00% 0.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::1-4 1 5.56% 5.56% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::5-8 0 0.00% 5.56% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::9-12 2 11.11% 16.67% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::13-16 6 33.33% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::17-20 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::21-24 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::25-28 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::29-32 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::33-36 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::37-40 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::41-44 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::45-48 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::49-52 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::53-56 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::57-60 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::61-64 9 50.00% 100.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::overflows 0 0.00% 100.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::min_value 1 # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::max_value 64 # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::total 18 # number of active lanes per global memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::samples 6 # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::mean 19.333333 # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::stdev 22.384518 # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::underflows 0 0.00% 0.00% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::1-4 1 16.67% 16.67% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::5-8 0 0.00% 16.67% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::9-12 1 16.67% 33.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::13-16 3 50.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::17-20 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::21-24 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::25-28 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::29-32 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::33-36 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::37-40 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::41-44 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::45-48 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::49-52 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::53-56 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::57-60 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::61-64 1 16.67% 100.00% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::overflows 0 0.00% 100.00% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::min_value 1 # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::max_value 64 # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::total 6 # number of active lanes per local memory instruction
+system.cpu1.CUs1.num_alu_insts_executed 118 # Number of dynamic non-GM memory insts executed
+system.cpu1.CUs1.times_wg_blocked_due_vgpr_alloc 0 # Number of times WGs are blocked due to VGPR allocation per SIMD
+system.cpu1.CUs1.num_CAS_ops 0 # number of compare and swap operations
+system.cpu1.CUs1.num_failed_CAS_ops 0 # number of compare and swap operations that failed
+system.cpu1.CUs1.num_completed_wfs 4 # number of completed wavefronts
+system.cpu2.num_kernel_launched 1 # number of kernel launched
+system.dir_cntrl0.L3CacheMemory.demand_hits 0 # Number of cache demand hits
+system.dir_cntrl0.L3CacheMemory.demand_misses 0 # Number of cache demand misses
+system.dir_cntrl0.L3CacheMemory.demand_accesses 0 # Number of cache demand accesses
+system.dir_cntrl0.L3CacheMemory.num_data_array_writes 1560 # number of data array writes
+system.dir_cntrl0.L3CacheMemory.num_tag_array_reads 1560 # number of tag array reads
+system.dir_cntrl0.L3CacheMemory.num_tag_array_writes 1578 # number of tag array writes
+system.dir_cntrl0.ProbeFilterMemory.demand_hits 0 # Number of cache demand hits
+system.dir_cntrl0.ProbeFilterMemory.demand_misses 0 # Number of cache demand misses
+system.dir_cntrl0.ProbeFilterMemory.demand_accesses 0 # Number of cache demand accesses
+system.dir_cntrl0.ProbeFilterMemory.num_tag_array_reads 1560 # number of tag array reads
+system.dir_cntrl0.ProbeFilterMemory.num_tag_array_writes 1560 # number of tag array writes
+system.dispatcher_coalescer.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.dispatcher_coalescer.clk_domain.clock 1000 # Clock period in ticks
+system.dispatcher_coalescer.uncoalesced_accesses 0 # Number of uncoalesced TLB accesses
+system.dispatcher_coalescer.coalesced_accesses 0 # Number of coalesced TLB accesses
+system.dispatcher_coalescer.queuing_cycles 0 # Number of cycles spent in queue
+system.dispatcher_coalescer.local_queuing_cycles 0 # Number of cycles spent in queue for all incoming reqs
+system.dispatcher_coalescer.local_latency nan # Avg. latency over all incoming pkts
+system.dispatcher_tlb.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.dispatcher_tlb.clk_domain.clock 1000 # Clock period in ticks
+system.dispatcher_tlb.local_TLB_accesses 0 # Number of TLB accesses
+system.dispatcher_tlb.local_TLB_hits 0 # Number of TLB hits
+system.dispatcher_tlb.local_TLB_misses 0 # Number of TLB misses
+system.dispatcher_tlb.local_TLB_miss_rate nan # TLB miss rate
+system.dispatcher_tlb.global_TLB_accesses 0 # Number of TLB accesses
+system.dispatcher_tlb.global_TLB_hits 0 # Number of TLB hits
+system.dispatcher_tlb.global_TLB_misses 0 # Number of TLB misses
+system.dispatcher_tlb.global_TLB_miss_rate nan # TLB miss rate
+system.dispatcher_tlb.access_cycles 0 # Cycles spent accessing this TLB level
+system.dispatcher_tlb.page_table_cycles 0 # Cycles spent accessing the page table
+system.dispatcher_tlb.unique_pages 0 # Number of unique pages touched
+system.dispatcher_tlb.local_cycles 0 # Number of cycles spent in queue for all incoming reqs
+system.dispatcher_tlb.local_latency nan # Avg. latency over incoming coalesced reqs
+system.dispatcher_tlb.avg_reuse_distance 0 # avg. reuse distance over all pages (in ticks)
+system.l1_coalescer0.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.l1_coalescer0.clk_domain.clock 1000 # Clock period in ticks
+system.l1_coalescer0.uncoalesced_accesses 778 # Number of uncoalesced TLB accesses
+system.l1_coalescer0.coalesced_accesses 0 # Number of coalesced TLB accesses
+system.l1_coalescer0.queuing_cycles 0 # Number of cycles spent in queue
+system.l1_coalescer0.local_queuing_cycles 0 # Number of cycles spent in queue for all incoming reqs
+system.l1_coalescer0.local_latency 0 # Avg. latency over all incoming pkts
+system.l1_coalescer1.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.l1_coalescer1.clk_domain.clock 1000 # Clock period in ticks
+system.l1_coalescer1.uncoalesced_accesses 769 # Number of uncoalesced TLB accesses
+system.l1_coalescer1.coalesced_accesses 0 # Number of coalesced TLB accesses
+system.l1_coalescer1.queuing_cycles 0 # Number of cycles spent in queue
+system.l1_coalescer1.local_queuing_cycles 0 # Number of cycles spent in queue for all incoming reqs
+system.l1_coalescer1.local_latency 0 # Avg. latency over all incoming pkts
+system.l1_tlb0.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.l1_tlb0.clk_domain.clock 1000 # Clock period in ticks
+system.l1_tlb0.local_TLB_accesses 778 # Number of TLB accesses
+system.l1_tlb0.local_TLB_hits 774 # Number of TLB hits
+system.l1_tlb0.local_TLB_misses 4 # Number of TLB misses
+system.l1_tlb0.local_TLB_miss_rate 0.514139 # TLB miss rate
+system.l1_tlb0.global_TLB_accesses 778 # Number of TLB accesses
+system.l1_tlb0.global_TLB_hits 774 # Number of TLB hits
+system.l1_tlb0.global_TLB_misses 4 # Number of TLB misses
+system.l1_tlb0.global_TLB_miss_rate 0.514139 # TLB miss rate
+system.l1_tlb0.access_cycles 0 # Cycles spent accessing this TLB level
+system.l1_tlb0.page_table_cycles 0 # Cycles spent accessing the page table
+system.l1_tlb0.unique_pages 4 # Number of unique pages touched
+system.l1_tlb0.local_cycles 0 # Number of cycles spent in queue for all incoming reqs
+system.l1_tlb0.local_latency 0 # Avg. latency over incoming coalesced reqs
+system.l1_tlb0.avg_reuse_distance 0 # avg. reuse distance over all pages (in ticks)
+system.l1_tlb1.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.l1_tlb1.clk_domain.clock 1000 # Clock period in ticks
+system.l1_tlb1.local_TLB_accesses 769 # Number of TLB accesses
+system.l1_tlb1.local_TLB_hits 766 # Number of TLB hits
+system.l1_tlb1.local_TLB_misses 3 # Number of TLB misses
+system.l1_tlb1.local_TLB_miss_rate 0.390117 # TLB miss rate
+system.l1_tlb1.global_TLB_accesses 769 # Number of TLB accesses
+system.l1_tlb1.global_TLB_hits 766 # Number of TLB hits
+system.l1_tlb1.global_TLB_misses 3 # Number of TLB misses
+system.l1_tlb1.global_TLB_miss_rate 0.390117 # TLB miss rate
+system.l1_tlb1.access_cycles 0 # Cycles spent accessing this TLB level
+system.l1_tlb1.page_table_cycles 0 # Cycles spent accessing the page table
+system.l1_tlb1.unique_pages 3 # Number of unique pages touched
+system.l1_tlb1.local_cycles 0 # Number of cycles spent in queue for all incoming reqs
+system.l1_tlb1.local_latency 0 # Avg. latency over incoming coalesced reqs
+system.l1_tlb1.avg_reuse_distance 0 # avg. reuse distance over all pages (in ticks)
+system.l2_coalescer.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.l2_coalescer.clk_domain.clock 1000 # Clock period in ticks
+system.l2_coalescer.uncoalesced_accesses 8 # Number of uncoalesced TLB accesses
+system.l2_coalescer.coalesced_accesses 1 # Number of coalesced TLB accesses
+system.l2_coalescer.queuing_cycles 8000 # Number of cycles spent in queue
+system.l2_coalescer.local_queuing_cycles 1000 # Number of cycles spent in queue for all incoming reqs
+system.l2_coalescer.local_latency 125 # Avg. latency over all incoming pkts
+system.l2_tlb.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.l2_tlb.clk_domain.clock 1000 # Clock period in ticks
+system.l2_tlb.local_TLB_accesses 8 # Number of TLB accesses
+system.l2_tlb.local_TLB_hits 3 # Number of TLB hits
+system.l2_tlb.local_TLB_misses 5 # Number of TLB misses
+system.l2_tlb.local_TLB_miss_rate 62.500000 # TLB miss rate
+system.l2_tlb.global_TLB_accesses 15 # Number of TLB accesses
+system.l2_tlb.global_TLB_hits 3 # Number of TLB hits
+system.l2_tlb.global_TLB_misses 12 # Number of TLB misses
+system.l2_tlb.global_TLB_miss_rate 80 # TLB miss rate
+system.l2_tlb.access_cycles 552008 # Cycles spent accessing this TLB level
+system.l2_tlb.page_table_cycles 0 # Cycles spent accessing the page table
+system.l2_tlb.unique_pages 5 # Number of unique pages touched
+system.l2_tlb.local_cycles 69001 # Number of cycles spent in queue for all incoming reqs
+system.l2_tlb.local_latency 8625.125000 # Avg. latency over incoming coalesced reqs
+system.l2_tlb.avg_reuse_distance 0 # avg. reuse distance over all pages (in ticks)
+system.l3_coalescer.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.l3_coalescer.clk_domain.clock 1000 # Clock period in ticks
+system.l3_coalescer.uncoalesced_accesses 5 # Number of uncoalesced TLB accesses
+system.l3_coalescer.coalesced_accesses 1 # Number of coalesced TLB accesses
+system.l3_coalescer.queuing_cycles 8000 # Number of cycles spent in queue
+system.l3_coalescer.local_queuing_cycles 1000 # Number of cycles spent in queue for all incoming reqs
+system.l3_coalescer.local_latency 200 # Avg. latency over all incoming pkts
+system.l3_tlb.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.l3_tlb.clk_domain.clock 1000 # Clock period in ticks
+system.l3_tlb.local_TLB_accesses 5 # Number of TLB accesses
+system.l3_tlb.local_TLB_hits 0 # Number of TLB hits
+system.l3_tlb.local_TLB_misses 5 # Number of TLB misses
+system.l3_tlb.local_TLB_miss_rate 100 # TLB miss rate
+system.l3_tlb.global_TLB_accesses 12 # Number of TLB accesses
+system.l3_tlb.global_TLB_hits 0 # Number of TLB hits
+system.l3_tlb.global_TLB_misses 12 # Number of TLB misses
+system.l3_tlb.global_TLB_miss_rate 100 # TLB miss rate
+system.l3_tlb.access_cycles 1200000 # Cycles spent accessing this TLB level
+system.l3_tlb.page_table_cycles 6000000 # Cycles spent accessing the page table
+system.l3_tlb.unique_pages 5 # Number of unique pages touched
+system.l3_tlb.local_cycles 150000 # Number of cycles spent in queue for all incoming reqs
+system.l3_tlb.local_latency 30000 # Avg. latency over incoming coalesced reqs
+system.l3_tlb.avg_reuse_distance 0 # avg. reuse distance over all pages (in ticks)
+system.piobus.trans_dist::WriteReq 94 # Transaction distribution
+system.piobus.trans_dist::WriteResp 94 # Transaction distribution
+system.piobus.pkt_count_system.cp_cntrl0.sequencer.mem-master-port::system.cpu2.pio 188 # Packet count per connected master and slave (bytes)
+system.piobus.pkt_count::total 188 # Packet count per connected master and slave (bytes)
+system.piobus.pkt_size_system.cp_cntrl0.sequencer.mem-master-port::system.cpu2.pio 748 # Cumulative packet size per connected master and slave (bytes)
+system.piobus.pkt_size::total 748 # Cumulative packet size per connected master and slave (bytes)
+system.piobus.reqLayer0.occupancy 188000 # Layer occupancy (ticks)
+system.piobus.reqLayer0.utilization 0.0 # Layer utilization (%)
+system.piobus.respLayer0.occupancy 94000 # Layer occupancy (ticks)
+system.piobus.respLayer0.utilization 0.0 # Layer utilization (%)
+system.ruby.network.ext_links0.int_node.percent_links_utilized 0.130525
+system.ruby.network.ext_links0.int_node.msg_count.Control::0 4
+system.ruby.network.ext_links0.int_node.msg_count.Data::0 18
+system.ruby.network.ext_links0.int_node.msg_count.Request_Control::0 1542
+system.ruby.network.ext_links0.int_node.msg_count.Response_Data::2 1546
+system.ruby.network.ext_links0.int_node.msg_count.Response_Control::2 2
+system.ruby.network.ext_links0.int_node.msg_count.Writeback_Control::2 16
+system.ruby.network.ext_links0.int_node.msg_count.Unblock_Control::4 1541
+system.ruby.network.ext_links0.int_node.msg_bytes.Control::0 32
+system.ruby.network.ext_links0.int_node.msg_bytes.Data::0 1296
+system.ruby.network.ext_links0.int_node.msg_bytes.Request_Control::0 12336
+system.ruby.network.ext_links0.int_node.msg_bytes.Response_Data::2 111312
+system.ruby.network.ext_links0.int_node.msg_bytes.Response_Control::2 16
+system.ruby.network.ext_links0.int_node.msg_bytes.Writeback_Control::2 128
+system.ruby.network.ext_links0.int_node.msg_bytes.Unblock_Control::4 12328
+system.ruby.network.ext_links1.int_node.percent_links_utilized 0.192653
+system.ruby.network.ext_links1.int_node.msg_count.Control::0 3
+system.ruby.network.ext_links1.int_node.msg_count.Request_Control::0 1535
+system.ruby.network.ext_links1.int_node.msg_count.Response_Data::2 1537
+system.ruby.network.ext_links1.int_node.msg_count.Response_Control::2 1
+system.ruby.network.ext_links1.int_node.msg_count.Unblock_Control::4 1534
+system.ruby.network.ext_links1.int_node.msg_bytes.Control::0 24
+system.ruby.network.ext_links1.int_node.msg_bytes.Request_Control::0 12280
+system.ruby.network.ext_links1.int_node.msg_bytes.Response_Data::2 110664
+system.ruby.network.ext_links1.int_node.msg_bytes.Response_Control::2 8
+system.ruby.network.ext_links1.int_node.msg_bytes.Unblock_Control::4 12272
+system.tcp_cntrl0.L1cache.demand_hits 0 # Number of cache demand hits
+system.tcp_cntrl0.L1cache.demand_misses 0 # Number of cache demand misses
+system.tcp_cntrl0.L1cache.demand_accesses 0 # Number of cache demand accesses
+system.tcp_cntrl0.L1cache.num_data_array_reads 6 # number of data array reads
+system.tcp_cntrl0.L1cache.num_data_array_writes 11 # number of data array writes
+system.tcp_cntrl0.L1cache.num_tag_array_reads 1297 # number of tag array reads
+system.tcp_cntrl0.L1cache.num_tag_array_writes 11 # number of tag array writes
+system.tcp_cntrl0.L1cache.num_tag_array_stalls 1271 # number of stalls caused by tag array
+system.tcp_cntrl0.L1cache.num_data_array_stalls 2 # number of stalls caused by data array
+system.tcp_cntrl0.coalescer.gpu_tcp_ld_hits 0 # loads that hit in the TCP
+system.tcp_cntrl0.coalescer.gpu_tcp_ld_transfers 0 # TCP to TCP load transfers
+system.tcp_cntrl0.coalescer.gpu_tcc_ld_hits 0 # loads that hit in the TCC
+system.tcp_cntrl0.coalescer.gpu_ld_misses 5 # loads that miss in the GPU
+system.tcp_cntrl0.coalescer.gpu_tcp_st_hits 0 # stores that hit in the TCP
+system.tcp_cntrl0.coalescer.gpu_tcp_st_transfers 0 # TCP to TCP store transfers
+system.tcp_cntrl0.coalescer.gpu_tcc_st_hits 0 # stores that hit in the TCC
+system.tcp_cntrl0.coalescer.gpu_st_misses 9 # stores that miss in the GPU
+system.tcp_cntrl0.coalescer.cp_tcp_ld_hits 0 # loads that hit in the TCP
+system.tcp_cntrl0.coalescer.cp_tcp_ld_transfers 0 # TCP to TCP load transfers
+system.tcp_cntrl0.coalescer.cp_tcc_ld_hits 0 # loads that hit in the TCC
+system.tcp_cntrl0.coalescer.cp_ld_misses 0 # loads that miss in the GPU
+system.tcp_cntrl0.coalescer.cp_tcp_st_hits 0 # stores that hit in the TCP
+system.tcp_cntrl0.coalescer.cp_tcp_st_transfers 0 # TCP to TCP store transfers
+system.tcp_cntrl0.coalescer.cp_tcc_st_hits 0 # stores that hit in the TCC
+system.tcp_cntrl0.coalescer.cp_st_misses 0 # stores that miss in the GPU
+system.ruby.network.ext_links2.int_node.percent_links_utilized 0.002557
+system.ruby.network.ext_links2.int_node.msg_count.Control::0 1
+system.ruby.network.ext_links2.int_node.msg_count.Data::0 18
+system.ruby.network.ext_links2.int_node.msg_count.Data::1 18
+system.ruby.network.ext_links2.int_node.msg_count.Request_Control::0 7
+system.ruby.network.ext_links2.int_node.msg_count.Request_Control::1 9
+system.ruby.network.ext_links2.int_node.msg_count.Response_Data::2 9
+system.ruby.network.ext_links2.int_node.msg_count.Response_Data::3 11
+system.ruby.network.ext_links2.int_node.msg_count.Response_Control::2 1
+system.ruby.network.ext_links2.int_node.msg_count.Writeback_Control::2 16
+system.ruby.network.ext_links2.int_node.msg_count.Writeback_Control::3 16
+system.ruby.network.ext_links2.int_node.msg_count.Unblock_Control::4 7
+system.ruby.network.ext_links2.int_node.msg_bytes.Control::0 8
+system.ruby.network.ext_links2.int_node.msg_bytes.Data::0 1296
+system.ruby.network.ext_links2.int_node.msg_bytes.Data::1 1296
+system.ruby.network.ext_links2.int_node.msg_bytes.Request_Control::0 56
+system.ruby.network.ext_links2.int_node.msg_bytes.Request_Control::1 72
+system.ruby.network.ext_links2.int_node.msg_bytes.Response_Data::2 648
+system.ruby.network.ext_links2.int_node.msg_bytes.Response_Data::3 792
+system.ruby.network.ext_links2.int_node.msg_bytes.Response_Control::2 8
+system.ruby.network.ext_links2.int_node.msg_bytes.Writeback_Control::2 128
+system.ruby.network.ext_links2.int_node.msg_bytes.Writeback_Control::3 128
+system.ruby.network.ext_links2.int_node.msg_bytes.Unblock_Control::4 56
+system.tcp_cntrl1.L1cache.demand_hits 0 # Number of cache demand hits
+system.tcp_cntrl1.L1cache.demand_misses 0 # Number of cache demand misses
+system.tcp_cntrl1.L1cache.demand_accesses 0 # Number of cache demand accesses
+system.tcp_cntrl1.L1cache.num_data_array_reads 6 # number of data array reads
+system.tcp_cntrl1.L1cache.num_data_array_writes 11 # number of data array writes
+system.tcp_cntrl1.L1cache.num_tag_array_reads 1297 # number of tag array reads
+system.tcp_cntrl1.L1cache.num_tag_array_writes 11 # number of tag array writes
+system.tcp_cntrl1.L1cache.num_tag_array_stalls 1271 # number of stalls caused by tag array
+system.tcp_cntrl1.L1cache.num_data_array_stalls 2 # number of stalls caused by data array
+system.tcp_cntrl1.coalescer.gpu_tcp_ld_hits 0 # loads that hit in the TCP
+system.tcp_cntrl1.coalescer.gpu_tcp_ld_transfers 0 # TCP to TCP load transfers
+system.tcp_cntrl1.coalescer.gpu_tcc_ld_hits 0 # loads that hit in the TCC
+system.tcp_cntrl1.coalescer.gpu_ld_misses 5 # loads that miss in the GPU
+system.tcp_cntrl1.coalescer.gpu_tcp_st_hits 0 # stores that hit in the TCP
+system.tcp_cntrl1.coalescer.gpu_tcp_st_transfers 0 # TCP to TCP store transfers
+system.tcp_cntrl1.coalescer.gpu_tcc_st_hits 0 # stores that hit in the TCC
+system.tcp_cntrl1.coalescer.gpu_st_misses 9 # stores that miss in the GPU
+system.tcp_cntrl1.coalescer.cp_tcp_ld_hits 0 # loads that hit in the TCP
+system.tcp_cntrl1.coalescer.cp_tcp_ld_transfers 0 # TCP to TCP load transfers
+system.tcp_cntrl1.coalescer.cp_tcc_ld_hits 0 # loads that hit in the TCC
+system.tcp_cntrl1.coalescer.cp_ld_misses 0 # loads that miss in the GPU
+system.tcp_cntrl1.coalescer.cp_tcp_st_hits 0 # stores that hit in the TCP
+system.tcp_cntrl1.coalescer.cp_tcp_st_transfers 0 # TCP to TCP store transfers
+system.tcp_cntrl1.coalescer.cp_tcc_st_hits 0 # stores that hit in the TCC
+system.tcp_cntrl1.coalescer.cp_st_misses 0 # stores that miss in the GPU
+system.sqc_cntrl0.L1cache.demand_hits 0 # Number of cache demand hits
+system.sqc_cntrl0.L1cache.demand_misses 0 # Number of cache demand misses
+system.sqc_cntrl0.L1cache.demand_accesses 0 # Number of cache demand accesses
+system.sqc_cntrl0.L1cache.num_data_array_reads 86 # number of data array reads
+system.sqc_cntrl0.L1cache.num_tag_array_reads 91 # number of tag array reads
+system.sqc_cntrl0.L1cache.num_tag_array_writes 10 # number of tag array writes
+system.sqc_cntrl0.sequencer.load_waiting_on_load 98 # Number of times a load aliased with a pending load
+system.tcc_cntrl0.L2cache.demand_hits 0 # Number of cache demand hits
+system.tcc_cntrl0.L2cache.demand_misses 0 # Number of cache demand misses
+system.tcc_cntrl0.L2cache.demand_accesses 0 # Number of cache demand accesses
+system.tcc_cntrl0.L2cache.num_data_array_writes 9 # number of data array writes
+system.tcc_cntrl0.L2cache.num_tag_array_reads 35 # number of tag array reads
+system.tcc_cntrl0.L2cache.num_tag_array_writes 11 # number of tag array writes
+system.ruby.network.msg_count.Control 8
+system.ruby.network.msg_count.Data 54
+system.ruby.network.msg_count.Request_Control 3093
+system.ruby.network.msg_count.Response_Data 3103
+system.ruby.network.msg_count.Response_Control 4
+system.ruby.network.msg_count.Writeback_Control 48
+system.ruby.network.msg_count.Unblock_Control 3082
+system.ruby.network.msg_byte.Control 64
+system.ruby.network.msg_byte.Data 3888
+system.ruby.network.msg_byte.Request_Control 24744
+system.ruby.network.msg_byte.Response_Data 223416
+system.ruby.network.msg_byte.Response_Control 32
+system.ruby.network.msg_byte.Writeback_Control 384
+system.ruby.network.msg_byte.Unblock_Control 24656
+system.sqc_coalescer.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.sqc_coalescer.clk_domain.clock 1000 # Clock period in ticks
+system.sqc_coalescer.uncoalesced_accesses 86 # Number of uncoalesced TLB accesses
+system.sqc_coalescer.coalesced_accesses 66 # Number of coalesced TLB accesses
+system.sqc_coalescer.queuing_cycles 288000 # Number of cycles spent in queue
+system.sqc_coalescer.local_queuing_cycles 288000 # Number of cycles spent in queue for all incoming reqs
+system.sqc_coalescer.local_latency 3348.837209 # Avg. latency over all incoming pkts
+system.sqc_tlb.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.sqc_tlb.clk_domain.clock 1000 # Clock period in ticks
+system.sqc_tlb.local_TLB_accesses 66 # Number of TLB accesses
+system.sqc_tlb.local_TLB_hits 65 # Number of TLB hits
+system.sqc_tlb.local_TLB_misses 1 # Number of TLB misses
+system.sqc_tlb.local_TLB_miss_rate 1.515152 # TLB miss rate
+system.sqc_tlb.global_TLB_accesses 86 # Number of TLB accesses
+system.sqc_tlb.global_TLB_hits 78 # Number of TLB hits
+system.sqc_tlb.global_TLB_misses 8 # Number of TLB misses
+system.sqc_tlb.global_TLB_miss_rate 9.302326 # TLB miss rate
+system.sqc_tlb.access_cycles 86008 # Cycles spent accessing this TLB level
+system.sqc_tlb.page_table_cycles 0 # Cycles spent accessing the page table
+system.sqc_tlb.unique_pages 1 # Number of unique pages touched
+system.sqc_tlb.local_cycles 66001 # Number of cycles spent in queue for all incoming reqs
+system.sqc_tlb.local_latency 1000.015152 # Avg. latency over incoming coalesced reqs
+system.sqc_tlb.avg_reuse_distance 0 # avg. reuse distance over all pages (in ticks)
+system.ruby.network.ext_links0.int_node.throttle0.link_utilization 0.074413
+system.ruby.network.ext_links0.int_node.throttle0.msg_count.Data::0 18
+system.ruby.network.ext_links0.int_node.throttle0.msg_count.Request_Control::0 1542
+system.ruby.network.ext_links0.int_node.throttle0.msg_count.Response_Data::2 2
+system.ruby.network.ext_links0.int_node.throttle0.msg_count.Response_Control::2 2
+system.ruby.network.ext_links0.int_node.throttle0.msg_count.Unblock_Control::4 1541
+system.ruby.network.ext_links0.int_node.throttle0.msg_bytes.Data::0 1296
+system.ruby.network.ext_links0.int_node.throttle0.msg_bytes.Request_Control::0 12336
+system.ruby.network.ext_links0.int_node.throttle0.msg_bytes.Response_Data::2 144
+system.ruby.network.ext_links0.int_node.throttle0.msg_bytes.Response_Control::2 16
+system.ruby.network.ext_links0.int_node.throttle0.msg_bytes.Unblock_Control::4 12328
+system.ruby.network.ext_links0.int_node.throttle1.link_utilization 0.314928
+system.ruby.network.ext_links0.int_node.throttle1.msg_count.Control::0 3
+system.ruby.network.ext_links0.int_node.throttle1.msg_count.Response_Data::2 1535
+system.ruby.network.ext_links0.int_node.throttle1.msg_bytes.Control::0 24
+system.ruby.network.ext_links0.int_node.throttle1.msg_bytes.Response_Data::2 110520
+system.ruby.network.ext_links0.int_node.throttle2.link_utilization 0.002234
+system.ruby.network.ext_links0.int_node.throttle2.msg_count.Control::0 1
+system.ruby.network.ext_links0.int_node.throttle2.msg_count.Response_Data::2 9
+system.ruby.network.ext_links0.int_node.throttle2.msg_count.Writeback_Control::2 16
+system.ruby.network.ext_links0.int_node.throttle2.msg_bytes.Control::0 8
+system.ruby.network.ext_links0.int_node.throttle2.msg_bytes.Response_Data::2 648
+system.ruby.network.ext_links0.int_node.throttle2.msg_bytes.Writeback_Control::2 128
+system.ruby.network.ext_links1.int_node.throttle0.link_utilization 0.314928
+system.ruby.network.ext_links1.int_node.throttle0.msg_count.Control::0 3
+system.ruby.network.ext_links1.int_node.throttle0.msg_count.Response_Data::2 1535
+system.ruby.network.ext_links1.int_node.throttle0.msg_bytes.Control::0 24
+system.ruby.network.ext_links1.int_node.throttle0.msg_bytes.Response_Data::2 110520
+system.ruby.network.ext_links1.int_node.throttle1.link_utilization 0.070379
+system.ruby.network.ext_links1.int_node.throttle1.msg_count.Request_Control::0 1535
+system.ruby.network.ext_links1.int_node.throttle1.msg_count.Response_Data::2 2
+system.ruby.network.ext_links1.int_node.throttle1.msg_count.Response_Control::2 1
+system.ruby.network.ext_links1.int_node.throttle1.msg_count.Unblock_Control::4 1534
+system.ruby.network.ext_links1.int_node.throttle1.msg_bytes.Request_Control::0 12280
+system.ruby.network.ext_links1.int_node.throttle1.msg_bytes.Response_Data::2 144
+system.ruby.network.ext_links1.int_node.throttle1.msg_bytes.Response_Control::2 8
+system.ruby.network.ext_links1.int_node.throttle1.msg_bytes.Unblock_Control::4 12272
+system.ruby.network.ext_links2.int_node.throttle0.link_utilization 0.000798
+system.ruby.network.ext_links2.int_node.throttle0.msg_count.Response_Data::3 3
+system.ruby.network.ext_links2.int_node.throttle0.msg_count.Writeback_Control::3 8
+system.ruby.network.ext_links2.int_node.throttle0.msg_bytes.Response_Data::3 216
+system.ruby.network.ext_links2.int_node.throttle0.msg_bytes.Writeback_Control::3 64
+system.ruby.network.ext_links2.int_node.throttle1.link_utilization 0.000798
+system.ruby.network.ext_links2.int_node.throttle1.msg_count.Response_Data::3 3
+system.ruby.network.ext_links2.int_node.throttle1.msg_count.Writeback_Control::3 8
+system.ruby.network.ext_links2.int_node.throttle1.msg_bytes.Response_Data::3 216
+system.ruby.network.ext_links2.int_node.throttle1.msg_bytes.Writeback_Control::3 64
+system.ruby.network.ext_links2.int_node.throttle2.link_utilization 0.006131
+system.ruby.network.ext_links2.int_node.throttle2.msg_count.Control::0 1
+system.ruby.network.ext_links2.int_node.throttle2.msg_count.Data::1 18
+system.ruby.network.ext_links2.int_node.throttle2.msg_count.Request_Control::1 9
+system.ruby.network.ext_links2.int_node.throttle2.msg_count.Response_Data::2 9
+system.ruby.network.ext_links2.int_node.throttle2.msg_count.Writeback_Control::2 16
+system.ruby.network.ext_links2.int_node.throttle2.msg_bytes.Control::0 8
+system.ruby.network.ext_links2.int_node.throttle2.msg_bytes.Data::1 1296
+system.ruby.network.ext_links2.int_node.throttle2.msg_bytes.Request_Control::1 72
+system.ruby.network.ext_links2.int_node.throttle2.msg_bytes.Response_Data::2 648
+system.ruby.network.ext_links2.int_node.throttle2.msg_bytes.Writeback_Control::2 128
+system.ruby.network.ext_links2.int_node.throttle3.link_utilization 0.001026
+system.ruby.network.ext_links2.int_node.throttle3.msg_count.Response_Data::3 5
+system.ruby.network.ext_links2.int_node.throttle3.msg_bytes.Response_Data::3 360
+system.ruby.network.ext_links2.int_node.throttle4.link_utilization 0.004034
+system.ruby.network.ext_links2.int_node.throttle4.msg_count.Data::0 18
+system.ruby.network.ext_links2.int_node.throttle4.msg_count.Request_Control::0 7
+system.ruby.network.ext_links2.int_node.throttle4.msg_count.Response_Control::2 1
+system.ruby.network.ext_links2.int_node.throttle4.msg_count.Unblock_Control::4 7
+system.ruby.network.ext_links2.int_node.throttle4.msg_bytes.Data::0 1296
+system.ruby.network.ext_links2.int_node.throttle4.msg_bytes.Request_Control::0 56
+system.ruby.network.ext_links2.int_node.throttle4.msg_bytes.Response_Control::2 8
+system.ruby.network.ext_links2.int_node.throttle4.msg_bytes.Unblock_Control::4 56
+system.ruby.CorePair_Controller.C0_Load_L1miss 180 0.00% 0.00%
+system.ruby.CorePair_Controller.C0_Load_L1hit 16155 0.00% 0.00%
+system.ruby.CorePair_Controller.Ifetch0_L1hit 86007 0.00% 0.00%
+system.ruby.CorePair_Controller.Ifetch0_L1miss 1088 0.00% 0.00%
+system.ruby.CorePair_Controller.C0_Store_L1miss 325 0.00% 0.00%
+system.ruby.CorePair_Controller.C0_Store_L1hit 10448 0.00% 0.00%
+system.ruby.CorePair_Controller.NB_AckS 1034 0.00% 0.00%
+system.ruby.CorePair_Controller.NB_AckM 326 0.00% 0.00%
+system.ruby.CorePair_Controller.NB_AckE 175 0.00% 0.00%
+system.ruby.CorePair_Controller.L1I_Repl 589 0.00% 0.00%
+system.ruby.CorePair_Controller.L1D0_Repl 24 0.00% 0.00%
+system.ruby.CorePair_Controller.L2_to_L1D0 5 0.00% 0.00%
+system.ruby.CorePair_Controller.L2_to_L1I 54 0.00% 0.00%
+system.ruby.CorePair_Controller.PrbInvData 1 0.00% 0.00%
+system.ruby.CorePair_Controller.PrbShrData 2 0.00% 0.00%
+system.ruby.CorePair_Controller.I.C0_Load_L1miss 175 0.00% 0.00%
+system.ruby.CorePair_Controller.I.Ifetch0_L1miss 1034 0.00% 0.00%
+system.ruby.CorePair_Controller.I.C0_Store_L1miss 325 0.00% 0.00%
+system.ruby.CorePair_Controller.S.Ifetch0_L1hit 86007 0.00% 0.00%
+system.ruby.CorePair_Controller.S.Ifetch0_L1miss 54 0.00% 0.00%
+system.ruby.CorePair_Controller.S.L1I_Repl 589 0.00% 0.00%
+system.ruby.CorePair_Controller.E0.C0_Load_L1miss 2 0.00% 0.00%
+system.ruby.CorePair_Controller.E0.C0_Load_L1hit 3356 0.00% 0.00%
+system.ruby.CorePair_Controller.E0.C0_Store_L1hit 46 0.00% 0.00%
+system.ruby.CorePair_Controller.E0.L1D0_Repl 16 0.00% 0.00%
+system.ruby.CorePair_Controller.E0.PrbShrData 1 0.00% 0.00%
+system.ruby.CorePair_Controller.O.C0_Load_L1hit 3 0.00% 0.00%
+system.ruby.CorePair_Controller.O.C0_Store_L1hit 1 0.00% 0.00%
+system.ruby.CorePair_Controller.M0.C0_Load_L1miss 3 0.00% 0.00%
+system.ruby.CorePair_Controller.M0.C0_Load_L1hit 12796 0.00% 0.00%
+system.ruby.CorePair_Controller.M0.C0_Store_L1hit 10401 0.00% 0.00%
+system.ruby.CorePair_Controller.M0.L1D0_Repl 8 0.00% 0.00%
+system.ruby.CorePair_Controller.M0.PrbInvData 1 0.00% 0.00%
+system.ruby.CorePair_Controller.M0.PrbShrData 1 0.00% 0.00%
+system.ruby.CorePair_Controller.I_M0.NB_AckM 325 0.00% 0.00%
+system.ruby.CorePair_Controller.I_E0S.NB_AckE 175 0.00% 0.00%
+system.ruby.CorePair_Controller.Si_F0.L2_to_L1I 54 0.00% 0.00%
+system.ruby.CorePair_Controller.O_M0.NB_AckM 1 0.00% 0.00%
+system.ruby.CorePair_Controller.S0.NB_AckS 1034 0.00% 0.00%
+system.ruby.CorePair_Controller.E0_F.L2_to_L1D0 2 0.00% 0.00%
+system.ruby.CorePair_Controller.M0_F.L2_to_L1D0 3 0.00% 0.00%
+system.ruby.Directory_Controller.RdBlkS 1034 0.00% 0.00%
+system.ruby.Directory_Controller.RdBlkM 326 0.00% 0.00%
+system.ruby.Directory_Controller.RdBlk 182 0.00% 0.00%
+system.ruby.Directory_Controller.WriteThrough 16 0.00% 0.00%
+system.ruby.Directory_Controller.Atomic 3 0.00% 0.00%
+system.ruby.Directory_Controller.CPUPrbResp 4 0.00% 0.00%
+system.ruby.Directory_Controller.ProbeAcksComplete 1560 0.00% 0.00%
+system.ruby.Directory_Controller.MemData 1560 0.00% 0.00%
+system.ruby.Directory_Controller.CoreUnblock 1541 0.00% 0.00%
+system.ruby.Directory_Controller.UnblockWriteThrough 18 0.00% 0.00%
+system.ruby.Directory_Controller.U.RdBlkS 1034 0.00% 0.00%
+system.ruby.Directory_Controller.U.RdBlkM 326 0.00% 0.00%
+system.ruby.Directory_Controller.U.RdBlk 182 0.00% 0.00%
+system.ruby.Directory_Controller.U.WriteThrough 16 0.00% 0.00%
+system.ruby.Directory_Controller.U.Atomic 2 0.00% 0.00%
+system.ruby.Directory_Controller.BS_M.MemData 1034 0.00% 0.00%
+system.ruby.Directory_Controller.BM_M.MemData 343 0.00% 0.00%
+system.ruby.Directory_Controller.B_M.MemData 180 0.00% 0.00%
+system.ruby.Directory_Controller.BS_PM.ProbeAcksComplete 1034 0.00% 0.00%
+system.ruby.Directory_Controller.BM_PM.Atomic 1 0.00% 0.00%
+system.ruby.Directory_Controller.BM_PM.CPUPrbResp 1 0.00% 0.00%
+system.ruby.Directory_Controller.BM_PM.ProbeAcksComplete 343 0.00% 0.00%
+system.ruby.Directory_Controller.BM_PM.MemData 1 0.00% 0.00%
+system.ruby.Directory_Controller.B_PM.ProbeAcksComplete 180 0.00% 0.00%
+system.ruby.Directory_Controller.B_PM.MemData 2 0.00% 0.00%
+system.ruby.Directory_Controller.BM_Pm.CPUPrbResp 1 0.00% 0.00%
+system.ruby.Directory_Controller.BM_Pm.ProbeAcksComplete 1 0.00% 0.00%
+system.ruby.Directory_Controller.B_Pm.CPUPrbResp 2 0.00% 0.00%
+system.ruby.Directory_Controller.B_Pm.ProbeAcksComplete 2 0.00% 0.00%
+system.ruby.Directory_Controller.B.CoreUnblock 1541 0.00% 0.00%
+system.ruby.Directory_Controller.B.UnblockWriteThrough 18 0.00% 0.00%
+system.ruby.LD.latency_hist::bucket_size 64
+system.ruby.LD.latency_hist::max_bucket 639
+system.ruby.LD.latency_hist::samples 16335
+system.ruby.LD.latency_hist::mean 3.253444
+system.ruby.LD.latency_hist::gmean 1.059859
+system.ruby.LD.latency_hist::stdev 21.887471
+system.ruby.LD.latency_hist | 16160 98.93% 98.93% | 0 0.00% 98.93% | 0 0.00% 98.93% | 170 1.04% 99.97% | 1 0.01% 99.98% | 1 0.01% 99.98% | 2 0.01% 99.99% | 1 0.01% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.LD.latency_hist::total 16335
+system.ruby.LD.hit_latency_hist::bucket_size 64
+system.ruby.LD.hit_latency_hist::max_bucket 639
+system.ruby.LD.hit_latency_hist::samples 175
+system.ruby.LD.hit_latency_hist::mean 210.828571
+system.ruby.LD.hit_latency_hist::gmean 209.031405
+system.ruby.LD.hit_latency_hist::stdev 34.022715
+system.ruby.LD.hit_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 170 97.14% 97.14% | 1 0.57% 97.71% | 1 0.57% 98.29% | 2 1.14% 99.43% | 1 0.57% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.LD.hit_latency_hist::total 175
+system.ruby.LD.miss_latency_hist::bucket_size 2
+system.ruby.LD.miss_latency_hist::max_bucket 19
+system.ruby.LD.miss_latency_hist::samples 16160
+system.ruby.LD.miss_latency_hist::mean 1.005569
+system.ruby.LD.miss_latency_hist::gmean 1.000911
+system.ruby.LD.miss_latency_hist::stdev 0.316580
+system.ruby.LD.miss_latency_hist | 16155 99.97% 99.97% | 0 0.00% 99.97% | 0 0.00% 99.97% | 0 0.00% 99.97% | 0 0.00% 99.97% | 0 0.00% 99.97% | 0 0.00% 99.97% | 0 0.00% 99.97% | 0 0.00% 99.97% | 5 0.03% 100.00%
+system.ruby.LD.miss_latency_hist::total 16160
+system.ruby.ST.latency_hist::bucket_size 64
+system.ruby.ST.latency_hist::max_bucket 639
+system.ruby.ST.latency_hist::samples 10412
+system.ruby.ST.latency_hist::mean 7.384076
+system.ruby.ST.latency_hist::gmean 1.178989
+system.ruby.ST.latency_hist::stdev 36.341010
+system.ruby.ST.latency_hist | 10090 96.91% 96.91% | 0 0.00% 96.91% | 0 0.00% 96.91% | 309 2.97% 99.88% | 4 0.04% 99.91% | 2 0.02% 99.93% | 3 0.03% 99.96% | 4 0.04% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.ST.latency_hist::total 10412
+system.ruby.ST.hit_latency_hist::bucket_size 64
+system.ruby.ST.hit_latency_hist::max_bucket 639
+system.ruby.ST.hit_latency_hist::samples 322
+system.ruby.ST.hit_latency_hist::mean 207.431677
+system.ruby.ST.hit_latency_hist::gmean 205.258691
+system.ruby.ST.hit_latency_hist::stdev 37.529677
+system.ruby.ST.hit_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 309 95.96% 95.96% | 4 1.24% 97.20% | 2 0.62% 97.83% | 3 0.93% 98.76% | 4 1.24% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.ST.hit_latency_hist::total 322
+system.ruby.ST.miss_latency_hist::bucket_size 1
+system.ruby.ST.miss_latency_hist::max_bucket 9
+system.ruby.ST.miss_latency_hist::samples 10090
+system.ruby.ST.miss_latency_hist::mean 1
+system.ruby.ST.miss_latency_hist::gmean 1
+system.ruby.ST.miss_latency_hist | 0 0.00% 0.00% | 10090 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.ST.miss_latency_hist::total 10090
+system.ruby.IFETCH.latency_hist::bucket_size 64
+system.ruby.IFETCH.latency_hist::max_bucket 639
+system.ruby.IFETCH.latency_hist::samples 87095
+system.ruby.IFETCH.latency_hist::mean 3.432677
+system.ruby.IFETCH.latency_hist::gmean 1.067087
+system.ruby.IFETCH.latency_hist::stdev 22.344689
+system.ruby.IFETCH.latency_hist | 86061 98.81% 98.81% | 0 0.00% 98.81% | 0 0.00% 98.81% | 1006 1.16% 99.97% | 5 0.01% 99.97% | 10 0.01% 99.99% | 11 0.01% 100.00% | 2 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.IFETCH.latency_hist::total 87095
+system.ruby.IFETCH.hit_latency_hist::bucket_size 64
+system.ruby.IFETCH.hit_latency_hist::max_bucket 639
+system.ruby.IFETCH.hit_latency_hist::samples 1034
+system.ruby.IFETCH.hit_latency_hist::mean 204.967118
+system.ruby.IFETCH.hit_latency_hist::gmean 203.475698
+system.ruby.IFETCH.hit_latency_hist::stdev 30.573589
+system.ruby.IFETCH.hit_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 1006 97.29% 97.29% | 5 0.48% 97.78% | 10 0.97% 98.74% | 11 1.06% 99.81% | 2 0.19% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.IFETCH.hit_latency_hist::total 1034
+system.ruby.IFETCH.miss_latency_hist::bucket_size 2
+system.ruby.IFETCH.miss_latency_hist::max_bucket 19
+system.ruby.IFETCH.miss_latency_hist::samples 86061
+system.ruby.IFETCH.miss_latency_hist::mean 1.011294
+system.ruby.IFETCH.miss_latency_hist::gmean 1.001849
+system.ruby.IFETCH.miss_latency_hist::stdev 0.450747
+system.ruby.IFETCH.miss_latency_hist | 86007 99.94% 99.94% | 0 0.00% 99.94% | 0 0.00% 99.94% | 0 0.00% 99.94% | 0 0.00% 99.94% | 0 0.00% 99.94% | 0 0.00% 99.94% | 0 0.00% 99.94% | 0 0.00% 99.94% | 54 0.06% 100.00%
+system.ruby.IFETCH.miss_latency_hist::total 86061
+system.ruby.RMW_Read.latency_hist::bucket_size 32
+system.ruby.RMW_Read.latency_hist::max_bucket 319
+system.ruby.RMW_Read.latency_hist::samples 341
+system.ruby.RMW_Read.latency_hist::mean 3.451613
+system.ruby.RMW_Read.latency_hist::gmean 1.064718
+system.ruby.RMW_Read.latency_hist::stdev 22.561449
+system.ruby.RMW_Read.latency_hist | 337 98.83% 98.83% | 0 0.00% 98.83% | 0 0.00% 98.83% | 0 0.00% 98.83% | 0 0.00% 98.83% | 0 0.00% 98.83% | 3 0.88% 99.71% | 1 0.29% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.RMW_Read.latency_hist::total 341
+system.ruby.RMW_Read.hit_latency_hist::bucket_size 32
+system.ruby.RMW_Read.hit_latency_hist::max_bucket 319
+system.ruby.RMW_Read.hit_latency_hist::samples 4
+system.ruby.RMW_Read.hit_latency_hist::mean 210
+system.ruby.RMW_Read.hit_latency_hist::gmean 209.766277
+system.ruby.RMW_Read.hit_latency_hist::stdev 11.430952
+system.ruby.RMW_Read.hit_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 3 75.00% 75.00% | 1 25.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.RMW_Read.hit_latency_hist::total 4
+system.ruby.RMW_Read.miss_latency_hist::bucket_size 1
+system.ruby.RMW_Read.miss_latency_hist::max_bucket 9
+system.ruby.RMW_Read.miss_latency_hist::samples 337
+system.ruby.RMW_Read.miss_latency_hist::mean 1
+system.ruby.RMW_Read.miss_latency_hist::gmean 1
+system.ruby.RMW_Read.miss_latency_hist | 0 0.00% 0.00% | 337 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.RMW_Read.miss_latency_hist::total 337
+system.ruby.Locked_RMW_Read.latency_hist::bucket_size 1
+system.ruby.Locked_RMW_Read.latency_hist::max_bucket 9
+system.ruby.Locked_RMW_Read.latency_hist::samples 10
+system.ruby.Locked_RMW_Read.latency_hist::mean 1
+system.ruby.Locked_RMW_Read.latency_hist::gmean 1
+system.ruby.Locked_RMW_Read.latency_hist | 0 0.00% 0.00% | 10 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.Locked_RMW_Read.latency_hist::total 10
+system.ruby.Locked_RMW_Read.miss_latency_hist::bucket_size 1
+system.ruby.Locked_RMW_Read.miss_latency_hist::max_bucket 9
+system.ruby.Locked_RMW_Read.miss_latency_hist::samples 10
+system.ruby.Locked_RMW_Read.miss_latency_hist::mean 1
+system.ruby.Locked_RMW_Read.miss_latency_hist::gmean 1
+system.ruby.Locked_RMW_Read.miss_latency_hist | 0 0.00% 0.00% | 10 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.Locked_RMW_Read.miss_latency_hist::total 10
+system.ruby.Locked_RMW_Write.latency_hist::bucket_size 1
+system.ruby.Locked_RMW_Write.latency_hist::max_bucket 9
+system.ruby.Locked_RMW_Write.latency_hist::samples 10
+system.ruby.Locked_RMW_Write.latency_hist::mean 1
+system.ruby.Locked_RMW_Write.latency_hist::gmean 1
+system.ruby.Locked_RMW_Write.latency_hist | 0 0.00% 0.00% | 10 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.Locked_RMW_Write.latency_hist::total 10
+system.ruby.Locked_RMW_Write.miss_latency_hist::bucket_size 1
+system.ruby.Locked_RMW_Write.miss_latency_hist::max_bucket 9
+system.ruby.Locked_RMW_Write.miss_latency_hist::samples 10
+system.ruby.Locked_RMW_Write.miss_latency_hist::mean 1
+system.ruby.Locked_RMW_Write.miss_latency_hist::gmean 1
+system.ruby.Locked_RMW_Write.miss_latency_hist | 0 0.00% 0.00% | 10 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.Locked_RMW_Write.miss_latency_hist::total 10
+system.ruby.L1Cache.miss_mach_latency_hist::bucket_size 1
+system.ruby.L1Cache.miss_mach_latency_hist::max_bucket 9
+system.ruby.L1Cache.miss_mach_latency_hist::samples 112609
+system.ruby.L1Cache.miss_mach_latency_hist::mean 1
+system.ruby.L1Cache.miss_mach_latency_hist::gmean 1
+system.ruby.L1Cache.miss_mach_latency_hist | 0 0.00% 0.00% | 112609 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.L1Cache.miss_mach_latency_hist::total 112609
+system.ruby.L2Cache.miss_mach_latency_hist::bucket_size 2
+system.ruby.L2Cache.miss_mach_latency_hist::max_bucket 19
+system.ruby.L2Cache.miss_mach_latency_hist::samples 59
+system.ruby.L2Cache.miss_mach_latency_hist::mean 19
+system.ruby.L2Cache.miss_mach_latency_hist::gmean 19.000000
+system.ruby.L2Cache.miss_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 59 100.00% 100.00%
+system.ruby.L2Cache.miss_mach_latency_hist::total 59
+system.ruby.Directory.hit_mach_latency_hist::bucket_size 64
+system.ruby.Directory.hit_mach_latency_hist::max_bucket 639
+system.ruby.Directory.hit_mach_latency_hist::samples 1535
+system.ruby.Directory.hit_mach_latency_hist::mean 206.165472
+system.ruby.Directory.hit_mach_latency_hist::gmean 204.491657
+system.ruby.Directory.hit_mach_latency_hist::stdev 32.551053
+system.ruby.Directory.hit_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 1489 97.00% 97.00% | 10 0.65% 97.65% | 13 0.85% 98.50% | 16 1.04% 99.54% | 7 0.46% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.Directory.hit_mach_latency_hist::total 1535
+system.ruby.LD.L1Cache.miss_type_mach_latency_hist::bucket_size 1
+system.ruby.LD.L1Cache.miss_type_mach_latency_hist::max_bucket 9
+system.ruby.LD.L1Cache.miss_type_mach_latency_hist::samples 16155
+system.ruby.LD.L1Cache.miss_type_mach_latency_hist::mean 1
+system.ruby.LD.L1Cache.miss_type_mach_latency_hist::gmean 1
+system.ruby.LD.L1Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 16155 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.LD.L1Cache.miss_type_mach_latency_hist::total 16155
+system.ruby.LD.L2Cache.miss_type_mach_latency_hist::bucket_size 2
+system.ruby.LD.L2Cache.miss_type_mach_latency_hist::max_bucket 19
+system.ruby.LD.L2Cache.miss_type_mach_latency_hist::samples 5
+system.ruby.LD.L2Cache.miss_type_mach_latency_hist::mean 19
+system.ruby.LD.L2Cache.miss_type_mach_latency_hist::gmean 19.000000
+system.ruby.LD.L2Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 5 100.00% 100.00%
+system.ruby.LD.L2Cache.miss_type_mach_latency_hist::total 5
+system.ruby.LD.Directory.hit_type_mach_latency_hist::bucket_size 64
+system.ruby.LD.Directory.hit_type_mach_latency_hist::max_bucket 639
+system.ruby.LD.Directory.hit_type_mach_latency_hist::samples 175
+system.ruby.LD.Directory.hit_type_mach_latency_hist::mean 210.828571
+system.ruby.LD.Directory.hit_type_mach_latency_hist::gmean 209.031405
+system.ruby.LD.Directory.hit_type_mach_latency_hist::stdev 34.022715
+system.ruby.LD.Directory.hit_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 170 97.14% 97.14% | 1 0.57% 97.71% | 1 0.57% 98.29% | 2 1.14% 99.43% | 1 0.57% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.LD.Directory.hit_type_mach_latency_hist::total 175
+system.ruby.ST.L1Cache.miss_type_mach_latency_hist::bucket_size 1
+system.ruby.ST.L1Cache.miss_type_mach_latency_hist::max_bucket 9
+system.ruby.ST.L1Cache.miss_type_mach_latency_hist::samples 10090
+system.ruby.ST.L1Cache.miss_type_mach_latency_hist::mean 1
+system.ruby.ST.L1Cache.miss_type_mach_latency_hist::gmean 1
+system.ruby.ST.L1Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 10090 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.ST.L1Cache.miss_type_mach_latency_hist::total 10090
+system.ruby.ST.Directory.hit_type_mach_latency_hist::bucket_size 64
+system.ruby.ST.Directory.hit_type_mach_latency_hist::max_bucket 639
+system.ruby.ST.Directory.hit_type_mach_latency_hist::samples 322
+system.ruby.ST.Directory.hit_type_mach_latency_hist::mean 207.431677
+system.ruby.ST.Directory.hit_type_mach_latency_hist::gmean 205.258691
+system.ruby.ST.Directory.hit_type_mach_latency_hist::stdev 37.529677
+system.ruby.ST.Directory.hit_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 309 95.96% 95.96% | 4 1.24% 97.20% | 2 0.62% 97.83% | 3 0.93% 98.76% | 4 1.24% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.ST.Directory.hit_type_mach_latency_hist::total 322
+system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist::bucket_size 1
+system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist::max_bucket 9
+system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist::samples 86007
+system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist::mean 1
+system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist::gmean 1
+system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 86007 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist::total 86007
+system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist::bucket_size 2
+system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist::max_bucket 19
+system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist::samples 54
+system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist::mean 19
+system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist::gmean 19.000000
+system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 54 100.00% 100.00%
+system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist::total 54
+system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::bucket_size 64
+system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::max_bucket 639
+system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::samples 1034
+system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::mean 204.967118
+system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::gmean 203.475698
+system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::stdev 30.573589
+system.ruby.IFETCH.Directory.hit_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 1006 97.29% 97.29% | 5 0.48% 97.78% | 10 0.97% 98.74% | 11 1.06% 99.81% | 2 0.19% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::total 1034
+system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist::bucket_size 1
+system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist::max_bucket 9
+system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist::samples 337
+system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist::mean 1
+system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist::gmean 1
+system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 337 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist::total 337
+system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::bucket_size 32
+system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::max_bucket 319
+system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::samples 4
+system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::mean 210
+system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::gmean 209.766277
+system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::stdev 11.430952
+system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 3 75.00% 75.00% | 1 25.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::total 4
+system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist::bucket_size 1
+system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist::max_bucket 9
+system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist::samples 10
+system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist::mean 1
+system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist::gmean 1
+system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 10 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist::total 10
+system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist::bucket_size 1
+system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist::max_bucket 9
+system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist::samples 10
+system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist::mean 1
+system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist::gmean 1
+system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 10 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist::total 10
+system.ruby.SQC_Controller.Fetch 86 0.00% 0.00%
+system.ruby.SQC_Controller.Data 5 0.00% 0.00%
+system.ruby.SQC_Controller.I.Fetch 5 0.00% 0.00%
+system.ruby.SQC_Controller.I.Data 5 0.00% 0.00%
+system.ruby.SQC_Controller.V.Fetch 81 0.00% 0.00%
+system.ruby.TCC_Controller.RdBlk 9 0.00% 0.00%
+system.ruby.TCC_Controller.WrVicBlk 16 0.00% 0.00%
+system.ruby.TCC_Controller.Atomic 2 0.00% 0.00%
+system.ruby.TCC_Controller.AtomicDone 1 0.00% 0.00%
+system.ruby.TCC_Controller.Data 9 0.00% 0.00%
+system.ruby.TCC_Controller.PrbInv 1 0.00% 0.00%
+system.ruby.TCC_Controller.WBAck 16 0.00% 0.00%
+system.ruby.TCC_Controller.V.PrbInv 1 0.00% 0.00%
+system.ruby.TCC_Controller.I.RdBlk 7 0.00% 0.00%
+system.ruby.TCC_Controller.I.WrVicBlk 16 0.00% 0.00%
+system.ruby.TCC_Controller.I.Atomic 1 0.00% 0.00%
+system.ruby.TCC_Controller.I.WBAck 16 0.00% 0.00%
+system.ruby.TCC_Controller.IV.RdBlk 2 0.00% 0.00%
+system.ruby.TCC_Controller.IV.Data 7 0.00% 0.00%
+system.ruby.TCC_Controller.A.Atomic 1 0.00% 0.00%
+system.ruby.TCC_Controller.A.AtomicDone 1 0.00% 0.00%
+system.ruby.TCC_Controller.A.Data 2 0.00% 0.00%
+system.ruby.TCP_Controller.Load | 5 50.00% 50.00% | 5 50.00% 100.00%
+system.ruby.TCP_Controller.Load::total 10
+system.ruby.TCP_Controller.StoreThrough | 8 50.00% 50.00% | 8 50.00% 100.00%
+system.ruby.TCP_Controller.StoreThrough::total 16
+system.ruby.TCP_Controller.Atomic | 1 50.00% 50.00% | 1 50.00% 100.00%
+system.ruby.TCP_Controller.Atomic::total 2
+system.ruby.TCP_Controller.Flush | 768 50.00% 50.00% | 768 50.00% 100.00%
+system.ruby.TCP_Controller.Flush::total 1536
+system.ruby.TCP_Controller.Evict | 512 50.00% 50.00% | 512 50.00% 100.00%
+system.ruby.TCP_Controller.Evict::total 1024
+system.ruby.TCP_Controller.TCC_Ack | 3 50.00% 50.00% | 3 50.00% 100.00%
+system.ruby.TCP_Controller.TCC_Ack::total 6
+system.ruby.TCP_Controller.TCC_AckWB | 8 50.00% 50.00% | 8 50.00% 100.00%
+system.ruby.TCP_Controller.TCC_AckWB::total 16
+system.ruby.TCP_Controller.I.Load | 2 50.00% 50.00% | 2 50.00% 100.00%
+system.ruby.TCP_Controller.I.Load::total 4
+system.ruby.TCP_Controller.I.StoreThrough | 8 50.00% 50.00% | 8 50.00% 100.00%
+system.ruby.TCP_Controller.I.StoreThrough::total 16
+system.ruby.TCP_Controller.I.Atomic | 1 50.00% 50.00% | 1 50.00% 100.00%
+system.ruby.TCP_Controller.I.Atomic::total 2
+system.ruby.TCP_Controller.I.Flush | 766 50.00% 50.00% | 766 50.00% 100.00%
+system.ruby.TCP_Controller.I.Flush::total 1532
+system.ruby.TCP_Controller.I.Evict | 510 50.00% 50.00% | 510 50.00% 100.00%
+system.ruby.TCP_Controller.I.Evict::total 1020
+system.ruby.TCP_Controller.I.TCC_Ack | 2 50.00% 50.00% | 2 50.00% 100.00%
+system.ruby.TCP_Controller.I.TCC_Ack::total 4
+system.ruby.TCP_Controller.I.TCC_AckWB | 8 50.00% 50.00% | 8 50.00% 100.00%
+system.ruby.TCP_Controller.I.TCC_AckWB::total 16
+system.ruby.TCP_Controller.V.Load | 3 50.00% 50.00% | 3 50.00% 100.00%
+system.ruby.TCP_Controller.V.Load::total 6
+system.ruby.TCP_Controller.V.Flush | 2 50.00% 50.00% | 2 50.00% 100.00%
+system.ruby.TCP_Controller.V.Flush::total 4
+system.ruby.TCP_Controller.V.Evict | 2 50.00% 50.00% | 2 50.00% 100.00%
+system.ruby.TCP_Controller.V.Evict::total 4
+system.ruby.TCP_Controller.A.TCC_Ack | 1 50.00% 50.00% | 1 50.00% 100.00%
+system.ruby.TCP_Controller.A.TCC_Ack::total 2
+
+---------- End Simulation Statistics ----------
diff --git a/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Region/config.ini b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Region/config.ini
new file mode 100644
index 000000000..38646dce2
--- /dev/null
+++ b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Region/config.ini
@@ -0,0 +1,5094 @@
+[root]
+type=Root
+children=system
+eventq_index=0
+full_system=false
+sim_quantum=0
+time_sync_enable=false
+time_sync_period=100000000000
+time_sync_spin_threshold=100000000
+
+[system]
+type=System
+children=clk_domain cp_cntrl0 cpu0 cpu1 cpu2 dir_cntrl0 dispatcher_coalescer dispatcher_tlb dvfs_handler l1_coalescer0 l1_coalescer1 l1_tlb0 l1_tlb1 l2_coalescer l2_tlb l3_coalescer l3_tlb mem_ctrls piobus rb_cntrl0 reg_cntrl0 ruby sqc_cntrl0 sqc_coalescer sqc_tlb sys_port_proxy tcc_cntrl0 tcc_rb_cntrl0 tcp_cntrl0 tcp_cntrl1 voltage_domain
+boot_osflags=a
+cache_line_size=64
+clk_domain=system.clk_domain
+eventq_index=0
+exit_on_work_items=false
+init_param=0
+kernel=
+kernel_addr_check=true
+load_addr_mask=1099511627775
+load_offset=0
+mem_mode=timing
+mem_ranges=0:536870911
+memories=system.mem_ctrls system.ruby.phys_mem
+mmap_using_noreserve=false
+multi_thread=false
+num_work_ids=16
+readfile=
+symbolfile=
+work_begin_ckpt_count=0
+work_begin_cpu_id_exit=-1
+work_begin_exit_count=0
+work_cpus_ckpt_count=0
+work_end_ckpt_count=0
+work_end_exit_count=0
+work_item_id=-1
+system_port=system.sys_port_proxy.slave[0]
+
+[system.clk_domain]
+type=SrcClockDomain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.voltage_domain
+
+[system.cp_cntrl0]
+type=CorePair_Controller
+children=L1D0cache L1D1cache L1Icache L2cache mandatoryQueue probeToCore requestFromCore responseFromCore responseToCore sequencer sequencer1 triggerQueue unblockFromCore
+L1D0cache=system.cp_cntrl0.L1D0cache
+L1D1cache=system.cp_cntrl0.L1D1cache
+L1Icache=system.cp_cntrl0.L1Icache
+L2cache=system.cp_cntrl0.L2cache
+buffer_size=0
+clk_domain=system.clk_domain
+cluster_id=0
+eventq_index=0
+issue_latency=1
+l2_hit_latency=18
+mandatoryQueue=system.cp_cntrl0.mandatoryQueue
+number_of_TBEs=256
+probeToCore=system.cp_cntrl0.probeToCore
+recycle_latency=10
+regionBufferNum=0
+requestFromCore=system.cp_cntrl0.requestFromCore
+responseFromCore=system.cp_cntrl0.responseFromCore
+responseToCore=system.cp_cntrl0.responseToCore
+ruby_system=system.ruby
+send_evictions=true
+sequencer=system.cp_cntrl0.sequencer
+sequencer1=system.cp_cntrl0.sequencer1
+system=system
+transitions_per_cycle=32
+triggerQueue=system.cp_cntrl0.triggerQueue
+unblockFromCore=system.cp_cntrl0.unblockFromCore
+version=0
+
+[system.cp_cntrl0.L1D0cache]
+type=RubyCache
+children=replacement_policy
+assoc=2
+block_size=0
+dataAccessLatency=1
+dataArrayBanks=2
+eventq_index=0
+is_icache=false
+replacement_policy=system.cp_cntrl0.L1D0cache.replacement_policy
+resourceStalls=false
+ruby_system=system.ruby
+size=65536
+start_index_bit=6
+tagAccessLatency=1
+tagArrayBanks=2
+
+[system.cp_cntrl0.L1D0cache.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=2
+block_size=64
+eventq_index=0
+size=65536
+
+[system.cp_cntrl0.L1D1cache]
+type=RubyCache
+children=replacement_policy
+assoc=2
+block_size=0
+dataAccessLatency=1
+dataArrayBanks=2
+eventq_index=0
+is_icache=false
+replacement_policy=system.cp_cntrl0.L1D1cache.replacement_policy
+resourceStalls=false
+ruby_system=system.ruby
+size=65536
+start_index_bit=6
+tagAccessLatency=1
+tagArrayBanks=2
+
+[system.cp_cntrl0.L1D1cache.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=2
+block_size=64
+eventq_index=0
+size=65536
+
+[system.cp_cntrl0.L1Icache]
+type=RubyCache
+children=replacement_policy
+assoc=2
+block_size=0
+dataAccessLatency=1
+dataArrayBanks=2
+eventq_index=0
+is_icache=false
+replacement_policy=system.cp_cntrl0.L1Icache.replacement_policy
+resourceStalls=false
+ruby_system=system.ruby
+size=32768
+start_index_bit=6
+tagAccessLatency=1
+tagArrayBanks=2
+
+[system.cp_cntrl0.L1Icache.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=2
+block_size=64
+eventq_index=0
+size=32768
+
+[system.cp_cntrl0.L2cache]
+type=RubyCache
+children=replacement_policy
+assoc=8
+block_size=0
+dataAccessLatency=1
+dataArrayBanks=16
+eventq_index=0
+is_icache=false
+replacement_policy=system.cp_cntrl0.L2cache.replacement_policy
+resourceStalls=false
+ruby_system=system.ruby
+size=2097152
+start_index_bit=6
+tagAccessLatency=1
+tagArrayBanks=16
+
+[system.cp_cntrl0.L2cache.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=8
+block_size=64
+eventq_index=0
+size=2097152
+
+[system.cp_cntrl0.mandatoryQueue]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+
+[system.cp_cntrl0.probeToCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+slave=system.ruby.network.master[0]
+
+[system.cp_cntrl0.requestFromCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+master=system.ruby.network.slave[0]
+
+[system.cp_cntrl0.responseFromCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+master=system.ruby.network.slave[1]
+
+[system.cp_cntrl0.responseToCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+slave=system.ruby.network.master[1]
+
+[system.cp_cntrl0.sequencer]
+type=RubySequencer
+clk_domain=system.clk_domain
+coreid=0
+dcache=system.cp_cntrl0.L1D0cache
+dcache_hit_latency=1
+deadlock_threshold=500000
+eventq_index=0
+icache=system.cp_cntrl0.L1Icache
+icache_hit_latency=1
+is_cpu_sequencer=true
+max_outstanding_requests=16
+no_retry_on_stall=false
+ruby_system=system.ruby
+support_data_reqs=true
+support_inst_reqs=true
+system=system
+using_network_tester=false
+using_ruby_tester=false
+version=0
+master=system.cpu0.interrupts.pio system.cpu0.interrupts.int_slave
+mem_master_port=system.piobus.slave[0]
+slave=system.cpu0.icache_port system.cpu0.dcache_port system.cpu0.itb.walker.port system.cpu0.dtb.walker.port system.cpu0.interrupts.int_master
+
+[system.cp_cntrl0.sequencer1]
+type=RubySequencer
+clk_domain=system.clk_domain
+coreid=1
+dcache=system.cp_cntrl0.L1D1cache
+dcache_hit_latency=1
+deadlock_threshold=500000
+eventq_index=0
+icache=system.cp_cntrl0.L1Icache
+icache_hit_latency=1
+is_cpu_sequencer=true
+max_outstanding_requests=16
+no_retry_on_stall=false
+ruby_system=system.ruby
+support_data_reqs=true
+support_inst_reqs=true
+system=system
+using_network_tester=false
+using_ruby_tester=false
+version=1
+
+[system.cp_cntrl0.triggerQueue]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.cp_cntrl0.unblockFromCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+master=system.ruby.network.slave[2]
+
+[system.cpu0]
+type=TimingSimpleCPU
+children=apic_clk_domain clk_domain dtb interrupts isa itb tracer workload
+branchPred=Null
+checker=Null
+clk_domain=system.cpu0.clk_domain
+cpu_id=0
+do_checkpoint_insts=true
+do_quiesce=true
+do_statistics_insts=true
+dtb=system.cpu0.dtb
+eventq_index=0
+function_trace=false
+function_trace_start=0
+interrupts=system.cpu0.interrupts
+isa=system.cpu0.isa
+itb=system.cpu0.itb
+max_insts_all_threads=0
+max_insts_any_thread=0
+max_loads_all_threads=0
+max_loads_any_thread=0
+numThreads=1
+profile=0
+progress_interval=0
+simpoint_start_insts=
+socket_id=0
+switched_out=false
+system=system
+tracer=system.cpu0.tracer
+workload=system.cpu0.workload
+dcache_port=system.cp_cntrl0.sequencer.slave[1]
+icache_port=system.cp_cntrl0.sequencer.slave[0]
+
+[system.cpu0.apic_clk_domain]
+type=DerivedClockDomain
+clk_divider=16
+clk_domain=system.cpu0.clk_domain
+eventq_index=0
+
+[system.cpu0.clk_domain]
+type=SrcClockDomain
+clock=500
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.voltage_domain
+
+[system.cpu0.dtb]
+type=X86TLB
+children=walker
+eventq_index=0
+size=64
+walker=system.cpu0.dtb.walker
+
+[system.cpu0.dtb.walker]
+type=X86PagetableWalker
+clk_domain=system.cpu0.clk_domain
+eventq_index=0
+num_squash_per_cycle=4
+system=system
+port=system.cp_cntrl0.sequencer.slave[3]
+
+[system.cpu0.interrupts]
+type=X86LocalApic
+clk_domain=system.cpu0.apic_clk_domain
+eventq_index=0
+int_latency=1000
+pio_addr=2305843009213693952
+pio_latency=100000
+system=system
+int_master=system.cp_cntrl0.sequencer.slave[4]
+int_slave=system.cp_cntrl0.sequencer.master[1]
+pio=system.cp_cntrl0.sequencer.master[0]
+
+[system.cpu0.isa]
+type=X86ISA
+eventq_index=0
+
+[system.cpu0.itb]
+type=X86TLB
+children=walker
+eventq_index=0
+size=64
+walker=system.cpu0.itb.walker
+
+[system.cpu0.itb.walker]
+type=X86PagetableWalker
+clk_domain=system.cpu0.clk_domain
+eventq_index=0
+num_squash_per_cycle=4
+system=system
+port=system.cp_cntrl0.sequencer.slave[2]
+
+[system.cpu0.tracer]
+type=ExeTracer
+eventq_index=0
+
+[system.cpu0.workload]
+type=LiveProcess
+cmd=gpu-hello
+cwd=
+drivers=system.cpu2.cl_driver
+egid=100
+env=
+errout=cerr
+euid=100
+eventq_index=0
+executable=/dist/m5/regression/test-progs/gpu-hello/bin/x86/linux/gpu-hello
+gid=100
+input=cin
+kvmInSE=false
+max_stack_size=67108864
+output=cout
+pid=100
+ppid=99
+simpoint=0
+system=system
+uid=100
+useArchPT=false
+
+[system.cpu1]
+type=Shader
+children=CUs0 CUs1 clk_domain
+CUs=system.cpu1.CUs0 system.cpu1.CUs1
+clk_domain=system.cpu1.clk_domain
+cpu_pointer=system.cpu0
+eventq_index=0
+globalmem=65536
+impl_kern_boundary_sync=true
+n_wf=8
+separate_acquire_release=false
+timing=true
+translation=false
+
+[system.cpu1.CUs0]
+type=ComputeUnit
+children=ldsBus localDataStore vector_register_file0 vector_register_file1 vector_register_file2 vector_register_file3 wavefronts00 wavefronts01 wavefronts02 wavefronts03 wavefronts04 wavefronts05 wavefronts06 wavefronts07 wavefronts08 wavefronts09 wavefronts10 wavefronts11 wavefronts12 wavefronts13 wavefronts14 wavefronts15 wavefronts16 wavefronts17 wavefronts18 wavefronts19 wavefronts20 wavefronts21 wavefronts22 wavefronts23 wavefronts24 wavefronts25 wavefronts26 wavefronts27 wavefronts28 wavefronts29 wavefronts30 wavefronts31
+clk_domain=system.cpu1.clk_domain
+coalescer_to_vrf_bus_width=32
+countPages=false
+cu_id=0
+debugSegFault=false
+dpbypass_pipe_length=4
+eventq_index=0
+execPolicy=OLDEST-FIRST
+functionalTLB=true
+global_mem_queue_size=256
+issue_period=4
+localDataStore=system.cpu1.CUs0.localDataStore
+localMemBarrier=false
+local_mem_queue_size=256
+mem_req_latency=9
+mem_resp_latency=9
+n_wf=8
+num_SIMDs=4
+num_global_mem_pipes=1
+num_shared_mem_pipes=1
+perLaneTLB=false
+prefetch_depth=0
+prefetch_prev_type=PF_PHASE
+prefetch_stride=1
+spbypass_pipe_length=4
+system=system
+vector_register_file=system.cpu1.CUs0.vector_register_file0 system.cpu1.CUs0.vector_register_file1 system.cpu1.CUs0.vector_register_file2 system.cpu1.CUs0.vector_register_file3
+vrf_to_coalescer_bus_width=32
+wavefronts=system.cpu1.CUs0.wavefronts00 system.cpu1.CUs0.wavefronts01 system.cpu1.CUs0.wavefronts02 system.cpu1.CUs0.wavefronts03 system.cpu1.CUs0.wavefronts04 system.cpu1.CUs0.wavefronts05 system.cpu1.CUs0.wavefronts06 system.cpu1.CUs0.wavefronts07 system.cpu1.CUs0.wavefronts08 system.cpu1.CUs0.wavefronts09 system.cpu1.CUs0.wavefronts10 system.cpu1.CUs0.wavefronts11 system.cpu1.CUs0.wavefronts12 system.cpu1.CUs0.wavefronts13 system.cpu1.CUs0.wavefronts14 system.cpu1.CUs0.wavefronts15 system.cpu1.CUs0.wavefronts16 system.cpu1.CUs0.wavefronts17 system.cpu1.CUs0.wavefronts18 system.cpu1.CUs0.wavefronts19 system.cpu1.CUs0.wavefronts20 system.cpu1.CUs0.wavefronts21 system.cpu1.CUs0.wavefronts22 system.cpu1.CUs0.wavefronts23 system.cpu1.CUs0.wavefronts24 system.cpu1.CUs0.wavefronts25 system.cpu1.CUs0.wavefronts26 system.cpu1.CUs0.wavefronts27 system.cpu1.CUs0.wavefronts28 system.cpu1.CUs0.wavefronts29 system.cpu1.CUs0.wavefronts30 system.cpu1.CUs0.wavefronts31
+wfSize=64
+xactCasMode=false
+ldsPort=system.cpu1.CUs0.ldsBus.slave
+memory_port=system.tcp_cntrl0.coalescer.slave[0] system.tcp_cntrl0.coalescer.slave[1] system.tcp_cntrl0.coalescer.slave[2] system.tcp_cntrl0.coalescer.slave[3] system.tcp_cntrl0.coalescer.slave[4] system.tcp_cntrl0.coalescer.slave[5] system.tcp_cntrl0.coalescer.slave[6] system.tcp_cntrl0.coalescer.slave[7] system.tcp_cntrl0.coalescer.slave[8] system.tcp_cntrl0.coalescer.slave[9] system.tcp_cntrl0.coalescer.slave[10] system.tcp_cntrl0.coalescer.slave[11] system.tcp_cntrl0.coalescer.slave[12] system.tcp_cntrl0.coalescer.slave[13] system.tcp_cntrl0.coalescer.slave[14] system.tcp_cntrl0.coalescer.slave[15] system.tcp_cntrl0.coalescer.slave[16] system.tcp_cntrl0.coalescer.slave[17] system.tcp_cntrl0.coalescer.slave[18] system.tcp_cntrl0.coalescer.slave[19] system.tcp_cntrl0.coalescer.slave[20] system.tcp_cntrl0.coalescer.slave[21] system.tcp_cntrl0.coalescer.slave[22] system.tcp_cntrl0.coalescer.slave[23] system.tcp_cntrl0.coalescer.slave[24] system.tcp_cntrl0.coalescer.slave[25] system.tcp_cntrl0.coalescer.slave[26] system.tcp_cntrl0.coalescer.slave[27] system.tcp_cntrl0.coalescer.slave[28] system.tcp_cntrl0.coalescer.slave[29] system.tcp_cntrl0.coalescer.slave[30] system.tcp_cntrl0.coalescer.slave[31] system.tcp_cntrl0.coalescer.slave[32] system.tcp_cntrl0.coalescer.slave[33] system.tcp_cntrl0.coalescer.slave[34] system.tcp_cntrl0.coalescer.slave[35] system.tcp_cntrl0.coalescer.slave[36] system.tcp_cntrl0.coalescer.slave[37] system.tcp_cntrl0.coalescer.slave[38] system.tcp_cntrl0.coalescer.slave[39] system.tcp_cntrl0.coalescer.slave[40] system.tcp_cntrl0.coalescer.slave[41] system.tcp_cntrl0.coalescer.slave[42] system.tcp_cntrl0.coalescer.slave[43] system.tcp_cntrl0.coalescer.slave[44] system.tcp_cntrl0.coalescer.slave[45] system.tcp_cntrl0.coalescer.slave[46] system.tcp_cntrl0.coalescer.slave[47] system.tcp_cntrl0.coalescer.slave[48] system.tcp_cntrl0.coalescer.slave[49] system.tcp_cntrl0.coalescer.slave[50] system.tcp_cntrl0.coalescer.slave[51] 
system.tcp_cntrl0.coalescer.slave[52] system.tcp_cntrl0.coalescer.slave[53] system.tcp_cntrl0.coalescer.slave[54] system.tcp_cntrl0.coalescer.slave[55] system.tcp_cntrl0.coalescer.slave[56] system.tcp_cntrl0.coalescer.slave[57] system.tcp_cntrl0.coalescer.slave[58] system.tcp_cntrl0.coalescer.slave[59] system.tcp_cntrl0.coalescer.slave[60] system.tcp_cntrl0.coalescer.slave[61] system.tcp_cntrl0.coalescer.slave[62] system.tcp_cntrl0.coalescer.slave[63]
+sqc_port=system.sqc_cntrl0.sequencer.slave[0]
+sqc_tlb_port=system.sqc_coalescer.slave[0]
+translation_port=system.l1_coalescer0.slave[0]
+
+[system.cpu1.CUs0.ldsBus]
+type=Bridge
+clk_domain=system.cpu1.clk_domain
+delay=0
+eventq_index=0
+ranges=0:18446744073709551615
+req_size=16
+resp_size=16
+master=system.cpu1.CUs0.localDataStore.cuPort
+slave=system.cpu1.CUs0.ldsPort
+
+[system.cpu1.CUs0.localDataStore]
+type=LdsState
+bankConflictPenalty=1
+banks=32
+clk_domain=system.cpu1.clk_domain
+eventq_index=0
+range=0:65535
+size=65536
+cuPort=system.cpu1.CUs0.ldsBus.master
+
+[system.cpu1.CUs0.vector_register_file0]
+type=VectorRegisterFile
+eventq_index=0
+min_alloc=4
+num_regs_per_simd=2048
+simd_id=0
+
+[system.cpu1.CUs0.vector_register_file1]
+type=VectorRegisterFile
+eventq_index=0
+min_alloc=4
+num_regs_per_simd=2048
+simd_id=1
+
+[system.cpu1.CUs0.vector_register_file2]
+type=VectorRegisterFile
+eventq_index=0
+min_alloc=4
+num_regs_per_simd=2048
+simd_id=2
+
+[system.cpu1.CUs0.vector_register_file3]
+type=VectorRegisterFile
+eventq_index=0
+min_alloc=4
+num_regs_per_simd=2048
+simd_id=3
+
+[system.cpu1.CUs0.wavefronts00]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=0
+
+[system.cpu1.CUs0.wavefronts01]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=1
+
+[system.cpu1.CUs0.wavefronts02]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=2
+
+[system.cpu1.CUs0.wavefronts03]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=3
+
+[system.cpu1.CUs0.wavefronts04]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=4
+
+[system.cpu1.CUs0.wavefronts05]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=5
+
+[system.cpu1.CUs0.wavefronts06]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=6
+
+[system.cpu1.CUs0.wavefronts07]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=7
+
+[system.cpu1.CUs0.wavefronts08]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=0
+
+[system.cpu1.CUs0.wavefronts09]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=1
+
+[system.cpu1.CUs0.wavefronts10]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=2
+
+[system.cpu1.CUs0.wavefronts11]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=3
+
+[system.cpu1.CUs0.wavefronts12]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=4
+
+[system.cpu1.CUs0.wavefronts13]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=5
+
+[system.cpu1.CUs0.wavefronts14]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=6
+
+[system.cpu1.CUs0.wavefronts15]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=7
+
+[system.cpu1.CUs0.wavefronts16]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=0
+
+[system.cpu1.CUs0.wavefronts17]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=1
+
+[system.cpu1.CUs0.wavefronts18]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=2
+
+[system.cpu1.CUs0.wavefronts19]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=3
+
+[system.cpu1.CUs0.wavefronts20]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=4
+
+[system.cpu1.CUs0.wavefronts21]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=5
+
+[system.cpu1.CUs0.wavefronts22]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=6
+
+[system.cpu1.CUs0.wavefronts23]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=7
+
+[system.cpu1.CUs0.wavefronts24]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=0
+
+[system.cpu1.CUs0.wavefronts25]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=1
+
+[system.cpu1.CUs0.wavefronts26]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=2
+
+[system.cpu1.CUs0.wavefronts27]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=3
+
+[system.cpu1.CUs0.wavefronts28]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=4
+
+[system.cpu1.CUs0.wavefronts29]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=5
+
+[system.cpu1.CUs0.wavefronts30]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=6
+
+[system.cpu1.CUs0.wavefronts31]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=7
+
+[system.cpu1.CUs1]
+type=ComputeUnit
+children=ldsBus localDataStore vector_register_file0 vector_register_file1 vector_register_file2 vector_register_file3 wavefronts00 wavefronts01 wavefronts02 wavefronts03 wavefronts04 wavefronts05 wavefronts06 wavefronts07 wavefronts08 wavefronts09 wavefronts10 wavefronts11 wavefronts12 wavefronts13 wavefronts14 wavefronts15 wavefronts16 wavefronts17 wavefronts18 wavefronts19 wavefronts20 wavefronts21 wavefronts22 wavefronts23 wavefronts24 wavefronts25 wavefronts26 wavefronts27 wavefronts28 wavefronts29 wavefronts30 wavefronts31
+clk_domain=system.cpu1.clk_domain
+coalescer_to_vrf_bus_width=32
+countPages=false
+cu_id=1
+debugSegFault=false
+dpbypass_pipe_length=4
+eventq_index=0
+execPolicy=OLDEST-FIRST
+functionalTLB=true
+global_mem_queue_size=256
+issue_period=4
+localDataStore=system.cpu1.CUs1.localDataStore
+localMemBarrier=false
+local_mem_queue_size=256
+mem_req_latency=9
+mem_resp_latency=9
+n_wf=8
+num_SIMDs=4
+num_global_mem_pipes=1
+num_shared_mem_pipes=1
+perLaneTLB=false
+prefetch_depth=0
+prefetch_prev_type=PF_PHASE
+prefetch_stride=1
+spbypass_pipe_length=4
+system=system
+vector_register_file=system.cpu1.CUs1.vector_register_file0 system.cpu1.CUs1.vector_register_file1 system.cpu1.CUs1.vector_register_file2 system.cpu1.CUs1.vector_register_file3
+vrf_to_coalescer_bus_width=32
+wavefronts=system.cpu1.CUs1.wavefronts00 system.cpu1.CUs1.wavefronts01 system.cpu1.CUs1.wavefronts02 system.cpu1.CUs1.wavefronts03 system.cpu1.CUs1.wavefronts04 system.cpu1.CUs1.wavefronts05 system.cpu1.CUs1.wavefronts06 system.cpu1.CUs1.wavefronts07 system.cpu1.CUs1.wavefronts08 system.cpu1.CUs1.wavefronts09 system.cpu1.CUs1.wavefronts10 system.cpu1.CUs1.wavefronts11 system.cpu1.CUs1.wavefronts12 system.cpu1.CUs1.wavefronts13 system.cpu1.CUs1.wavefronts14 system.cpu1.CUs1.wavefronts15 system.cpu1.CUs1.wavefronts16 system.cpu1.CUs1.wavefronts17 system.cpu1.CUs1.wavefronts18 system.cpu1.CUs1.wavefronts19 system.cpu1.CUs1.wavefronts20 system.cpu1.CUs1.wavefronts21 system.cpu1.CUs1.wavefronts22 system.cpu1.CUs1.wavefronts23 system.cpu1.CUs1.wavefronts24 system.cpu1.CUs1.wavefronts25 system.cpu1.CUs1.wavefronts26 system.cpu1.CUs1.wavefronts27 system.cpu1.CUs1.wavefronts28 system.cpu1.CUs1.wavefronts29 system.cpu1.CUs1.wavefronts30 system.cpu1.CUs1.wavefronts31
+wfSize=64
+xactCasMode=false
+ldsPort=system.cpu1.CUs1.ldsBus.slave
+memory_port=system.tcp_cntrl1.coalescer.slave[0] system.tcp_cntrl1.coalescer.slave[1] system.tcp_cntrl1.coalescer.slave[2] system.tcp_cntrl1.coalescer.slave[3] system.tcp_cntrl1.coalescer.slave[4] system.tcp_cntrl1.coalescer.slave[5] system.tcp_cntrl1.coalescer.slave[6] system.tcp_cntrl1.coalescer.slave[7] system.tcp_cntrl1.coalescer.slave[8] system.tcp_cntrl1.coalescer.slave[9] system.tcp_cntrl1.coalescer.slave[10] system.tcp_cntrl1.coalescer.slave[11] system.tcp_cntrl1.coalescer.slave[12] system.tcp_cntrl1.coalescer.slave[13] system.tcp_cntrl1.coalescer.slave[14] system.tcp_cntrl1.coalescer.slave[15] system.tcp_cntrl1.coalescer.slave[16] system.tcp_cntrl1.coalescer.slave[17] system.tcp_cntrl1.coalescer.slave[18] system.tcp_cntrl1.coalescer.slave[19] system.tcp_cntrl1.coalescer.slave[20] system.tcp_cntrl1.coalescer.slave[21] system.tcp_cntrl1.coalescer.slave[22] system.tcp_cntrl1.coalescer.slave[23] system.tcp_cntrl1.coalescer.slave[24] system.tcp_cntrl1.coalescer.slave[25] system.tcp_cntrl1.coalescer.slave[26] system.tcp_cntrl1.coalescer.slave[27] system.tcp_cntrl1.coalescer.slave[28] system.tcp_cntrl1.coalescer.slave[29] system.tcp_cntrl1.coalescer.slave[30] system.tcp_cntrl1.coalescer.slave[31] system.tcp_cntrl1.coalescer.slave[32] system.tcp_cntrl1.coalescer.slave[33] system.tcp_cntrl1.coalescer.slave[34] system.tcp_cntrl1.coalescer.slave[35] system.tcp_cntrl1.coalescer.slave[36] system.tcp_cntrl1.coalescer.slave[37] system.tcp_cntrl1.coalescer.slave[38] system.tcp_cntrl1.coalescer.slave[39] system.tcp_cntrl1.coalescer.slave[40] system.tcp_cntrl1.coalescer.slave[41] system.tcp_cntrl1.coalescer.slave[42] system.tcp_cntrl1.coalescer.slave[43] system.tcp_cntrl1.coalescer.slave[44] system.tcp_cntrl1.coalescer.slave[45] system.tcp_cntrl1.coalescer.slave[46] system.tcp_cntrl1.coalescer.slave[47] system.tcp_cntrl1.coalescer.slave[48] system.tcp_cntrl1.coalescer.slave[49] system.tcp_cntrl1.coalescer.slave[50] system.tcp_cntrl1.coalescer.slave[51] 
system.tcp_cntrl1.coalescer.slave[52] system.tcp_cntrl1.coalescer.slave[53] system.tcp_cntrl1.coalescer.slave[54] system.tcp_cntrl1.coalescer.slave[55] system.tcp_cntrl1.coalescer.slave[56] system.tcp_cntrl1.coalescer.slave[57] system.tcp_cntrl1.coalescer.slave[58] system.tcp_cntrl1.coalescer.slave[59] system.tcp_cntrl1.coalescer.slave[60] system.tcp_cntrl1.coalescer.slave[61] system.tcp_cntrl1.coalescer.slave[62] system.tcp_cntrl1.coalescer.slave[63]
+sqc_port=system.sqc_cntrl0.sequencer.slave[1]
+sqc_tlb_port=system.sqc_coalescer.slave[1]
+translation_port=system.l1_coalescer1.slave[0]
+
+[system.cpu1.CUs1.ldsBus]
+type=Bridge
+clk_domain=system.cpu1.clk_domain
+delay=0
+eventq_index=0
+ranges=0:18446744073709551615
+req_size=16
+resp_size=16
+master=system.cpu1.CUs1.localDataStore.cuPort
+slave=system.cpu1.CUs1.ldsPort
+
+[system.cpu1.CUs1.localDataStore]
+type=LdsState
+bankConflictPenalty=1
+banks=32
+clk_domain=system.cpu1.clk_domain
+eventq_index=0
+range=0:65535
+size=65536
+cuPort=system.cpu1.CUs1.ldsBus.master
+
+[system.cpu1.CUs1.vector_register_file0]
+type=VectorRegisterFile
+eventq_index=0
+min_alloc=4
+num_regs_per_simd=2048
+simd_id=0
+
+[system.cpu1.CUs1.vector_register_file1]
+type=VectorRegisterFile
+eventq_index=0
+min_alloc=4
+num_regs_per_simd=2048
+simd_id=1
+
+[system.cpu1.CUs1.vector_register_file2]
+type=VectorRegisterFile
+eventq_index=0
+min_alloc=4
+num_regs_per_simd=2048
+simd_id=2
+
+[system.cpu1.CUs1.vector_register_file3]
+type=VectorRegisterFile
+eventq_index=0
+min_alloc=4
+num_regs_per_simd=2048
+simd_id=3
+
+[system.cpu1.CUs1.wavefronts00]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=0
+
+[system.cpu1.CUs1.wavefronts01]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=1
+
+[system.cpu1.CUs1.wavefronts02]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=2
+
+[system.cpu1.CUs1.wavefronts03]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=3
+
+[system.cpu1.CUs1.wavefronts04]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=4
+
+[system.cpu1.CUs1.wavefronts05]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=5
+
+[system.cpu1.CUs1.wavefronts06]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=6
+
+[system.cpu1.CUs1.wavefronts07]
+type=Wavefront
+eventq_index=0
+simdId=0
+wf_slot_id=7
+
+[system.cpu1.CUs1.wavefronts08]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=0
+
+[system.cpu1.CUs1.wavefronts09]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=1
+
+[system.cpu1.CUs1.wavefronts10]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=2
+
+[system.cpu1.CUs1.wavefronts11]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=3
+
+[system.cpu1.CUs1.wavefronts12]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=4
+
+[system.cpu1.CUs1.wavefronts13]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=5
+
+[system.cpu1.CUs1.wavefronts14]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=6
+
+[system.cpu1.CUs1.wavefronts15]
+type=Wavefront
+eventq_index=0
+simdId=1
+wf_slot_id=7
+
+[system.cpu1.CUs1.wavefronts16]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=0
+
+[system.cpu1.CUs1.wavefronts17]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=1
+
+[system.cpu1.CUs1.wavefronts18]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=2
+
+[system.cpu1.CUs1.wavefronts19]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=3
+
+[system.cpu1.CUs1.wavefronts20]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=4
+
+[system.cpu1.CUs1.wavefronts21]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=5
+
+[system.cpu1.CUs1.wavefronts22]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=6
+
+[system.cpu1.CUs1.wavefronts23]
+type=Wavefront
+eventq_index=0
+simdId=2
+wf_slot_id=7
+
+[system.cpu1.CUs1.wavefronts24]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=0
+
+[system.cpu1.CUs1.wavefronts25]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=1
+
+[system.cpu1.CUs1.wavefronts26]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=2
+
+[system.cpu1.CUs1.wavefronts27]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=3
+
+[system.cpu1.CUs1.wavefronts28]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=4
+
+[system.cpu1.CUs1.wavefronts29]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=5
+
+[system.cpu1.CUs1.wavefronts30]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=6
+
+[system.cpu1.CUs1.wavefronts31]
+type=Wavefront
+eventq_index=0
+simdId=3
+wf_slot_id=7
+
+[system.cpu1.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.cpu1.clk_domain.voltage_domain
+
+[system.cpu1.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.cpu2]
+type=GpuDispatcher
+children=cl_driver
+cl_driver=system.cpu2.cl_driver
+clk_domain=system.clk_domain
+cpu=system.cpu0
+eventq_index=0
+pio_addr=8589934592
+pio_latency=1000
+shader_pointer=system.cpu1
+system=system
+dma=system.piobus.slave[1]
+pio=system.piobus.master[0]
+translation_port=system.dispatcher_coalescer.slave[0]
+
+[system.cpu2.cl_driver]
+type=ClDriver
+codefile=/dist/m5/regression/test-progs/gpu-hello/bin/x86/linux/gpu-hello-kernel.asm
+eventq_index=0
+filename=hsa
+
+[system.dir_cntrl0]
+type=Directory_Controller
+children=L3CacheMemory L3triggerQueue directory probeToCore reqFromRegBuf reqFromRegDir reqToRegDir requestFromCores responseFromCores responseFromMemory responseToCore triggerQueue unblockFromCores unblockToRegDir
+L3CacheMemory=system.dir_cntrl0.L3CacheMemory
+L3triggerQueue=system.dir_cntrl0.L3triggerQueue
+buffer_size=0
+clk_domain=system.clk_domain
+cluster_id=0
+directory=system.dir_cntrl0.directory
+eventq_index=0
+l3_hit_latency=15
+number_of_TBEs=5120
+probeToCore=system.dir_cntrl0.probeToCore
+recycle_latency=10
+reqFromRegBuf=system.dir_cntrl0.reqFromRegBuf
+reqFromRegDir=system.dir_cntrl0.reqFromRegDir
+reqToRegDir=system.dir_cntrl0.reqToRegDir
+requestFromCores=system.dir_cntrl0.requestFromCores
+responseFromCores=system.dir_cntrl0.responseFromCores
+responseFromMemory=system.dir_cntrl0.responseFromMemory
+responseToCore=system.dir_cntrl0.responseToCore
+response_latency=25
+response_latency_regionDir=1
+ruby_system=system.ruby
+system=system
+to_memory_controller_latency=1
+transitions_per_cycle=32
+triggerQueue=system.dir_cntrl0.triggerQueue
+unblockFromCores=system.dir_cntrl0.unblockFromCores
+unblockToRegDir=system.dir_cntrl0.unblockToRegDir
+useL3OnWT=false
+version=0
+memory=system.mem_ctrls.port
+
+[system.dir_cntrl0.L3CacheMemory]
+type=RubyCache
+children=replacement_policy
+assoc=16
+block_size=0
+dataAccessLatency=20
+dataArrayBanks=16.0
+eventq_index=0
+is_icache=false
+replacement_policy=system.dir_cntrl0.L3CacheMemory.replacement_policy
+resourceStalls=false
+ruby_system=system.ruby
+size=16777216
+start_index_bit=6
+tagAccessLatency=15
+tagArrayBanks=16.0
+
+[system.dir_cntrl0.L3CacheMemory.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=16
+block_size=64
+eventq_index=0
+size=16777216
+
+[system.dir_cntrl0.L3triggerQueue]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.dir_cntrl0.directory]
+type=RubyDirectoryMemory
+eventq_index=0
+numa_high_bit=5
+size=536870912
+version=0
+
+[system.dir_cntrl0.probeToCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+master=system.ruby.network.slave[18]
+
+[system.dir_cntrl0.reqFromRegBuf]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+slave=system.ruby.network.master[24]
+
+[system.dir_cntrl0.reqFromRegDir]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[25]
+
+[system.dir_cntrl0.reqToRegDir]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[20]
+
+[system.dir_cntrl0.requestFromCores]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+slave=system.ruby.network.master[21]
+
+[system.dir_cntrl0.responseFromCores]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+slave=system.ruby.network.master[22]
+
+[system.dir_cntrl0.responseFromMemory]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+
+[system.dir_cntrl0.responseToCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+master=system.ruby.network.slave[19]
+
+[system.dir_cntrl0.triggerQueue]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.dir_cntrl0.unblockFromCores]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+slave=system.ruby.network.master[23]
+
+[system.dir_cntrl0.unblockToRegDir]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+master=system.ruby.network.slave[21]
+
+[system.dispatcher_coalescer]
+type=TLBCoalescer
+children=clk_domain
+clk_domain=system.dispatcher_coalescer.clk_domain
+coalescingWindow=1
+disableCoalescing=false
+eventq_index=0
+probesPerCycle=2
+master=system.dispatcher_tlb.slave[0]
+slave=system.cpu2.translation_port
+
+[system.dispatcher_coalescer.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.dispatcher_coalescer.clk_domain.voltage_domain
+
+[system.dispatcher_coalescer.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.dispatcher_tlb]
+type=X86GPUTLB
+children=clk_domain
+accessDistance=false
+allocationPolicy=true
+assoc=32
+clk_domain=system.dispatcher_tlb.clk_domain
+eventq_index=0
+hitLatency=1
+maxOutstandingReqs=64
+missLatency1=5
+missLatency2=750
+size=32
+master=system.l2_coalescer.slave[1]
+slave=system.dispatcher_coalescer.master[0]
+
+[system.dispatcher_tlb.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.dispatcher_tlb.clk_domain.voltage_domain
+
+[system.dispatcher_tlb.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.dvfs_handler]
+type=DVFSHandler
+domains=
+enable=false
+eventq_index=0
+sys_clk_domain=system.clk_domain
+transition_latency=100000000
+
+[system.l1_coalescer0]
+type=TLBCoalescer
+children=clk_domain
+clk_domain=system.l1_coalescer0.clk_domain
+coalescingWindow=1
+disableCoalescing=false
+eventq_index=0
+probesPerCycle=2
+master=system.l1_tlb0.slave[0]
+slave=system.cpu1.CUs0.translation_port[0]
+
+[system.l1_coalescer0.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.l1_coalescer0.clk_domain.voltage_domain
+
+[system.l1_coalescer0.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.l1_coalescer1]
+type=TLBCoalescer
+children=clk_domain
+clk_domain=system.l1_coalescer1.clk_domain
+coalescingWindow=1
+disableCoalescing=false
+eventq_index=0
+probesPerCycle=2
+master=system.l1_tlb1.slave[0]
+slave=system.cpu1.CUs1.translation_port[0]
+
+[system.l1_coalescer1.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.l1_coalescer1.clk_domain.voltage_domain
+
+[system.l1_coalescer1.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.l1_tlb0]
+type=X86GPUTLB
+children=clk_domain
+accessDistance=false
+allocationPolicy=true
+assoc=32
+clk_domain=system.l1_tlb0.clk_domain
+eventq_index=0
+hitLatency=1
+maxOutstandingReqs=64
+missLatency1=5
+missLatency2=750
+size=32
+master=system.l2_coalescer.slave[2]
+slave=system.l1_coalescer0.master[0]
+
+[system.l1_tlb0.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.l1_tlb0.clk_domain.voltage_domain
+
+[system.l1_tlb0.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.l1_tlb1]
+type=X86GPUTLB
+children=clk_domain
+accessDistance=false
+allocationPolicy=true
+assoc=32
+clk_domain=system.l1_tlb1.clk_domain
+eventq_index=0
+hitLatency=1
+maxOutstandingReqs=64
+missLatency1=5
+missLatency2=750
+size=32
+master=system.l2_coalescer.slave[3]
+slave=system.l1_coalescer1.master[0]
+
+[system.l1_tlb1.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.l1_tlb1.clk_domain.voltage_domain
+
+[system.l1_tlb1.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.l2_coalescer]
+type=TLBCoalescer
+children=clk_domain
+clk_domain=system.l2_coalescer.clk_domain
+coalescingWindow=1
+disableCoalescing=false
+eventq_index=0
+probesPerCycle=2
+master=system.l2_tlb.slave[0]
+slave=system.sqc_tlb.master[0] system.dispatcher_tlb.master[0] system.l1_tlb0.master[0] system.l1_tlb1.master[0]
+
+[system.l2_coalescer.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.l2_coalescer.clk_domain.voltage_domain
+
+[system.l2_coalescer.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.l2_tlb]
+type=X86GPUTLB
+children=clk_domain
+accessDistance=false
+allocationPolicy=true
+assoc=32
+clk_domain=system.l2_tlb.clk_domain
+eventq_index=0
+hitLatency=69
+maxOutstandingReqs=64
+missLatency1=5
+missLatency2=750
+size=4096
+master=system.l3_coalescer.slave[0]
+slave=system.l2_coalescer.master[0]
+
+[system.l2_tlb.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.l2_tlb.clk_domain.voltage_domain
+
+[system.l2_tlb.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.l3_coalescer]
+type=TLBCoalescer
+children=clk_domain
+clk_domain=system.l3_coalescer.clk_domain
+coalescingWindow=1
+disableCoalescing=false
+eventq_index=0
+probesPerCycle=2
+master=system.l3_tlb.slave[0]
+slave=system.l2_tlb.master[0]
+
+[system.l3_coalescer.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.l3_coalescer.clk_domain.voltage_domain
+
+[system.l3_coalescer.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.l3_tlb]
+type=X86GPUTLB
+children=clk_domain
+accessDistance=false
+allocationPolicy=true
+assoc=32
+clk_domain=system.l3_tlb.clk_domain
+eventq_index=0
+hitLatency=150
+maxOutstandingReqs=64
+missLatency1=5
+missLatency2=750
+size=8192
+slave=system.l3_coalescer.master[0]
+
+[system.l3_tlb.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.l3_tlb.clk_domain.voltage_domain
+
+[system.l3_tlb.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.mem_ctrls]
+type=DRAMCtrl
+IDD0=0.075000
+IDD02=0.000000
+IDD2N=0.050000
+IDD2N2=0.000000
+IDD2P0=0.000000
+IDD2P02=0.000000
+IDD2P1=0.000000
+IDD2P12=0.000000
+IDD3N=0.057000
+IDD3N2=0.000000
+IDD3P0=0.000000
+IDD3P02=0.000000
+IDD3P1=0.000000
+IDD3P12=0.000000
+IDD4R=0.187000
+IDD4R2=0.000000
+IDD4W=0.165000
+IDD4W2=0.000000
+IDD5=0.220000
+IDD52=0.000000
+IDD6=0.000000
+IDD62=0.000000
+VDD=1.500000
+VDD2=0.000000
+activation_limit=4
+addr_mapping=RoRaBaCoCh
+bank_groups_per_rank=0
+banks_per_rank=8
+burst_length=8
+channels=1
+clk_domain=system.clk_domain
+conf_table_reported=true
+device_bus_width=8
+device_rowbuffer_size=1024
+device_size=536870912
+devices_per_rank=8
+dll=true
+eventq_index=0
+in_addr_map=true
+max_accesses_per_row=16
+mem_sched_policy=frfcfs
+min_writes_per_switch=16
+null=false
+page_policy=open_adaptive
+range=0:536870911
+ranks_per_channel=2
+read_buffer_size=32
+static_backend_latency=10000
+static_frontend_latency=10000
+tBURST=5000
+tCCD_L=0
+tCK=1250
+tCL=13750
+tCS=2500
+tRAS=35000
+tRCD=13750
+tREFI=7800000
+tRFC=260000
+tRP=13750
+tRRD=6000
+tRRD_L=0
+tRTP=7500
+tRTW=2500
+tWR=15000
+tWTR=7500
+tXAW=30000
+tXP=0
+tXPDLL=0
+tXS=0
+tXSDLL=0
+write_buffer_size=64
+write_high_thresh_perc=85
+write_low_thresh_perc=50
+port=system.dir_cntrl0.memory
+
+[system.piobus]
+type=NoncoherentXBar
+clk_domain=system.clk_domain
+eventq_index=0
+forward_latency=0
+frontend_latency=0
+response_latency=0
+use_default_range=false
+width=32
+master=system.cpu2.pio
+slave=system.cp_cntrl0.sequencer.mem_master_port system.cpu2.dma
+
+[system.rb_cntrl0]
+type=RegionBuffer_Controller
+children=cacheMemory notifyFromRegionDir probeFromRegionDir requestFromCore requestToNetwork responseFromCore responseToRegDir triggerQueue unblockFromDir
+TCC_select_num_bits=0
+blocksPerRegion=16
+buffer_size=0
+cacheMemory=system.rb_cntrl0.cacheMemory
+clk_domain=system.clk_domain
+cluster_id=0
+eventq_index=0
+isOnCPU=true
+nextEvictLatency=1
+noTCCdir=true
+notifyFromRegionDir=system.rb_cntrl0.notifyFromRegionDir
+number_of_TBEs=256
+probeFromRegionDir=system.rb_cntrl0.probeFromRegionDir
+recycle_latency=10
+requestFromCore=system.rb_cntrl0.requestFromCore
+requestToNetwork=system.rb_cntrl0.requestToNetwork
+responseFromCore=system.rb_cntrl0.responseFromCore
+responseToRegDir=system.rb_cntrl0.responseToRegDir
+ruby_system=system.ruby
+system=system
+toDirLatency=60
+toRegionDirLatency=120
+transitions_per_cycle=32
+triggerQueue=system.rb_cntrl0.triggerQueue
+unblockFromDir=system.rb_cntrl0.unblockFromDir
+version=0
+
+[system.rb_cntrl0.cacheMemory]
+type=RubyCache
+children=replacement_policy
+assoc=4
+block_size=1024
+dataAccessLatency=1
+dataArrayBanks=64
+eventq_index=0
+is_icache=false
+replacement_policy=system.rb_cntrl0.cacheMemory.replacement_policy
+resourceStalls=true
+ruby_system=system.ruby
+size=1048576
+start_index_bit=10
+tagAccessLatency=1
+tagArrayBanks=64
+
+[system.rb_cntrl0.cacheMemory.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=4
+block_size=64
+eventq_index=0
+size=1048576
+
+[system.rb_cntrl0.notifyFromRegionDir]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+slave=system.ruby.network.master[4]
+
+[system.rb_cntrl0.probeFromRegionDir]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+slave=system.ruby.network.master[5]
+
+[system.rb_cntrl0.requestFromCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[2]
+
+[system.rb_cntrl0.requestToNetwork]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+master=system.ruby.network.slave[3]
+
+[system.rb_cntrl0.responseFromCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+slave=system.ruby.network.master[3]
+
+[system.rb_cntrl0.responseToRegDir]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+master=system.ruby.network.slave[4]
+
+[system.rb_cntrl0.triggerQueue]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.rb_cntrl0.unblockFromDir]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+slave=system.ruby.network.master[6]
+
+[system.reg_cntrl0]
+type=RegionDir_Controller
+children=cacheMemory notifyToRBuffer probeToRBuffer requestFromRegBuf requestToDir responseFromRBuffer triggerQueue
+TCC_select_num_bits=0
+always_migrate=false
+asym_migrate=false
+blocksPerRegion=16
+buffer_size=0
+cacheMemory=system.reg_cntrl0.cacheMemory
+clk_domain=system.clk_domain
+cluster_id=0
+cpuRegionBufferNum=0
+eventq_index=0
+gpuRegionBufferNum=1
+noTCCdir=true
+notifyToRBuffer=system.reg_cntrl0.notifyToRBuffer
+number_of_TBEs=32
+probeToRBuffer=system.reg_cntrl0.probeToRBuffer
+recycle_latency=10
+requestFromRegBuf=system.reg_cntrl0.requestFromRegBuf
+requestToDir=system.reg_cntrl0.requestToDir
+responseFromRBuffer=system.reg_cntrl0.responseFromRBuffer
+ruby_system=system.ruby
+sym_migrate=false
+system=system
+toDirLatency=1
+transitions_per_cycle=32
+triggerQueue=system.reg_cntrl0.triggerQueue
+version=0
+
+[system.reg_cntrl0.cacheMemory]
+type=RubyCache
+children=replacement_policy
+assoc=8
+block_size=1024
+dataAccessLatency=1
+dataArrayBanks=1
+eventq_index=0
+is_icache=false
+replacement_policy=system.reg_cntrl0.cacheMemory.replacement_policy
+resourceStalls=true
+ruby_system=system.ruby
+size=2097152
+start_index_bit=10
+tagAccessLatency=4
+tagArrayBanks=8
+
+[system.reg_cntrl0.cacheMemory.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=8
+block_size=64
+eventq_index=0
+size=2097152
+
+[system.reg_cntrl0.notifyToRBuffer]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+master=system.ruby.network.slave[23]
+
+[system.reg_cntrl0.probeToRBuffer]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+master=system.ruby.network.slave[24]
+
+[system.reg_cntrl0.requestFromRegBuf]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+slave=system.ruby.network.master[27]
+
+[system.reg_cntrl0.requestToDir]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[22]
+
+[system.reg_cntrl0.responseFromRBuffer]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+slave=system.ruby.network.master[26]
+
+[system.reg_cntrl0.triggerQueue]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby]
+type=RubySystem
+children=clk_domain network phys_mem
+access_backing_store=true
+all_instructions=false
+block_size_bytes=64
+clk_domain=system.ruby.clk_domain
+eventq_index=0
+hot_lines=false
+memory_size_bits=48
+num_of_sequencers=5
+number_of_virtual_networks=10
+phys_mem=system.ruby.phys_mem
+randomization=false
+
+[system.ruby.clk_domain]
+type=SrcClockDomain
+clock=500
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.voltage_domain
+
+[system.ruby.network]
+type=SimpleNetwork
+children=ext_links0 ext_links1 ext_links2 ext_links3 ext_links4 ext_links5 ext_links6 ext_links7 ext_links8 int_link_buffers00 int_link_buffers01 int_link_buffers02 int_link_buffers03 int_link_buffers04 int_link_buffers05 int_link_buffers06 int_link_buffers07 int_link_buffers08 int_link_buffers09 int_link_buffers10 int_link_buffers11 int_link_buffers12 int_link_buffers13 int_link_buffers14 int_link_buffers15 int_link_buffers16 int_link_buffers17 int_link_buffers18 int_link_buffers19 int_link_buffers20 int_link_buffers21 int_link_buffers22 int_link_buffers23 int_link_buffers24 int_link_buffers25 int_link_buffers26 int_link_buffers27 int_link_buffers28 int_link_buffers29 int_link_buffers30 int_link_buffers31 int_link_buffers32 int_link_buffers33 int_link_buffers34 int_link_buffers35 int_link_buffers36 int_link_buffers37 int_link_buffers38 int_link_buffers39 int_links0 int_links1
+adaptive_routing=false
+buffer_size=0
+clk_domain=system.ruby.clk_domain
+control_msg_size=8
+endpoint_bandwidth=1000
+eventq_index=0
+ext_links=system.ruby.network.ext_links0 system.ruby.network.ext_links1 system.ruby.network.ext_links2 system.ruby.network.ext_links3 system.ruby.network.ext_links4 system.ruby.network.ext_links5 system.ruby.network.ext_links6 system.ruby.network.ext_links7 system.ruby.network.ext_links8
+int_link_buffers=system.ruby.network.int_link_buffers00 system.ruby.network.int_link_buffers01 system.ruby.network.int_link_buffers02 system.ruby.network.int_link_buffers03 system.ruby.network.int_link_buffers04 system.ruby.network.int_link_buffers05 system.ruby.network.int_link_buffers06 system.ruby.network.int_link_buffers07 system.ruby.network.int_link_buffers08 system.ruby.network.int_link_buffers09 system.ruby.network.int_link_buffers10 system.ruby.network.int_link_buffers11 system.ruby.network.int_link_buffers12 system.ruby.network.int_link_buffers13 system.ruby.network.int_link_buffers14 system.ruby.network.int_link_buffers15 system.ruby.network.int_link_buffers16 system.ruby.network.int_link_buffers17 system.ruby.network.int_link_buffers18 system.ruby.network.int_link_buffers19 system.ruby.network.int_link_buffers20 system.ruby.network.int_link_buffers21 system.ruby.network.int_link_buffers22 system.ruby.network.int_link_buffers23 system.ruby.network.int_link_buffers24 system.ruby.network.int_link_buffers25 system.ruby.network.int_link_buffers26 system.ruby.network.int_link_buffers27 system.ruby.network.int_link_buffers28 system.ruby.network.int_link_buffers29 system.ruby.network.int_link_buffers30 system.ruby.network.int_link_buffers31 system.ruby.network.int_link_buffers32 system.ruby.network.int_link_buffers33 system.ruby.network.int_link_buffers34 system.ruby.network.int_link_buffers35 system.ruby.network.int_link_buffers36 system.ruby.network.int_link_buffers37 system.ruby.network.int_link_buffers38 system.ruby.network.int_link_buffers39
+int_links=system.ruby.network.int_links0 system.ruby.network.int_links1
+netifs=
+number_of_virtual_networks=10
+routers=system.ruby.network.ext_links0.int_node system.ruby.network.ext_links2.int_node system.ruby.network.ext_links4.int_node
+ruby_system=system.ruby
+topology=Crossbar
+master=system.cp_cntrl0.probeToCore.slave system.cp_cntrl0.responseToCore.slave system.rb_cntrl0.requestFromCore.slave system.rb_cntrl0.responseFromCore.slave system.rb_cntrl0.notifyFromRegionDir.slave system.rb_cntrl0.probeFromRegionDir.slave system.rb_cntrl0.unblockFromDir.slave system.tcp_cntrl0.probeToTCP.slave system.tcp_cntrl0.responseToTCP.slave system.tcp_cntrl1.probeToTCP.slave system.tcp_cntrl1.responseToTCP.slave system.sqc_cntrl0.probeToSQC.slave system.sqc_cntrl0.responseToSQC.slave system.tcc_cntrl0.requestFromTCP.slave system.tcc_cntrl0.probeFromNB.slave system.tcc_cntrl0.responseFromNB.slave system.tcc_rb_cntrl0.requestFromCore.slave system.tcc_rb_cntrl0.responseFromCore.slave system.tcc_rb_cntrl0.notifyFromRegionDir.slave system.tcc_rb_cntrl0.probeFromRegionDir.slave system.tcc_rb_cntrl0.unblockFromDir.slave system.dir_cntrl0.requestFromCores.slave system.dir_cntrl0.responseFromCores.slave system.dir_cntrl0.unblockFromCores.slave system.dir_cntrl0.reqFromRegBuf.slave system.dir_cntrl0.reqFromRegDir.slave system.reg_cntrl0.responseFromRBuffer.slave system.reg_cntrl0.requestFromRegBuf.slave
+slave=system.cp_cntrl0.requestFromCore.master system.cp_cntrl0.responseFromCore.master system.cp_cntrl0.unblockFromCore.master system.rb_cntrl0.requestToNetwork.master system.rb_cntrl0.responseToRegDir.master system.tcp_cntrl0.requestFromTCP.master system.tcp_cntrl0.responseFromTCP.master system.tcp_cntrl0.unblockFromCore.master system.tcp_cntrl1.requestFromTCP.master system.tcp_cntrl1.responseFromTCP.master system.tcp_cntrl1.unblockFromCore.master system.sqc_cntrl0.requestFromSQC.master system.tcc_cntrl0.responseToCore.master system.tcc_cntrl0.requestToNB.master system.tcc_cntrl0.responseToNB.master system.tcc_cntrl0.unblockToNB.master system.tcc_rb_cntrl0.requestToNetwork.master system.tcc_rb_cntrl0.responseToRegDir.master system.dir_cntrl0.probeToCore.master system.dir_cntrl0.responseToCore.master system.dir_cntrl0.reqToRegDir.master system.dir_cntrl0.unblockToRegDir.master system.reg_cntrl0.requestToDir.master system.reg_cntrl0.notifyToRBuffer.master system.reg_cntrl0.probeToRBuffer.master
+
+[system.ruby.network.ext_links0]
+type=SimpleExtLink
+children=int_node
+bandwidth_factor=32
+eventq_index=0
+ext_node=system.dir_cntrl0
+int_node=system.ruby.network.ext_links0.int_node
+latency=1
+link_id=0
+weight=1
+
+[system.ruby.network.ext_links0.int_node]
+type=Switch
+children=port_buffers000 port_buffers001 port_buffers002 port_buffers003 port_buffers004 port_buffers005 port_buffers006 port_buffers007 port_buffers008 port_buffers009 port_buffers010 port_buffers011 port_buffers012 port_buffers013 port_buffers014 port_buffers015 port_buffers016 port_buffers017 port_buffers018 port_buffers019 port_buffers020 port_buffers021 port_buffers022 port_buffers023 port_buffers024 port_buffers025 port_buffers026 port_buffers027 port_buffers028 port_buffers029 port_buffers030 port_buffers031 port_buffers032 port_buffers033 port_buffers034 port_buffers035 port_buffers036 port_buffers037 port_buffers038 port_buffers039 port_buffers040 port_buffers041 port_buffers042 port_buffers043 port_buffers044 port_buffers045 port_buffers046 port_buffers047 port_buffers048 port_buffers049 port_buffers050 port_buffers051 port_buffers052 port_buffers053 port_buffers054 port_buffers055 port_buffers056 port_buffers057 port_buffers058 port_buffers059 port_buffers060 port_buffers061 port_buffers062 port_buffers063 port_buffers064 port_buffers065 port_buffers066 port_buffers067 port_buffers068 port_buffers069 port_buffers070 port_buffers071 port_buffers072 port_buffers073 port_buffers074 port_buffers075 port_buffers076 port_buffers077 port_buffers078 port_buffers079 port_buffers080 port_buffers081 port_buffers082 port_buffers083 port_buffers084 port_buffers085 port_buffers086 port_buffers087 port_buffers088 port_buffers089 port_buffers090 port_buffers091 port_buffers092 port_buffers093 port_buffers094 port_buffers095 port_buffers096 port_buffers097 port_buffers098 port_buffers099 port_buffers100 port_buffers101 port_buffers102 port_buffers103 port_buffers104 port_buffers105 port_buffers106 port_buffers107 port_buffers108 port_buffers109
+clk_domain=system.ruby.clk_domain
+eventq_index=0
+port_buffers=system.ruby.network.ext_links0.int_node.port_buffers000 system.ruby.network.ext_links0.int_node.port_buffers001 system.ruby.network.ext_links0.int_node.port_buffers002 system.ruby.network.ext_links0.int_node.port_buffers003 system.ruby.network.ext_links0.int_node.port_buffers004 system.ruby.network.ext_links0.int_node.port_buffers005 system.ruby.network.ext_links0.int_node.port_buffers006 system.ruby.network.ext_links0.int_node.port_buffers007 system.ruby.network.ext_links0.int_node.port_buffers008 system.ruby.network.ext_links0.int_node.port_buffers009 system.ruby.network.ext_links0.int_node.port_buffers010 system.ruby.network.ext_links0.int_node.port_buffers011 system.ruby.network.ext_links0.int_node.port_buffers012 system.ruby.network.ext_links0.int_node.port_buffers013 system.ruby.network.ext_links0.int_node.port_buffers014 system.ruby.network.ext_links0.int_node.port_buffers015 system.ruby.network.ext_links0.int_node.port_buffers016 system.ruby.network.ext_links0.int_node.port_buffers017 system.ruby.network.ext_links0.int_node.port_buffers018 system.ruby.network.ext_links0.int_node.port_buffers019 system.ruby.network.ext_links0.int_node.port_buffers020 system.ruby.network.ext_links0.int_node.port_buffers021 system.ruby.network.ext_links0.int_node.port_buffers022 system.ruby.network.ext_links0.int_node.port_buffers023 system.ruby.network.ext_links0.int_node.port_buffers024 system.ruby.network.ext_links0.int_node.port_buffers025 system.ruby.network.ext_links0.int_node.port_buffers026 system.ruby.network.ext_links0.int_node.port_buffers027 system.ruby.network.ext_links0.int_node.port_buffers028 system.ruby.network.ext_links0.int_node.port_buffers029 system.ruby.network.ext_links0.int_node.port_buffers030 system.ruby.network.ext_links0.int_node.port_buffers031 system.ruby.network.ext_links0.int_node.port_buffers032 system.ruby.network.ext_links0.int_node.port_buffers033 system.ruby.network.ext_links0.int_node.port_buffers034 
system.ruby.network.ext_links0.int_node.port_buffers035 system.ruby.network.ext_links0.int_node.port_buffers036 system.ruby.network.ext_links0.int_node.port_buffers037 system.ruby.network.ext_links0.int_node.port_buffers038 system.ruby.network.ext_links0.int_node.port_buffers039 system.ruby.network.ext_links0.int_node.port_buffers040 system.ruby.network.ext_links0.int_node.port_buffers041 system.ruby.network.ext_links0.int_node.port_buffers042 system.ruby.network.ext_links0.int_node.port_buffers043 system.ruby.network.ext_links0.int_node.port_buffers044 system.ruby.network.ext_links0.int_node.port_buffers045 system.ruby.network.ext_links0.int_node.port_buffers046 system.ruby.network.ext_links0.int_node.port_buffers047 system.ruby.network.ext_links0.int_node.port_buffers048 system.ruby.network.ext_links0.int_node.port_buffers049 system.ruby.network.ext_links0.int_node.port_buffers050 system.ruby.network.ext_links0.int_node.port_buffers051 system.ruby.network.ext_links0.int_node.port_buffers052 system.ruby.network.ext_links0.int_node.port_buffers053 system.ruby.network.ext_links0.int_node.port_buffers054 system.ruby.network.ext_links0.int_node.port_buffers055 system.ruby.network.ext_links0.int_node.port_buffers056 system.ruby.network.ext_links0.int_node.port_buffers057 system.ruby.network.ext_links0.int_node.port_buffers058 system.ruby.network.ext_links0.int_node.port_buffers059 system.ruby.network.ext_links0.int_node.port_buffers060 system.ruby.network.ext_links0.int_node.port_buffers061 system.ruby.network.ext_links0.int_node.port_buffers062 system.ruby.network.ext_links0.int_node.port_buffers063 system.ruby.network.ext_links0.int_node.port_buffers064 system.ruby.network.ext_links0.int_node.port_buffers065 system.ruby.network.ext_links0.int_node.port_buffers066 system.ruby.network.ext_links0.int_node.port_buffers067 system.ruby.network.ext_links0.int_node.port_buffers068 system.ruby.network.ext_links0.int_node.port_buffers069 
system.ruby.network.ext_links0.int_node.port_buffers070 system.ruby.network.ext_links0.int_node.port_buffers071 system.ruby.network.ext_links0.int_node.port_buffers072 system.ruby.network.ext_links0.int_node.port_buffers073 system.ruby.network.ext_links0.int_node.port_buffers074 system.ruby.network.ext_links0.int_node.port_buffers075 system.ruby.network.ext_links0.int_node.port_buffers076 system.ruby.network.ext_links0.int_node.port_buffers077 system.ruby.network.ext_links0.int_node.port_buffers078 system.ruby.network.ext_links0.int_node.port_buffers079 system.ruby.network.ext_links0.int_node.port_buffers080 system.ruby.network.ext_links0.int_node.port_buffers081 system.ruby.network.ext_links0.int_node.port_buffers082 system.ruby.network.ext_links0.int_node.port_buffers083 system.ruby.network.ext_links0.int_node.port_buffers084 system.ruby.network.ext_links0.int_node.port_buffers085 system.ruby.network.ext_links0.int_node.port_buffers086 system.ruby.network.ext_links0.int_node.port_buffers087 system.ruby.network.ext_links0.int_node.port_buffers088 system.ruby.network.ext_links0.int_node.port_buffers089 system.ruby.network.ext_links0.int_node.port_buffers090 system.ruby.network.ext_links0.int_node.port_buffers091 system.ruby.network.ext_links0.int_node.port_buffers092 system.ruby.network.ext_links0.int_node.port_buffers093 system.ruby.network.ext_links0.int_node.port_buffers094 system.ruby.network.ext_links0.int_node.port_buffers095 system.ruby.network.ext_links0.int_node.port_buffers096 system.ruby.network.ext_links0.int_node.port_buffers097 system.ruby.network.ext_links0.int_node.port_buffers098 system.ruby.network.ext_links0.int_node.port_buffers099 system.ruby.network.ext_links0.int_node.port_buffers100 system.ruby.network.ext_links0.int_node.port_buffers101 system.ruby.network.ext_links0.int_node.port_buffers102 system.ruby.network.ext_links0.int_node.port_buffers103 system.ruby.network.ext_links0.int_node.port_buffers104 
system.ruby.network.ext_links0.int_node.port_buffers105 system.ruby.network.ext_links0.int_node.port_buffers106 system.ruby.network.ext_links0.int_node.port_buffers107 system.ruby.network.ext_links0.int_node.port_buffers108 system.ruby.network.ext_links0.int_node.port_buffers109
+router_id=0
+virt_nets=10
+
+[system.ruby.network.ext_links0.int_node.port_buffers000]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers001]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers002]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers003]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers004]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers005]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers006]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers007]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers008]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers009]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers010]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers011]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers012]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers013]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers014]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers015]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers016]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers017]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers018]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers019]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers020]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers021]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers022]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers023]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers024]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers025]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers026]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers027]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers028]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers029]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers030]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers031]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers032]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers033]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers034]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers035]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers036]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers037]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers038]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers039]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers040]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers041]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers042]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers043]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers044]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers045]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers046]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers047]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers048]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers049]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers050]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers051]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers052]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers053]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers054]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers055]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers056]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers057]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers058]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers059]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers060]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers061]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers062]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers063]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers064]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers065]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers066]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers067]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers068]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers069]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers070]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers071]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers072]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers073]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers074]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers075]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers076]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers077]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers078]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers079]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers080]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers081]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers082]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers083]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers084]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers085]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers086]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers087]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers088]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers089]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers090]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers091]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers092]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers093]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers094]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers095]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers096]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers097]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers098]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers099]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers100]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers101]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers102]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers103]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers104]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers105]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers106]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers107]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers108]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links0.int_node.port_buffers109]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links1]
+type=SimpleExtLink
+bandwidth_factor=32
+eventq_index=0
+ext_node=system.reg_cntrl0
+int_node=system.ruby.network.ext_links0.int_node
+latency=1
+link_id=1
+weight=1
+
+[system.ruby.network.ext_links2]
+type=SimpleExtLink
+children=int_node
+bandwidth_factor=32
+eventq_index=0
+ext_node=system.cp_cntrl0
+int_node=system.ruby.network.ext_links2.int_node
+latency=1
+link_id=2
+weight=1
+
+[system.ruby.network.ext_links2.int_node]
+type=Switch
+children=port_buffers00 port_buffers01 port_buffers02 port_buffers03 port_buffers04 port_buffers05 port_buffers06 port_buffers07 port_buffers08 port_buffers09 port_buffers10 port_buffers11 port_buffers12 port_buffers13 port_buffers14 port_buffers15 port_buffers16 port_buffers17 port_buffers18 port_buffers19 port_buffers20 port_buffers21 port_buffers22 port_buffers23 port_buffers24 port_buffers25 port_buffers26 port_buffers27 port_buffers28 port_buffers29 port_buffers30 port_buffers31 port_buffers32 port_buffers33 port_buffers34 port_buffers35 port_buffers36 port_buffers37 port_buffers38 port_buffers39 port_buffers40 port_buffers41 port_buffers42 port_buffers43 port_buffers44 port_buffers45 port_buffers46 port_buffers47 port_buffers48 port_buffers49 port_buffers50 port_buffers51 port_buffers52 port_buffers53 port_buffers54 port_buffers55 port_buffers56 port_buffers57 port_buffers58 port_buffers59 port_buffers60 port_buffers61 port_buffers62 port_buffers63 port_buffers64 port_buffers65 port_buffers66 port_buffers67 port_buffers68 port_buffers69 port_buffers70 port_buffers71 port_buffers72 port_buffers73 port_buffers74 port_buffers75 port_buffers76 port_buffers77 port_buffers78 port_buffers79 port_buffers80 port_buffers81 port_buffers82 port_buffers83 port_buffers84 port_buffers85 port_buffers86 port_buffers87 port_buffers88 port_buffers89 port_buffers90 port_buffers91 port_buffers92 port_buffers93 port_buffers94 port_buffers95 port_buffers96 port_buffers97 port_buffers98 port_buffers99
+clk_domain=system.ruby.clk_domain
+eventq_index=0
+port_buffers=system.ruby.network.ext_links2.int_node.port_buffers00 system.ruby.network.ext_links2.int_node.port_buffers01 system.ruby.network.ext_links2.int_node.port_buffers02 system.ruby.network.ext_links2.int_node.port_buffers03 system.ruby.network.ext_links2.int_node.port_buffers04 system.ruby.network.ext_links2.int_node.port_buffers05 system.ruby.network.ext_links2.int_node.port_buffers06 system.ruby.network.ext_links2.int_node.port_buffers07 system.ruby.network.ext_links2.int_node.port_buffers08 system.ruby.network.ext_links2.int_node.port_buffers09 system.ruby.network.ext_links2.int_node.port_buffers10 system.ruby.network.ext_links2.int_node.port_buffers11 system.ruby.network.ext_links2.int_node.port_buffers12 system.ruby.network.ext_links2.int_node.port_buffers13 system.ruby.network.ext_links2.int_node.port_buffers14 system.ruby.network.ext_links2.int_node.port_buffers15 system.ruby.network.ext_links2.int_node.port_buffers16 system.ruby.network.ext_links2.int_node.port_buffers17 system.ruby.network.ext_links2.int_node.port_buffers18 system.ruby.network.ext_links2.int_node.port_buffers19 system.ruby.network.ext_links2.int_node.port_buffers20 system.ruby.network.ext_links2.int_node.port_buffers21 system.ruby.network.ext_links2.int_node.port_buffers22 system.ruby.network.ext_links2.int_node.port_buffers23 system.ruby.network.ext_links2.int_node.port_buffers24 system.ruby.network.ext_links2.int_node.port_buffers25 system.ruby.network.ext_links2.int_node.port_buffers26 system.ruby.network.ext_links2.int_node.port_buffers27 system.ruby.network.ext_links2.int_node.port_buffers28 system.ruby.network.ext_links2.int_node.port_buffers29 system.ruby.network.ext_links2.int_node.port_buffers30 system.ruby.network.ext_links2.int_node.port_buffers31 system.ruby.network.ext_links2.int_node.port_buffers32 system.ruby.network.ext_links2.int_node.port_buffers33 system.ruby.network.ext_links2.int_node.port_buffers34 system.ruby.network.ext_links2.int_node.port_buffers35 
system.ruby.network.ext_links2.int_node.port_buffers36 system.ruby.network.ext_links2.int_node.port_buffers37 system.ruby.network.ext_links2.int_node.port_buffers38 system.ruby.network.ext_links2.int_node.port_buffers39 system.ruby.network.ext_links2.int_node.port_buffers40 system.ruby.network.ext_links2.int_node.port_buffers41 system.ruby.network.ext_links2.int_node.port_buffers42 system.ruby.network.ext_links2.int_node.port_buffers43 system.ruby.network.ext_links2.int_node.port_buffers44 system.ruby.network.ext_links2.int_node.port_buffers45 system.ruby.network.ext_links2.int_node.port_buffers46 system.ruby.network.ext_links2.int_node.port_buffers47 system.ruby.network.ext_links2.int_node.port_buffers48 system.ruby.network.ext_links2.int_node.port_buffers49 system.ruby.network.ext_links2.int_node.port_buffers50 system.ruby.network.ext_links2.int_node.port_buffers51 system.ruby.network.ext_links2.int_node.port_buffers52 system.ruby.network.ext_links2.int_node.port_buffers53 system.ruby.network.ext_links2.int_node.port_buffers54 system.ruby.network.ext_links2.int_node.port_buffers55 system.ruby.network.ext_links2.int_node.port_buffers56 system.ruby.network.ext_links2.int_node.port_buffers57 system.ruby.network.ext_links2.int_node.port_buffers58 system.ruby.network.ext_links2.int_node.port_buffers59 system.ruby.network.ext_links2.int_node.port_buffers60 system.ruby.network.ext_links2.int_node.port_buffers61 system.ruby.network.ext_links2.int_node.port_buffers62 system.ruby.network.ext_links2.int_node.port_buffers63 system.ruby.network.ext_links2.int_node.port_buffers64 system.ruby.network.ext_links2.int_node.port_buffers65 system.ruby.network.ext_links2.int_node.port_buffers66 system.ruby.network.ext_links2.int_node.port_buffers67 system.ruby.network.ext_links2.int_node.port_buffers68 system.ruby.network.ext_links2.int_node.port_buffers69 system.ruby.network.ext_links2.int_node.port_buffers70 system.ruby.network.ext_links2.int_node.port_buffers71 
system.ruby.network.ext_links2.int_node.port_buffers72 system.ruby.network.ext_links2.int_node.port_buffers73 system.ruby.network.ext_links2.int_node.port_buffers74 system.ruby.network.ext_links2.int_node.port_buffers75 system.ruby.network.ext_links2.int_node.port_buffers76 system.ruby.network.ext_links2.int_node.port_buffers77 system.ruby.network.ext_links2.int_node.port_buffers78 system.ruby.network.ext_links2.int_node.port_buffers79 system.ruby.network.ext_links2.int_node.port_buffers80 system.ruby.network.ext_links2.int_node.port_buffers81 system.ruby.network.ext_links2.int_node.port_buffers82 system.ruby.network.ext_links2.int_node.port_buffers83 system.ruby.network.ext_links2.int_node.port_buffers84 system.ruby.network.ext_links2.int_node.port_buffers85 system.ruby.network.ext_links2.int_node.port_buffers86 system.ruby.network.ext_links2.int_node.port_buffers87 system.ruby.network.ext_links2.int_node.port_buffers88 system.ruby.network.ext_links2.int_node.port_buffers89 system.ruby.network.ext_links2.int_node.port_buffers90 system.ruby.network.ext_links2.int_node.port_buffers91 system.ruby.network.ext_links2.int_node.port_buffers92 system.ruby.network.ext_links2.int_node.port_buffers93 system.ruby.network.ext_links2.int_node.port_buffers94 system.ruby.network.ext_links2.int_node.port_buffers95 system.ruby.network.ext_links2.int_node.port_buffers96 system.ruby.network.ext_links2.int_node.port_buffers97 system.ruby.network.ext_links2.int_node.port_buffers98 system.ruby.network.ext_links2.int_node.port_buffers99
+router_id=1
+virt_nets=10
+
+[system.ruby.network.ext_links2.int_node.port_buffers00]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers01]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers02]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers03]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers04]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers05]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers06]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers07]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers08]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers09]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers10]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers11]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers12]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers13]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers14]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers15]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers16]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers17]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers18]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers19]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers20]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers21]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers22]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers23]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers24]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers25]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers26]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers27]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers28]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers29]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers30]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers31]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers32]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers33]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers34]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers35]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers36]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers37]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers38]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers39]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers40]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers41]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers42]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers43]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers44]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers45]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers46]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers47]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers48]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers49]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers50]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers51]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers52]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers53]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers54]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers55]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers56]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers57]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers58]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers59]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers60]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers61]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers62]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers63]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers64]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers65]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers66]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers67]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers68]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers69]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers70]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers71]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers72]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers73]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers74]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers75]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers76]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers77]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers78]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers79]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers80]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers81]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers82]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers83]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers84]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers85]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers86]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers87]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers88]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers89]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers90]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers91]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers92]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers93]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers94]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers95]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers96]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers97]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers98]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links2.int_node.port_buffers99]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links3]
+type=SimpleExtLink
+bandwidth_factor=32
+eventq_index=0
+ext_node=system.rb_cntrl0
+int_node=system.ruby.network.ext_links2.int_node
+latency=1
+link_id=3
+weight=1
+
+[system.ruby.network.ext_links4]
+type=SimpleExtLink
+children=int_node
+bandwidth_factor=32
+eventq_index=0
+ext_node=system.tcp_cntrl0
+int_node=system.ruby.network.ext_links4.int_node
+latency=1
+link_id=4
+weight=1
+
+[system.ruby.network.ext_links4.int_node]
+type=Switch
+children=port_buffers00 port_buffers01 port_buffers02 port_buffers03 port_buffers04 port_buffers05 port_buffers06 port_buffers07 port_buffers08 port_buffers09 port_buffers10 port_buffers11 port_buffers12 port_buffers13 port_buffers14 port_buffers15 port_buffers16 port_buffers17 port_buffers18 port_buffers19 port_buffers20 port_buffers21 port_buffers22 port_buffers23 port_buffers24 port_buffers25 port_buffers26 port_buffers27 port_buffers28 port_buffers29 port_buffers30 port_buffers31 port_buffers32 port_buffers33 port_buffers34 port_buffers35 port_buffers36 port_buffers37 port_buffers38 port_buffers39 port_buffers40 port_buffers41 port_buffers42 port_buffers43 port_buffers44 port_buffers45 port_buffers46 port_buffers47 port_buffers48 port_buffers49 port_buffers50 port_buffers51 port_buffers52 port_buffers53 port_buffers54 port_buffers55 port_buffers56 port_buffers57 port_buffers58 port_buffers59 port_buffers60 port_buffers61 port_buffers62 port_buffers63 port_buffers64 port_buffers65 port_buffers66 port_buffers67 port_buffers68 port_buffers69 port_buffers70 port_buffers71 port_buffers72 port_buffers73 port_buffers74 port_buffers75 port_buffers76 port_buffers77 port_buffers78 port_buffers79 port_buffers80 port_buffers81 port_buffers82 port_buffers83 port_buffers84 port_buffers85 port_buffers86 port_buffers87 port_buffers88 port_buffers89 port_buffers90 port_buffers91 port_buffers92 port_buffers93 port_buffers94 port_buffers95 port_buffers96 port_buffers97 port_buffers98 port_buffers99
+clk_domain=system.ruby.clk_domain
+eventq_index=0
+port_buffers=system.ruby.network.ext_links4.int_node.port_buffers00 system.ruby.network.ext_links4.int_node.port_buffers01 system.ruby.network.ext_links4.int_node.port_buffers02 system.ruby.network.ext_links4.int_node.port_buffers03 system.ruby.network.ext_links4.int_node.port_buffers04 system.ruby.network.ext_links4.int_node.port_buffers05 system.ruby.network.ext_links4.int_node.port_buffers06 system.ruby.network.ext_links4.int_node.port_buffers07 system.ruby.network.ext_links4.int_node.port_buffers08 system.ruby.network.ext_links4.int_node.port_buffers09 system.ruby.network.ext_links4.int_node.port_buffers10 system.ruby.network.ext_links4.int_node.port_buffers11 system.ruby.network.ext_links4.int_node.port_buffers12 system.ruby.network.ext_links4.int_node.port_buffers13 system.ruby.network.ext_links4.int_node.port_buffers14 system.ruby.network.ext_links4.int_node.port_buffers15 system.ruby.network.ext_links4.int_node.port_buffers16 system.ruby.network.ext_links4.int_node.port_buffers17 system.ruby.network.ext_links4.int_node.port_buffers18 system.ruby.network.ext_links4.int_node.port_buffers19 system.ruby.network.ext_links4.int_node.port_buffers20 system.ruby.network.ext_links4.int_node.port_buffers21 system.ruby.network.ext_links4.int_node.port_buffers22 system.ruby.network.ext_links4.int_node.port_buffers23 system.ruby.network.ext_links4.int_node.port_buffers24 system.ruby.network.ext_links4.int_node.port_buffers25 system.ruby.network.ext_links4.int_node.port_buffers26 system.ruby.network.ext_links4.int_node.port_buffers27 system.ruby.network.ext_links4.int_node.port_buffers28 system.ruby.network.ext_links4.int_node.port_buffers29 system.ruby.network.ext_links4.int_node.port_buffers30 system.ruby.network.ext_links4.int_node.port_buffers31 system.ruby.network.ext_links4.int_node.port_buffers32 system.ruby.network.ext_links4.int_node.port_buffers33 system.ruby.network.ext_links4.int_node.port_buffers34 system.ruby.network.ext_links4.int_node.port_buffers35 
system.ruby.network.ext_links4.int_node.port_buffers36 system.ruby.network.ext_links4.int_node.port_buffers37 system.ruby.network.ext_links4.int_node.port_buffers38 system.ruby.network.ext_links4.int_node.port_buffers39 system.ruby.network.ext_links4.int_node.port_buffers40 system.ruby.network.ext_links4.int_node.port_buffers41 system.ruby.network.ext_links4.int_node.port_buffers42 system.ruby.network.ext_links4.int_node.port_buffers43 system.ruby.network.ext_links4.int_node.port_buffers44 system.ruby.network.ext_links4.int_node.port_buffers45 system.ruby.network.ext_links4.int_node.port_buffers46 system.ruby.network.ext_links4.int_node.port_buffers47 system.ruby.network.ext_links4.int_node.port_buffers48 system.ruby.network.ext_links4.int_node.port_buffers49 system.ruby.network.ext_links4.int_node.port_buffers50 system.ruby.network.ext_links4.int_node.port_buffers51 system.ruby.network.ext_links4.int_node.port_buffers52 system.ruby.network.ext_links4.int_node.port_buffers53 system.ruby.network.ext_links4.int_node.port_buffers54 system.ruby.network.ext_links4.int_node.port_buffers55 system.ruby.network.ext_links4.int_node.port_buffers56 system.ruby.network.ext_links4.int_node.port_buffers57 system.ruby.network.ext_links4.int_node.port_buffers58 system.ruby.network.ext_links4.int_node.port_buffers59 system.ruby.network.ext_links4.int_node.port_buffers60 system.ruby.network.ext_links4.int_node.port_buffers61 system.ruby.network.ext_links4.int_node.port_buffers62 system.ruby.network.ext_links4.int_node.port_buffers63 system.ruby.network.ext_links4.int_node.port_buffers64 system.ruby.network.ext_links4.int_node.port_buffers65 system.ruby.network.ext_links4.int_node.port_buffers66 system.ruby.network.ext_links4.int_node.port_buffers67 system.ruby.network.ext_links4.int_node.port_buffers68 system.ruby.network.ext_links4.int_node.port_buffers69 system.ruby.network.ext_links4.int_node.port_buffers70 system.ruby.network.ext_links4.int_node.port_buffers71 
system.ruby.network.ext_links4.int_node.port_buffers72 system.ruby.network.ext_links4.int_node.port_buffers73 system.ruby.network.ext_links4.int_node.port_buffers74 system.ruby.network.ext_links4.int_node.port_buffers75 system.ruby.network.ext_links4.int_node.port_buffers76 system.ruby.network.ext_links4.int_node.port_buffers77 system.ruby.network.ext_links4.int_node.port_buffers78 system.ruby.network.ext_links4.int_node.port_buffers79 system.ruby.network.ext_links4.int_node.port_buffers80 system.ruby.network.ext_links4.int_node.port_buffers81 system.ruby.network.ext_links4.int_node.port_buffers82 system.ruby.network.ext_links4.int_node.port_buffers83 system.ruby.network.ext_links4.int_node.port_buffers84 system.ruby.network.ext_links4.int_node.port_buffers85 system.ruby.network.ext_links4.int_node.port_buffers86 system.ruby.network.ext_links4.int_node.port_buffers87 system.ruby.network.ext_links4.int_node.port_buffers88 system.ruby.network.ext_links4.int_node.port_buffers89 system.ruby.network.ext_links4.int_node.port_buffers90 system.ruby.network.ext_links4.int_node.port_buffers91 system.ruby.network.ext_links4.int_node.port_buffers92 system.ruby.network.ext_links4.int_node.port_buffers93 system.ruby.network.ext_links4.int_node.port_buffers94 system.ruby.network.ext_links4.int_node.port_buffers95 system.ruby.network.ext_links4.int_node.port_buffers96 system.ruby.network.ext_links4.int_node.port_buffers97 system.ruby.network.ext_links4.int_node.port_buffers98 system.ruby.network.ext_links4.int_node.port_buffers99
+router_id=2
+virt_nets=10
+
+[system.ruby.network.ext_links4.int_node.port_buffers00]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers01]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers02]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers03]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers04]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers05]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers06]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers07]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers08]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers09]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers10]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers11]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers12]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers13]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers14]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers15]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers16]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers17]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers18]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers19]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers20]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers21]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers22]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers23]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers24]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers25]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers26]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers27]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers28]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers29]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers30]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers31]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers32]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers33]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers34]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers35]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers36]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers37]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers38]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers39]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers40]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers41]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers42]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers43]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers44]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers45]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers46]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers47]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers48]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers49]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers50]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers51]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers52]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers53]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers54]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers55]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers56]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers57]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers58]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers59]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers60]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers61]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers62]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers63]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers64]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers65]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers66]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers67]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers68]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers69]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers70]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers71]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers72]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers73]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers74]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers75]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers76]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers77]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers78]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers79]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers80]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers81]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers82]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers83]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers84]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers85]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers86]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers87]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers88]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers89]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers90]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers91]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers92]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers93]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers94]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers95]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers96]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers97]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers98]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links4.int_node.port_buffers99]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links5]
+type=SimpleExtLink
+bandwidth_factor=32
+eventq_index=0
+ext_node=system.tcp_cntrl1
+int_node=system.ruby.network.ext_links4.int_node
+latency=1
+link_id=5
+weight=1
+
+[system.ruby.network.ext_links6]
+type=SimpleExtLink
+bandwidth_factor=32
+eventq_index=0
+ext_node=system.sqc_cntrl0
+int_node=system.ruby.network.ext_links4.int_node
+latency=1
+link_id=6
+weight=1
+
+[system.ruby.network.ext_links7]
+type=SimpleExtLink
+bandwidth_factor=32
+eventq_index=0
+ext_node=system.tcc_cntrl0
+int_node=system.ruby.network.ext_links4.int_node
+latency=1
+link_id=7
+weight=1
+
+[system.ruby.network.ext_links8]
+type=SimpleExtLink
+bandwidth_factor=32
+eventq_index=0
+ext_node=system.tcc_rb_cntrl0
+int_node=system.ruby.network.ext_links4.int_node
+latency=1
+link_id=8
+weight=1
+
+[system.ruby.network.int_link_buffers00]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers01]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers02]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers03]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers04]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers05]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers06]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers07]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers08]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers09]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers10]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers11]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers12]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers13]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers14]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers15]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers16]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers17]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers18]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers19]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers20]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers21]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers22]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers23]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers24]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers25]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers26]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers27]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers28]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers29]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers30]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers31]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers32]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers33]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers34]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers35]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers36]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers37]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers38]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers39]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_links0]
+type=SimpleIntLink
+bandwidth_factor=32
+eventq_index=0
+latency=1
+link_id=0
+node_a=system.ruby.network.ext_links0.int_node
+node_b=system.ruby.network.ext_links2.int_node
+weight=1
+
+[system.ruby.network.int_links1]
+type=SimpleIntLink
+bandwidth_factor=32
+eventq_index=0
+latency=1
+link_id=1
+node_a=system.ruby.network.ext_links0.int_node
+node_b=system.ruby.network.ext_links4.int_node
+weight=1
+
+[system.ruby.phys_mem]
+type=SimpleMemory
+bandwidth=73.000000
+clk_domain=system.ruby.clk_domain
+conf_table_reported=true
+eventq_index=0
+in_addr_map=false
+latency=30000
+latency_var=0
+null=false
+range=0:536870911
+
+[system.sqc_cntrl0]
+type=SQC_Controller
+children=L1cache mandatoryQueue probeToSQC requestFromSQC responseToSQC sequencer
+L1cache=system.sqc_cntrl0.L1cache
+TCC_select_num_bits=0
+buffer_size=0
+clk_domain=system.clk_domain
+cluster_id=0
+eventq_index=0
+issue_latency=80
+l2_hit_latency=18
+mandatoryQueue=system.sqc_cntrl0.mandatoryQueue
+number_of_TBEs=256
+probeToSQC=system.sqc_cntrl0.probeToSQC
+recycle_latency=10
+requestFromSQC=system.sqc_cntrl0.requestFromSQC
+responseToSQC=system.sqc_cntrl0.responseToSQC
+ruby_system=system.ruby
+sequencer=system.sqc_cntrl0.sequencer
+system=system
+transitions_per_cycle=32
+version=0
+
+[system.sqc_cntrl0.L1cache]
+type=RubyCache
+children=replacement_policy
+assoc=8
+block_size=0
+dataAccessLatency=1
+dataArrayBanks=8
+eventq_index=0
+is_icache=false
+replacement_policy=system.sqc_cntrl0.L1cache.replacement_policy
+resourceStalls=false
+ruby_system=system.ruby
+size=32768
+start_index_bit=6
+tagAccessLatency=1
+tagArrayBanks=8
+
+[system.sqc_cntrl0.L1cache.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=8
+block_size=64
+eventq_index=0
+size=32768
+
+[system.sqc_cntrl0.mandatoryQueue]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+
+[system.sqc_cntrl0.probeToSQC]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[11]
+
+[system.sqc_cntrl0.requestFromSQC]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[11]
+
+[system.sqc_cntrl0.responseToSQC]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[12]
+
+[system.sqc_cntrl0.sequencer]
+type=RubySequencer
+clk_domain=system.clk_domain
+coreid=99
+dcache=system.sqc_cntrl0.L1cache
+dcache_hit_latency=1
+deadlock_threshold=500000
+eventq_index=0
+icache=system.sqc_cntrl0.L1cache
+icache_hit_latency=1
+is_cpu_sequencer=false
+max_outstanding_requests=16
+no_retry_on_stall=false
+ruby_system=system.ruby
+support_data_reqs=false
+support_inst_reqs=true
+system=system
+using_network_tester=false
+using_ruby_tester=false
+version=6
+slave=system.cpu1.CUs0.sqc_port system.cpu1.CUs1.sqc_port
+
+[system.sqc_coalescer]
+type=TLBCoalescer
+children=clk_domain
+clk_domain=system.sqc_coalescer.clk_domain
+coalescingWindow=1
+disableCoalescing=false
+eventq_index=0
+probesPerCycle=2
+master=system.sqc_tlb.slave[0]
+slave=system.cpu1.CUs0.sqc_tlb_port system.cpu1.CUs1.sqc_tlb_port
+
+[system.sqc_coalescer.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.sqc_coalescer.clk_domain.voltage_domain
+
+[system.sqc_coalescer.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.sqc_tlb]
+type=X86GPUTLB
+children=clk_domain
+accessDistance=false
+allocationPolicy=true
+assoc=32
+clk_domain=system.sqc_tlb.clk_domain
+eventq_index=0
+hitLatency=1
+maxOutstandingReqs=64
+missLatency1=5
+missLatency2=750
+size=32
+master=system.l2_coalescer.slave[0]
+slave=system.sqc_coalescer.master[0]
+
+[system.sqc_tlb.clk_domain]
+type=SrcClockDomain
+children=voltage_domain
+clock=1000
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.sqc_tlb.clk_domain.voltage_domain
+
+[system.sqc_tlb.clk_domain.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
+[system.sys_port_proxy]
+type=RubyPortProxy
+clk_domain=system.clk_domain
+eventq_index=0
+is_cpu_sequencer=true
+no_retry_on_stall=false
+ruby_system=system.ruby
+support_data_reqs=true
+support_inst_reqs=true
+system=system
+using_ruby_tester=false
+version=0
+slave=system.system_port
+
+[system.tcc_cntrl0]
+type=TCC_Controller
+children=L2cache probeFromNB requestFromTCP requestToNB responseFromNB responseToCore responseToNB triggerQueue unblockToNB
+L2cache=system.tcc_cntrl0.L2cache
+WB=false
+buffer_size=0
+clk_domain=system.clk_domain
+cluster_id=0
+eventq_index=0
+l2_request_latency=1
+l2_response_latency=16
+number_of_TBEs=5120
+probeFromNB=system.tcc_cntrl0.probeFromNB
+recycle_latency=10
+regionBufferNum=1
+requestFromTCP=system.tcc_cntrl0.requestFromTCP
+requestToNB=system.tcc_cntrl0.requestToNB
+responseFromNB=system.tcc_cntrl0.responseFromNB
+responseToCore=system.tcc_cntrl0.responseToCore
+responseToNB=system.tcc_cntrl0.responseToNB
+ruby_system=system.ruby
+system=system
+transitions_per_cycle=32
+triggerQueue=system.tcc_cntrl0.triggerQueue
+unblockToNB=system.tcc_cntrl0.unblockToNB
+version=0
+
+[system.tcc_cntrl0.L2cache]
+type=RubyCache
+children=replacement_policy
+assoc=16
+block_size=0
+dataAccessLatency=8
+dataArrayBanks=256
+eventq_index=0
+is_icache=false
+replacement_policy=system.tcc_cntrl0.L2cache.replacement_policy
+resourceStalls=false
+ruby_system=system.ruby
+size=2097152
+start_index_bit=6
+tagAccessLatency=2
+tagArrayBanks=256
+
+[system.tcc_cntrl0.L2cache.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=16
+block_size=64
+eventq_index=0
+size=2097152
+
+[system.tcc_cntrl0.probeFromNB]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+slave=system.ruby.network.master[14]
+
+[system.tcc_cntrl0.requestFromTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[13]
+
+[system.tcc_cntrl0.requestToNB]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[13]
+
+[system.tcc_cntrl0.responseFromNB]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+slave=system.ruby.network.master[15]
+
+[system.tcc_cntrl0.responseToCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[12]
+
+[system.tcc_cntrl0.responseToNB]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+master=system.ruby.network.slave[14]
+
+[system.tcc_cntrl0.triggerQueue]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.tcc_cntrl0.unblockToNB]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+master=system.ruby.network.slave[15]
+
+[system.tcc_rb_cntrl0]
+type=RegionBuffer_Controller
+children=cacheMemory notifyFromRegionDir probeFromRegionDir requestFromCore requestToNetwork responseFromCore responseToRegDir triggerQueue unblockFromDir
+TCC_select_num_bits=0
+blocksPerRegion=16
+buffer_size=0
+cacheMemory=system.tcc_rb_cntrl0.cacheMemory
+clk_domain=system.clk_domain
+cluster_id=0
+eventq_index=0
+isOnCPU=false
+nextEvictLatency=1
+noTCCdir=true
+notifyFromRegionDir=system.tcc_rb_cntrl0.notifyFromRegionDir
+number_of_TBEs=5120
+probeFromRegionDir=system.tcc_rb_cntrl0.probeFromRegionDir
+recycle_latency=10
+requestFromCore=system.tcc_rb_cntrl0.requestFromCore
+requestToNetwork=system.tcc_rb_cntrl0.requestToNetwork
+responseFromCore=system.tcc_rb_cntrl0.responseFromCore
+responseToRegDir=system.tcc_rb_cntrl0.responseToRegDir
+ruby_system=system.ruby
+system=system
+toDirLatency=60
+toRegionDirLatency=120
+transitions_per_cycle=32
+triggerQueue=system.tcc_rb_cntrl0.triggerQueue
+unblockFromDir=system.tcc_rb_cntrl0.unblockFromDir
+version=1
+
+[system.tcc_rb_cntrl0.cacheMemory]
+type=RubyCache
+children=replacement_policy
+assoc=4
+block_size=1024
+dataAccessLatency=1
+dataArrayBanks=64
+eventq_index=0
+is_icache=false
+replacement_policy=system.tcc_rb_cntrl0.cacheMemory.replacement_policy
+resourceStalls=true
+ruby_system=system.ruby
+size=1048576
+start_index_bit=10
+tagAccessLatency=1
+tagArrayBanks=64
+
+[system.tcc_rb_cntrl0.cacheMemory.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=4
+block_size=64
+eventq_index=0
+size=1048576
+
+[system.tcc_rb_cntrl0.notifyFromRegionDir]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+slave=system.ruby.network.master[18]
+
+[system.tcc_rb_cntrl0.probeFromRegionDir]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+slave=system.ruby.network.master[19]
+
+[system.tcc_rb_cntrl0.requestFromCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[16]
+
+[system.tcc_rb_cntrl0.requestToNetwork]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+master=system.ruby.network.slave[16]
+
+[system.tcc_rb_cntrl0.responseFromCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+slave=system.ruby.network.master[17]
+
+[system.tcc_rb_cntrl0.responseToRegDir]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+master=system.ruby.network.slave[17]
+
+[system.tcc_rb_cntrl0.triggerQueue]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.tcc_rb_cntrl0.unblockFromDir]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+slave=system.ruby.network.master[20]
+
+[system.tcp_cntrl0]
+type=TCP_Controller
+children=L1cache coalescer mandatoryQueue probeToTCP requestFromTCP responseFromTCP responseToTCP sequencer unblockFromCore
+L1cache=system.tcp_cntrl0.L1cache
+TCC_select_num_bits=0
+WB=false
+buffer_size=0
+clk_domain=system.clk_domain
+cluster_id=0
+coalescer=system.tcp_cntrl0.coalescer
+disableL1=false
+eventq_index=0
+issue_latency=1
+l2_hit_latency=18
+mandatoryQueue=system.tcp_cntrl0.mandatoryQueue
+number_of_TBEs=2560
+probeToTCP=system.tcp_cntrl0.probeToTCP
+recycle_latency=10
+requestFromTCP=system.tcp_cntrl0.requestFromTCP
+responseFromTCP=system.tcp_cntrl0.responseFromTCP
+responseToTCP=system.tcp_cntrl0.responseToTCP
+ruby_system=system.ruby
+sequencer=system.tcp_cntrl0.sequencer
+system=system
+transitions_per_cycle=32
+unblockFromCore=system.tcp_cntrl0.unblockFromCore
+use_seq_not_coal=false
+version=0
+
+[system.tcp_cntrl0.L1cache]
+type=RubyCache
+children=replacement_policy
+assoc=16
+block_size=0
+dataAccessLatency=4
+dataArrayBanks=16
+eventq_index=0
+is_icache=false
+replacement_policy=system.tcp_cntrl0.L1cache.replacement_policy
+resourceStalls=true
+ruby_system=system.ruby
+size=16384
+start_index_bit=6
+tagAccessLatency=1
+tagArrayBanks=16
+
+[system.tcp_cntrl0.L1cache.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=16
+block_size=64
+eventq_index=0
+size=16384
+
+[system.tcp_cntrl0.coalescer]
+type=VIPERCoalescer
+assume_rfo=false
+clk_domain=system.clk_domain
+coreid=99
+dcache=system.tcp_cntrl0.L1cache
+dcache_hit_latency=1
+deadlock_threshold=500000
+eventq_index=0
+icache=system.tcp_cntrl0.L1cache
+icache_hit_latency=1
+is_cpu_sequencer=false
+max_inv_per_cycle=32
+max_outstanding_requests=2560
+max_wb_per_cycle=32
+no_retry_on_stall=false
+ruby_system=system.ruby
+support_data_reqs=true
+support_inst_reqs=false
+system=system
+using_network_tester=false
+using_ruby_tester=false
+version=2
+slave=system.cpu1.CUs0.memory_port[0] system.cpu1.CUs0.memory_port[1] system.cpu1.CUs0.memory_port[2] system.cpu1.CUs0.memory_port[3] system.cpu1.CUs0.memory_port[4] system.cpu1.CUs0.memory_port[5] system.cpu1.CUs0.memory_port[6] system.cpu1.CUs0.memory_port[7] system.cpu1.CUs0.memory_port[8] system.cpu1.CUs0.memory_port[9] system.cpu1.CUs0.memory_port[10] system.cpu1.CUs0.memory_port[11] system.cpu1.CUs0.memory_port[12] system.cpu1.CUs0.memory_port[13] system.cpu1.CUs0.memory_port[14] system.cpu1.CUs0.memory_port[15] system.cpu1.CUs0.memory_port[16] system.cpu1.CUs0.memory_port[17] system.cpu1.CUs0.memory_port[18] system.cpu1.CUs0.memory_port[19] system.cpu1.CUs0.memory_port[20] system.cpu1.CUs0.memory_port[21] system.cpu1.CUs0.memory_port[22] system.cpu1.CUs0.memory_port[23] system.cpu1.CUs0.memory_port[24] system.cpu1.CUs0.memory_port[25] system.cpu1.CUs0.memory_port[26] system.cpu1.CUs0.memory_port[27] system.cpu1.CUs0.memory_port[28] system.cpu1.CUs0.memory_port[29] system.cpu1.CUs0.memory_port[30] system.cpu1.CUs0.memory_port[31] system.cpu1.CUs0.memory_port[32] system.cpu1.CUs0.memory_port[33] system.cpu1.CUs0.memory_port[34] system.cpu1.CUs0.memory_port[35] system.cpu1.CUs0.memory_port[36] system.cpu1.CUs0.memory_port[37] system.cpu1.CUs0.memory_port[38] system.cpu1.CUs0.memory_port[39] system.cpu1.CUs0.memory_port[40] system.cpu1.CUs0.memory_port[41] system.cpu1.CUs0.memory_port[42] system.cpu1.CUs0.memory_port[43] system.cpu1.CUs0.memory_port[44] system.cpu1.CUs0.memory_port[45] system.cpu1.CUs0.memory_port[46] system.cpu1.CUs0.memory_port[47] system.cpu1.CUs0.memory_port[48] system.cpu1.CUs0.memory_port[49] system.cpu1.CUs0.memory_port[50] system.cpu1.CUs0.memory_port[51] system.cpu1.CUs0.memory_port[52] system.cpu1.CUs0.memory_port[53] system.cpu1.CUs0.memory_port[54] system.cpu1.CUs0.memory_port[55] system.cpu1.CUs0.memory_port[56] system.cpu1.CUs0.memory_port[57] system.cpu1.CUs0.memory_port[58] system.cpu1.CUs0.memory_port[59] 
system.cpu1.CUs0.memory_port[60] system.cpu1.CUs0.memory_port[61] system.cpu1.CUs0.memory_port[62] system.cpu1.CUs0.memory_port[63]
+
+[system.tcp_cntrl0.mandatoryQueue]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+
+[system.tcp_cntrl0.probeToTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[7]
+
+[system.tcp_cntrl0.requestFromTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[5]
+
+[system.tcp_cntrl0.responseFromTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[6]
+
+[system.tcp_cntrl0.responseToTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[8]
+
+[system.tcp_cntrl0.sequencer]
+type=RubySequencer
+clk_domain=system.clk_domain
+coreid=99
+dcache=system.tcp_cntrl0.L1cache
+dcache_hit_latency=1
+deadlock_threshold=500000
+eventq_index=0
+icache=system.tcp_cntrl0.L1cache
+icache_hit_latency=1
+is_cpu_sequencer=true
+max_outstanding_requests=16
+no_retry_on_stall=false
+ruby_system=system.ruby
+support_data_reqs=true
+support_inst_reqs=true
+system=system
+using_network_tester=false
+using_ruby_tester=false
+version=3
+
+[system.tcp_cntrl0.unblockFromCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+master=system.ruby.network.slave[7]
+
+[system.tcp_cntrl1]
+type=TCP_Controller
+children=L1cache coalescer mandatoryQueue probeToTCP requestFromTCP responseFromTCP responseToTCP sequencer unblockFromCore
+L1cache=system.tcp_cntrl1.L1cache
+TCC_select_num_bits=0
+WB=false
+buffer_size=0
+clk_domain=system.clk_domain
+cluster_id=0
+coalescer=system.tcp_cntrl1.coalescer
+disableL1=false
+eventq_index=0
+issue_latency=1
+l2_hit_latency=18
+mandatoryQueue=system.tcp_cntrl1.mandatoryQueue
+number_of_TBEs=2560
+probeToTCP=system.tcp_cntrl1.probeToTCP
+recycle_latency=10
+requestFromTCP=system.tcp_cntrl1.requestFromTCP
+responseFromTCP=system.tcp_cntrl1.responseFromTCP
+responseToTCP=system.tcp_cntrl1.responseToTCP
+ruby_system=system.ruby
+sequencer=system.tcp_cntrl1.sequencer
+system=system
+transitions_per_cycle=32
+unblockFromCore=system.tcp_cntrl1.unblockFromCore
+use_seq_not_coal=false
+version=1
+
+[system.tcp_cntrl1.L1cache]
+type=RubyCache
+children=replacement_policy
+assoc=16
+block_size=0
+dataAccessLatency=4
+dataArrayBanks=16
+eventq_index=0
+is_icache=false
+replacement_policy=system.tcp_cntrl1.L1cache.replacement_policy
+resourceStalls=true
+ruby_system=system.ruby
+size=16384
+start_index_bit=6
+tagAccessLatency=1
+tagArrayBanks=16
+
+[system.tcp_cntrl1.L1cache.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=16
+block_size=64
+eventq_index=0
+size=16384
+
+[system.tcp_cntrl1.coalescer]
+type=VIPERCoalescer
+assume_rfo=false
+clk_domain=system.clk_domain
+coreid=99
+dcache=system.tcp_cntrl1.L1cache
+dcache_hit_latency=1
+deadlock_threshold=500000
+eventq_index=0
+icache=system.tcp_cntrl1.L1cache
+icache_hit_latency=1
+is_cpu_sequencer=false
+max_inv_per_cycle=32
+max_outstanding_requests=2560
+max_wb_per_cycle=32
+no_retry_on_stall=false
+ruby_system=system.ruby
+support_data_reqs=true
+support_inst_reqs=false
+system=system
+using_network_tester=false
+using_ruby_tester=false
+version=4
+slave=system.cpu1.CUs1.memory_port[0] system.cpu1.CUs1.memory_port[1] system.cpu1.CUs1.memory_port[2] system.cpu1.CUs1.memory_port[3] system.cpu1.CUs1.memory_port[4] system.cpu1.CUs1.memory_port[5] system.cpu1.CUs1.memory_port[6] system.cpu1.CUs1.memory_port[7] system.cpu1.CUs1.memory_port[8] system.cpu1.CUs1.memory_port[9] system.cpu1.CUs1.memory_port[10] system.cpu1.CUs1.memory_port[11] system.cpu1.CUs1.memory_port[12] system.cpu1.CUs1.memory_port[13] system.cpu1.CUs1.memory_port[14] system.cpu1.CUs1.memory_port[15] system.cpu1.CUs1.memory_port[16] system.cpu1.CUs1.memory_port[17] system.cpu1.CUs1.memory_port[18] system.cpu1.CUs1.memory_port[19] system.cpu1.CUs1.memory_port[20] system.cpu1.CUs1.memory_port[21] system.cpu1.CUs1.memory_port[22] system.cpu1.CUs1.memory_port[23] system.cpu1.CUs1.memory_port[24] system.cpu1.CUs1.memory_port[25] system.cpu1.CUs1.memory_port[26] system.cpu1.CUs1.memory_port[27] system.cpu1.CUs1.memory_port[28] system.cpu1.CUs1.memory_port[29] system.cpu1.CUs1.memory_port[30] system.cpu1.CUs1.memory_port[31] system.cpu1.CUs1.memory_port[32] system.cpu1.CUs1.memory_port[33] system.cpu1.CUs1.memory_port[34] system.cpu1.CUs1.memory_port[35] system.cpu1.CUs1.memory_port[36] system.cpu1.CUs1.memory_port[37] system.cpu1.CUs1.memory_port[38] system.cpu1.CUs1.memory_port[39] system.cpu1.CUs1.memory_port[40] system.cpu1.CUs1.memory_port[41] system.cpu1.CUs1.memory_port[42] system.cpu1.CUs1.memory_port[43] system.cpu1.CUs1.memory_port[44] system.cpu1.CUs1.memory_port[45] system.cpu1.CUs1.memory_port[46] system.cpu1.CUs1.memory_port[47] system.cpu1.CUs1.memory_port[48] system.cpu1.CUs1.memory_port[49] system.cpu1.CUs1.memory_port[50] system.cpu1.CUs1.memory_port[51] system.cpu1.CUs1.memory_port[52] system.cpu1.CUs1.memory_port[53] system.cpu1.CUs1.memory_port[54] system.cpu1.CUs1.memory_port[55] system.cpu1.CUs1.memory_port[56] system.cpu1.CUs1.memory_port[57] system.cpu1.CUs1.memory_port[58] system.cpu1.CUs1.memory_port[59] 
system.cpu1.CUs1.memory_port[60] system.cpu1.CUs1.memory_port[61] system.cpu1.CUs1.memory_port[62] system.cpu1.CUs1.memory_port[63]
+
+[system.tcp_cntrl1.mandatoryQueue]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+
+[system.tcp_cntrl1.probeToTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[9]
+
+[system.tcp_cntrl1.requestFromTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[8]
+
+[system.tcp_cntrl1.responseFromTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[9]
+
+[system.tcp_cntrl1.responseToTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[10]
+
+[system.tcp_cntrl1.sequencer]
+type=RubySequencer
+clk_domain=system.clk_domain
+coreid=99
+dcache=system.tcp_cntrl1.L1cache
+dcache_hit_latency=1
+deadlock_threshold=500000
+eventq_index=0
+icache=system.tcp_cntrl1.L1cache
+icache_hit_latency=1
+is_cpu_sequencer=true
+max_outstanding_requests=16
+no_retry_on_stall=false
+ruby_system=system.ruby
+support_data_reqs=true
+support_inst_reqs=true
+system=system
+using_network_tester=false
+using_ruby_tester=false
+version=5
+
+[system.tcp_cntrl1.unblockFromCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+master=system.ruby.network.slave[10]
+
+[system.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
diff --git a/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Region/simerr b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Region/simerr
new file mode 100755
index 000000000..1e2b8911e
--- /dev/null
+++ b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Region/simerr
@@ -0,0 +1,5 @@
+warn: system.ruby.network adopting orphan SimObject param 'int_links'
+warn: system.ruby.network adopting orphan SimObject param 'ext_links'
+warn: DRAM device capacity (8192 Mbytes) does not match the address range assigned (512 Mbytes)
+warn: Sockets disabled, not accepting gdb connections
+warn: Replacement policy updates recently became the responsibility of SLICC state machines. Make sure to setMRU() near callbacks in .sm files!
diff --git a/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Region/simout b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Region/simout
new file mode 100755
index 000000000..8e5806b46
--- /dev/null
+++ b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Region/simout
@@ -0,0 +1,21 @@
+gem5 Simulator System. http://gem5.org
+gem5 is copyrighted software; use the --copyright option for details.
+
+gem5 compiled Jan 19 2016 13:45:43
+gem5 started Jan 19 2016 13:46:17
+gem5 executing on zizzer, pid 51290
+command line: build/HSAIL_X86/gem5.opt -d build/HSAIL_X86/tests/opt/quick/se/04.gpu/x86/linux/gpu-ruby-GPU_VIPER_Region -re /z/atgutier/gem5/gem5-commit/tests/run.py build/HSAIL_X86/tests/opt/quick/se/04.gpu/x86/linux/gpu-ruby-GPU_VIPER_Region
+
+Using GPU kernel code file(s) /dist/m5/regression/test-progs/gpu-hello/bin/x86/linux/gpu-hello-kernel.asm
+Global frequency set at 1000000000000 ticks per second
+Forcing maxCoalescedReqs to 32 (TLB assoc.)
+Forcing maxCoalescedReqs to 32 (TLB assoc.)
+Forcing maxCoalescedReqs to 32 (TLB assoc.)
+Forcing maxCoalescedReqs to 32 (TLB assoc.)
+Forcing maxCoalescedReqs to 32 (TLB assoc.)
+Forcing maxCoalescedReqs to 32 (TLB assoc.)
+info: Entering event queue @ 0. Starting simulation...
+keys = 0x7b2bc0, &keys = 0x798998, keys[0] = 23
+the gpu says:
+elloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloelloe
+Exiting @ tick 468854500 because target called exit()
diff --git a/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Region/stats.txt b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Region/stats.txt
new file mode 100644
index 000000000..6fbd50886
--- /dev/null
+++ b/tests/quick/se/04.gpu/ref/x86/linux/gpu-ruby-GPU_VIPER_Region/stats.txt
@@ -0,0 +1,3418 @@
+
+---------- Begin Simulation Statistics ----------
+sim_seconds 0.000469 # Number of seconds simulated
+sim_ticks 468854500 # Number of ticks simulated
+final_tick 468854500 # Number of ticks from beginning of simulation (restored from checkpoints and never reset)
+sim_freq 1000000000000 # Frequency of simulated ticks
+host_inst_rate 67943 # Simulator instruction rate (inst/s)
+host_op_rate 139717 # Simulator op (including micro ops) rate (op/s)
+host_tick_rate 475693968 # Simulator tick rate (ticks/s)
+host_mem_usage 1301796 # Number of bytes of host memory used
+host_seconds 0.99 # Real time elapsed on the host
+sim_insts 66963 # Number of instructions simulated
+sim_ops 137705 # Number of ops (including micro ops) simulated
+system.voltage_domain.voltage 1 # Voltage in Volts
+system.clk_domain.clock 1000 # Clock period in ticks
+system.mem_ctrls.bytes_read::dir_cntrl0 100032 # Number of bytes read from this memory
+system.mem_ctrls.bytes_read::total 100032 # Number of bytes read from this memory
+system.mem_ctrls.num_reads::dir_cntrl0 1563 # Number of read requests responded to by this memory
+system.mem_ctrls.num_reads::total 1563 # Number of read requests responded to by this memory
+system.mem_ctrls.bw_read::dir_cntrl0 213354036 # Total read bandwidth from this memory (bytes/s)
+system.mem_ctrls.bw_read::total 213354036 # Total read bandwidth from this memory (bytes/s)
+system.mem_ctrls.bw_total::dir_cntrl0 213354036 # Total bandwidth to/from this memory (bytes/s)
+system.mem_ctrls.bw_total::total 213354036 # Total bandwidth to/from this memory (bytes/s)
+system.mem_ctrls.readReqs 1563 # Number of read requests accepted
+system.mem_ctrls.writeReqs 0 # Number of write requests accepted
+system.mem_ctrls.readBursts 1563 # Number of DRAM read bursts, including those serviced by the write queue
+system.mem_ctrls.writeBursts 0 # Number of DRAM write bursts, including those merged in the write queue
+system.mem_ctrls.bytesReadDRAM 100032 # Total number of bytes read from DRAM
+system.mem_ctrls.bytesReadWrQ 0 # Total number of bytes read from write queue
+system.mem_ctrls.bytesWritten 0 # Total number of bytes written to DRAM
+system.mem_ctrls.bytesReadSys 100032 # Total read bytes from the system interface side
+system.mem_ctrls.bytesWrittenSys 0 # Total written bytes from the system interface side
+system.mem_ctrls.servicedByWrQ 0 # Number of DRAM read bursts serviced by the write queue
+system.mem_ctrls.mergedWrBursts 0 # Number of DRAM write bursts merged with an existing one
+system.mem_ctrls.neitherReadNorWriteReqs 0 # Number of requests that are neither read nor write
+system.mem_ctrls.perBankRdBursts::0 122 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::1 192 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::2 93 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::3 44 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::4 61 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::5 79 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::6 52 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::7 42 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::8 54 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::9 56 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::10 183 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::11 90 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::12 225 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::13 125 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::14 51 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::15 94 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::0 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::1 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::2 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::3 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::4 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::5 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::6 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::7 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::8 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::9 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::10 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::11 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::12 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::13 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::14 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::15 0 # Per bank write bursts
+system.mem_ctrls.numRdRetry 0 # Number of times read queue was full causing retry
+system.mem_ctrls.numWrRetry 0 # Number of times write queue was full causing retry
+system.mem_ctrls.totGap 468627000 # Total gap between requests
+system.mem_ctrls.readPktSize::0 0 # Read request sizes (log2)
+system.mem_ctrls.readPktSize::1 0 # Read request sizes (log2)
+system.mem_ctrls.readPktSize::2 0 # Read request sizes (log2)
+system.mem_ctrls.readPktSize::3 0 # Read request sizes (log2)
+system.mem_ctrls.readPktSize::4 0 # Read request sizes (log2)
+system.mem_ctrls.readPktSize::5 0 # Read request sizes (log2)
+system.mem_ctrls.readPktSize::6 1563 # Read request sizes (log2)
+system.mem_ctrls.writePktSize::0 0 # Write request sizes (log2)
+system.mem_ctrls.writePktSize::1 0 # Write request sizes (log2)
+system.mem_ctrls.writePktSize::2 0 # Write request sizes (log2)
+system.mem_ctrls.writePktSize::3 0 # Write request sizes (log2)
+system.mem_ctrls.writePktSize::4 0 # Write request sizes (log2)
+system.mem_ctrls.writePktSize::5 0 # Write request sizes (log2)
+system.mem_ctrls.writePktSize::6 0 # Write request sizes (log2)
+system.mem_ctrls.rdQLenPdf::0 1548 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::1 4 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::2 2 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::3 2 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::4 2 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::5 2 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::6 2 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::7 1 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::8 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::9 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::10 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::11 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::12 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::13 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::14 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::15 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::16 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::17 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::18 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::19 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::20 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::21 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::22 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::23 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::24 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::25 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::26 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::27 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::28 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::29 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::30 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::31 0 # What read queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::0 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::1 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::2 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::3 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::4 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::5 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::6 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::7 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::8 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::9 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::10 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::11 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::12 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::13 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::14 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::15 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::16 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::17 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::18 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::19 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::20 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::21 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::22 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::23 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::24 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::25 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::26 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::27 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::28 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::29 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::30 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::31 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::32 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::33 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::34 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::35 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::36 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::37 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::38 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::39 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::40 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::41 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::42 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::43 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::44 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::45 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::46 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::47 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::48 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::49 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::50 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::51 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::52 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::53 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::54 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::55 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::56 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::57 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::58 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::59 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::60 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::61 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::62 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::63 0 # What write queue length does an incoming req see
+system.mem_ctrls.bytesPerActivate::samples 450 # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::mean 221.297778 # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::gmean 151.217299 # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::stdev 224.192300 # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::0-127 165 36.67% 36.67% # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::128-255 148 32.89% 69.56% # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::256-383 55 12.22% 81.78% # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::384-511 28 6.22% 88.00% # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::512-639 19 4.22% 92.22% # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::640-767 11 2.44% 94.67% # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::768-895 8 1.78% 96.44% # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::896-1023 6 1.33% 97.78% # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::1024-1151 10 2.22% 100.00% # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::total 450 # Bytes accessed per row activation
+system.mem_ctrls.totQLat 14130749 # Total ticks spent queuing
+system.mem_ctrls.totMemAccLat 43436999 # Total ticks spent from burst creation until serviced by the DRAM
+system.mem_ctrls.totBusLat 7815000 # Total ticks spent in databus transfers
+system.mem_ctrls.avgQLat 9040.79 # Average queueing delay per DRAM burst
+system.mem_ctrls.avgBusLat 5000.00 # Average bus latency per DRAM burst
+system.mem_ctrls.avgMemAccLat 27790.79 # Average memory access latency per DRAM burst
+system.mem_ctrls.avgRdBW 213.35 # Average DRAM read bandwidth in MiByte/s
+system.mem_ctrls.avgWrBW 0.00 # Average achieved write bandwidth in MiByte/s
+system.mem_ctrls.avgRdBWSys 213.35 # Average system read bandwidth in MiByte/s
+system.mem_ctrls.avgWrBWSys 0.00 # Average system write bandwidth in MiByte/s
+system.mem_ctrls.peakBW 12800.00 # Theoretical peak bandwidth in MiByte/s
+system.mem_ctrls.busUtil 1.67 # Data bus utilization in percentage
+system.mem_ctrls.busUtilRead 1.67 # Data bus utilization in percentage for reads
+system.mem_ctrls.busUtilWrite 0.00 # Data bus utilization in percentage for writes
+system.mem_ctrls.avgRdQLen 1.01 # Average read queue length when enqueuing
+system.mem_ctrls.avgWrQLen 0.00 # Average write queue length when enqueuing
+system.mem_ctrls.readRowHits 1109 # Number of row buffer hits during reads
+system.mem_ctrls.writeRowHits 0 # Number of row buffer hits during writes
+system.mem_ctrls.readRowHitRate 70.95 # Row buffer hit rate for reads
+system.mem_ctrls.writeRowHitRate nan # Row buffer hit rate for writes
+system.mem_ctrls.avgGap 299825.34 # Average gap between requests
+system.mem_ctrls.pageHitRate 70.95 # Row buffer hit rate, read and write combined
+system.mem_ctrls_0.actEnergy 1300320 # Energy for activate commands per rank (pJ)
+system.mem_ctrls_0.preEnergy 709500 # Energy for precharge commands per rank (pJ)
+system.mem_ctrls_0.readEnergy 5335200 # Energy for read commands per rank (pJ)
+system.mem_ctrls_0.writeEnergy 0 # Energy for write commands per rank (pJ)
+system.mem_ctrls_0.refreshEnergy 30513600 # Energy for refresh commands per rank (pJ)
+system.mem_ctrls_0.actBackEnergy 265391145 # Energy for active background per rank (pJ)
+system.mem_ctrls_0.preBackEnergy 47661750 # Energy for precharge background per rank (pJ)
+system.mem_ctrls_0.totalEnergy 350911515 # Total energy per rank (pJ)
+system.mem_ctrls_0.averagePower 750.717244 # Core power per rank (mW)
+system.mem_ctrls_0.memoryStateTime::IDLE 79008000 # Time in different power states
+system.mem_ctrls_0.memoryStateTime::REF 15600000 # Time in different power states
+system.mem_ctrls_0.memoryStateTime::PRE_PDN 0 # Time in different power states
+system.mem_ctrls_0.memoryStateTime::ACT 374147000 # Time in different power states
+system.mem_ctrls_0.memoryStateTime::ACT_PDN 0 # Time in different power states
+system.mem_ctrls_1.actEnergy 2101680 # Energy for activate commands per rank (pJ)
+system.mem_ctrls_1.preEnergy 1146750 # Energy for precharge commands per rank (pJ)
+system.mem_ctrls_1.readEnergy 6801600 # Energy for read commands per rank (pJ)
+system.mem_ctrls_1.writeEnergy 0 # Energy for write commands per rank (pJ)
+system.mem_ctrls_1.refreshEnergy 30513600 # Energy for refresh commands per rank (pJ)
+system.mem_ctrls_1.actBackEnergy 276170130 # Energy for active background per rank (pJ)
+system.mem_ctrls_1.preBackEnergy 38206500 # Energy for precharge background per rank (pJ)
+system.mem_ctrls_1.totalEnergy 354940260 # Total energy per rank (pJ)
+system.mem_ctrls_1.averagePower 759.336079 # Core power per rank (mW)
+system.mem_ctrls_1.memoryStateTime::IDLE 61948750 # Time in different power states
+system.mem_ctrls_1.memoryStateTime::REF 15600000 # Time in different power states
+system.mem_ctrls_1.memoryStateTime::PRE_PDN 0 # Time in different power states
+system.mem_ctrls_1.memoryStateTime::ACT 389900000 # Time in different power states
+system.mem_ctrls_1.memoryStateTime::ACT_PDN 0 # Time in different power states
+system.ruby.clk_domain.clock 500 # Clock period in ticks
+system.ruby.phys_mem.bytes_read::cpu0.inst 696760 # Number of bytes read from this memory
+system.ruby.phys_mem.bytes_read::cpu0.data 119832 # Number of bytes read from this memory
+system.ruby.phys_mem.bytes_read::cpu1.CUs0.ComputeUnit 3280 # Number of bytes read from this memory
+system.ruby.phys_mem.bytes_read::cpu1.CUs1.ComputeUnit 3280 # Number of bytes read from this memory
+system.ruby.phys_mem.bytes_read::total 823152 # Number of bytes read from this memory
+system.ruby.phys_mem.bytes_inst_read::cpu0.inst 696760 # Number of instructions bytes read from this memory
+system.ruby.phys_mem.bytes_inst_read::cpu1.CUs0.ComputeUnit 2000 # Number of instructions bytes read from this memory
+system.ruby.phys_mem.bytes_inst_read::cpu1.CUs1.ComputeUnit 2000 # Number of instructions bytes read from this memory
+system.ruby.phys_mem.bytes_inst_read::total 700760 # Number of instructions bytes read from this memory
+system.ruby.phys_mem.bytes_written::cpu0.data 72767 # Number of bytes written to this memory
+system.ruby.phys_mem.bytes_written::cpu1.CUs0.ComputeUnit 256 # Number of bytes written to this memory
+system.ruby.phys_mem.bytes_written::cpu1.CUs1.ComputeUnit 256 # Number of bytes written to this memory
+system.ruby.phys_mem.bytes_written::total 73279 # Number of bytes written to this memory
+system.ruby.phys_mem.num_reads::cpu0.inst 87095 # Number of read requests responded to by this memory
+system.ruby.phys_mem.num_reads::cpu0.data 16686 # Number of read requests responded to by this memory
+system.ruby.phys_mem.num_reads::cpu1.CUs0.ComputeUnit 555 # Number of read requests responded to by this memory
+system.ruby.phys_mem.num_reads::cpu1.CUs1.ComputeUnit 555 # Number of read requests responded to by this memory
+system.ruby.phys_mem.num_reads::total 104891 # Number of read requests responded to by this memory
+system.ruby.phys_mem.num_writes::cpu0.data 10422 # Number of write requests responded to by this memory
+system.ruby.phys_mem.num_writes::cpu1.CUs0.ComputeUnit 256 # Number of write requests responded to by this memory
+system.ruby.phys_mem.num_writes::cpu1.CUs1.ComputeUnit 256 # Number of write requests responded to by this memory
+system.ruby.phys_mem.num_writes::total 10934 # Number of write requests responded to by this memory
+system.ruby.phys_mem.bw_read::cpu0.inst 1486090034 # Total read bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_read::cpu0.data 255584622 # Total read bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_read::cpu1.CUs0.ComputeUnit 6995774 # Total read bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_read::cpu1.CUs1.ComputeUnit 6995774 # Total read bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_read::total 1755666203 # Total read bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_inst_read::cpu0.inst 1486090034 # Instruction read bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_inst_read::cpu1.CUs0.ComputeUnit 4265716 # Instruction read bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_inst_read::cpu1.CUs1.ComputeUnit 4265716 # Instruction read bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_inst_read::total 1494621466 # Instruction read bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_write::cpu0.data 155201667 # Write bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_write::cpu1.CUs0.ComputeUnit 546012 # Write bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_write::cpu1.CUs1.ComputeUnit 546012 # Write bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_write::total 156293690 # Write bandwidth from this memory (bytes/s)
+system.ruby.phys_mem.bw_total::cpu0.inst 1486090034 # Total bandwidth to/from this memory (bytes/s)
+system.ruby.phys_mem.bw_total::cpu0.data 410786289 # Total bandwidth to/from this memory (bytes/s)
+system.ruby.phys_mem.bw_total::cpu1.CUs0.ComputeUnit 7541785 # Total bandwidth to/from this memory (bytes/s)
+system.ruby.phys_mem.bw_total::cpu1.CUs1.ComputeUnit 7541785 # Total bandwidth to/from this memory (bytes/s)
+system.ruby.phys_mem.bw_total::total 1911959894 # Total bandwidth to/from this memory (bytes/s)
+system.ruby.outstanding_req_hist::bucket_size 1
+system.ruby.outstanding_req_hist::max_bucket 9
+system.ruby.outstanding_req_hist::samples 114203
+system.ruby.outstanding_req_hist::mean 1.000035
+system.ruby.outstanding_req_hist::gmean 1.000024
+system.ruby.outstanding_req_hist::stdev 0.005918
+system.ruby.outstanding_req_hist | 0 0.00% 0.00% | 114199 100.00% 100.00% | 4 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.outstanding_req_hist::total 114203
+system.ruby.latency_hist::bucket_size 64
+system.ruby.latency_hist::max_bucket 639
+system.ruby.latency_hist::samples 114203
+system.ruby.latency_hist::mean 3.070988
+system.ruby.latency_hist::gmean 1.072272
+system.ruby.latency_hist::stdev 18.192328
+system.ruby.latency_hist | 112654 98.64% 98.64% | 11 0.01% 98.65% | 1238 1.08% 99.74% | 266 0.23% 99.97% | 14 0.01% 99.98% | 12 0.01% 99.99% | 7 0.01% 100.00% | 1 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.latency_hist::total 114203
+system.ruby.hit_latency_hist::bucket_size 64
+system.ruby.hit_latency_hist::max_bucket 639
+system.ruby.hit_latency_hist::samples 1549
+system.ruby.hit_latency_hist::mean 152.827631
+system.ruby.hit_latency_hist::gmean 149.009432
+system.ruby.hit_latency_hist::stdev 40.628532
+system.ruby.hit_latency_hist | 0 0.00% 0.00% | 11 0.71% 0.71% | 1238 79.92% 80.63% | 266 17.17% 97.81% | 14 0.90% 98.71% | 12 0.77% 99.48% | 7 0.45% 99.94% | 1 0.06% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.hit_latency_hist::total 1549
+system.ruby.miss_latency_hist::bucket_size 2
+system.ruby.miss_latency_hist::max_bucket 19
+system.ruby.miss_latency_hist::samples 112654
+system.ruby.miss_latency_hist::mean 1.011824
+system.ruby.miss_latency_hist::gmean 1.001936
+system.ruby.miss_latency_hist::stdev 0.461184
+system.ruby.miss_latency_hist | 112580 99.93% 99.93% | 0 0.00% 99.93% | 0 0.00% 99.93% | 0 0.00% 99.93% | 0 0.00% 99.93% | 0 0.00% 99.93% | 0 0.00% 99.93% | 0 0.00% 99.93% | 0 0.00% 99.93% | 74 0.07% 100.00%
+system.ruby.miss_latency_hist::total 112654
+system.ruby.L1Cache.incomplete_times 112580
+system.ruby.L2Cache.incomplete_times 74
+system.cp_cntrl0.L1D0cache.demand_hits 0 # Number of cache demand hits
+system.cp_cntrl0.L1D0cache.demand_misses 1556 # Number of cache demand misses
+system.cp_cntrl0.L1D0cache.demand_accesses 1556 # Number of cache demand accesses
+system.cp_cntrl0.L1D0cache.num_data_array_reads 16142 # number of data array reads
+system.cp_cntrl0.L1D0cache.num_data_array_writes 11998 # number of data array writes
+system.cp_cntrl0.L1D0cache.num_tag_array_reads 27136 # number of tag array reads
+system.cp_cntrl0.L1D0cache.num_tag_array_writes 1431 # number of tag array writes
+system.cp_cntrl0.L1D1cache.demand_hits 0 # Number of cache demand hits
+system.cp_cntrl0.L1D1cache.demand_misses 0 # Number of cache demand misses
+system.cp_cntrl0.L1D1cache.demand_accesses 0 # Number of cache demand accesses
+system.cp_cntrl0.L1Icache.demand_hits 0 # Number of cache demand hits
+system.cp_cntrl0.L1Icache.demand_misses 1287 # Number of cache demand misses
+system.cp_cntrl0.L1Icache.demand_accesses 1287 # Number of cache demand accesses
+system.cp_cntrl0.L1Icache.num_data_array_reads 85994 # number of data array reads
+system.cp_cntrl0.L1Icache.num_data_array_writes 67 # number of data array writes
+system.cp_cntrl0.L1Icache.num_tag_array_reads 87697 # number of tag array reads
+system.cp_cntrl0.L1Icache.num_tag_array_writes 67 # number of tag array writes
+system.cp_cntrl0.L2cache.demand_hits 0 # Number of cache demand hits
+system.cp_cntrl0.L2cache.demand_misses 1549 # Number of cache demand misses
+system.cp_cntrl0.L2cache.demand_accesses 1549 # Number of cache demand accesses
+system.cp_cntrl0.L2cache.num_data_array_reads 167 # number of data array reads
+system.cp_cntrl0.L2cache.num_data_array_writes 11993 # number of data array writes
+system.cp_cntrl0.L2cache.num_tag_array_reads 12092 # number of tag array reads
+system.cp_cntrl0.L2cache.num_tag_array_writes 1694 # number of tag array writes
+system.cpu0.clk_domain.clock 500 # Clock period in ticks
+system.cpu0.apic_clk_domain.clock 8000 # Clock period in ticks
+system.cpu0.workload.num_syscalls 21 # Number of system calls
+system.cpu0.numCycles 937709 # number of cpu cycles simulated
+system.cpu0.numWorkItemsStarted 0 # number of work items this cpu started
+system.cpu0.numWorkItemsCompleted 0 # number of work items this cpu completed
+system.cpu0.committedInsts 66963 # Number of instructions committed
+system.cpu0.committedOps 137705 # Number of ops (including micro ops) committed
+system.cpu0.num_int_alu_accesses 136380 # Number of integer alu accesses
+system.cpu0.num_fp_alu_accesses 1279 # Number of float alu accesses
+system.cpu0.num_func_calls 3196 # number of times a function call or return occured
+system.cpu0.num_conditional_control_insts 12151 # number of instructions that are conditional controls
+system.cpu0.num_int_insts 136380 # number of integer instructions
+system.cpu0.num_fp_insts 1279 # number of float instructions
+system.cpu0.num_int_register_reads 257490 # number of times the integer registers were read
+system.cpu0.num_int_register_writes 110039 # number of times the integer registers were written
+system.cpu0.num_fp_register_reads 1981 # number of times the floating registers were read
+system.cpu0.num_fp_register_writes 981 # number of times the floating registers were written
+system.cpu0.num_cc_register_reads 78262 # number of times the CC registers were read
+system.cpu0.num_cc_register_writes 42183 # number of times the CC registers were written
+system.cpu0.num_mem_refs 27198 # number of memory refs
+system.cpu0.num_load_insts 16684 # Number of load instructions
+system.cpu0.num_store_insts 10514 # Number of store instructions
+system.cpu0.num_idle_cycles 7323.003984 # Number of idle cycles
+system.cpu0.num_busy_cycles 930385.996016 # Number of busy cycles
+system.cpu0.not_idle_fraction 0.992191 # Percentage of non-idle cycles
+system.cpu0.idle_fraction 0.007809 # Percentage of idle cycles
+system.cpu0.Branches 16199 # Number of branches fetched
+system.cpu0.op_class::No_OpClass 615 0.45% 0.45% # Class of executed instruction
+system.cpu0.op_class::IntAlu 108791 79.00% 79.45% # Class of executed instruction
+system.cpu0.op_class::IntMult 13 0.01% 79.46% # Class of executed instruction
+system.cpu0.op_class::IntDiv 138 0.10% 79.56% # Class of executed instruction
+system.cpu0.op_class::FloatAdd 950 0.69% 80.25% # Class of executed instruction
+system.cpu0.op_class::FloatCmp 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::FloatCvt 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::FloatMult 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::FloatDiv 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::FloatSqrt 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdAdd 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdAddAcc 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdAlu 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdCmp 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdCvt 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdMisc 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdMult 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdMultAcc 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdShift 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdShiftAcc 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdSqrt 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdFloatAdd 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdFloatAlu 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdFloatCmp 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdFloatCvt 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdFloatDiv 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdFloatMisc 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdFloatMult 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdFloatMultAcc 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::SimdFloatSqrt 0 0.00% 80.25% # Class of executed instruction
+system.cpu0.op_class::MemRead 16684 12.12% 92.36% # Class of executed instruction
+system.cpu0.op_class::MemWrite 10514 7.64% 100.00% # Class of executed instruction
+system.cpu0.op_class::IprAccess 0 0.00% 100.00% # Class of executed instruction
+system.cpu0.op_class::InstPrefetch 0 0.00% 100.00% # Class of executed instruction
+system.cpu0.op_class::total 137705 # Class of executed instruction
+system.cpu1.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.cpu1.clk_domain.clock 1000 # Clock period in ticks
+system.cpu1.CUs0.wavefronts00.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts00.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts00.timesBlockedDueRAWDependencies 271 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::samples 39 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::mean 0.794872 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::stdev 0.863880 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::0-1 28 71.79% 71.79% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::2-3 11 28.21% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts00.src_reg_operand_dist::total 39 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::samples 39 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::mean 0.589744 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::stdev 0.498310 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::0-1 39 100.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts00.dst_reg_operand_dist::total 39 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts01.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts01.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts01.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts01.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts01.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts02.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts02.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts02.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts02.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts02.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts03.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts03.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts03.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts03.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts03.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts04.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts04.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts04.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts04.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts04.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts05.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts05.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts05.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts05.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts05.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts06.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts06.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts06.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts06.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts06.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts07.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts07.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts07.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts07.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts07.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts08.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts08.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts08.timesBlockedDueRAWDependencies 252 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::samples 34 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::mean 0.852941 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::stdev 0.857493 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::0-1 24 70.59% 70.59% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::2-3 10 29.41% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts08.src_reg_operand_dist::total 34 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::samples 34 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::mean 0.617647 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::stdev 0.493270 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::0-1 34 100.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts08.dst_reg_operand_dist::total 34 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts09.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts09.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts09.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts09.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts09.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts10.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts10.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts10.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts10.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts10.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts11.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts11.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts11.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts11.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts11.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts12.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts12.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts12.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts12.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts12.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts13.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts13.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts13.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts13.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts13.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts14.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts14.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts14.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts14.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts14.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts15.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts15.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts15.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts15.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts15.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts16.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts16.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts16.timesBlockedDueRAWDependencies 243 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::samples 34 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::mean 0.852941 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::stdev 0.857493 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::0-1 24 70.59% 70.59% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::2-3 10 29.41% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts16.src_reg_operand_dist::total 34 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::samples 34 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::mean 0.617647 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::stdev 0.493270 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::0-1 34 100.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts16.dst_reg_operand_dist::total 34 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts17.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts17.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts17.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts17.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts17.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts18.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts18.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts18.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts18.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts18.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts19.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts19.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts19.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts19.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts19.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts20.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts20.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts20.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts20.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts20.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts21.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts21.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts21.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts21.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts21.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts22.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts22.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts22.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts22.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts22.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts23.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts23.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts23.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts23.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts23.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts24.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts24.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts24.timesBlockedDueRAWDependencies 228 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::samples 34 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::mean 0.852941 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::stdev 0.857493 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::0-1 24 70.59% 70.59% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::2-3 10 29.41% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts24.src_reg_operand_dist::total 34 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::samples 34 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::mean 0.617647 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::stdev 0.493270 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::0-1 34 100.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts24.dst_reg_operand_dist::total 34 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts25.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts25.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts25.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts25.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts25.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts26.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts26.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts26.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts26.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts26.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts27.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts27.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts27.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts27.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts27.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts28.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts28.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts28.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts28.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts28.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts29.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts29.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts29.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts29.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts29.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts30.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts30.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts30.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts30.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts30.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts31.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs0.wavefronts31.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs0.wavefronts31.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts31.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.wavefronts31.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::samples 43 # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::mean 5.813953 # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::stdev 2.683777 # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::underflows 0 0.00% 0.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::1 0 0.00% 0.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::2 8 18.60% 18.60% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::3 8 18.60% 37.21% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::4 1 2.33% 39.53% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::5 0 0.00% 39.53% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::6 1 2.33% 41.86% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::7 0 0.00% 41.86% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::8 25 58.14% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::9 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::10 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::11 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::12 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::13 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::14 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::15 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::16 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::17 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::18 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::19 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::20 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::21 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::22 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::23 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::24 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::25 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::26 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::27 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::28 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::29 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::30 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::31 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::32 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::overflows 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::min_value 2 # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::max_value 8 # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.FetchStage.inst_fetch_instr_returned::total 43 # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs0.ExecStage.num_cycles_with_no_issue 4103 # number of cycles the CU issues nothing
+system.cpu1.CUs0.ExecStage.num_cycles_with_instr_issued 133 # number of cycles the CU issued at least one instruction
+system.cpu1.CUs0.ExecStage.num_cycles_with_instrtype_issue::ALU0 30 # Number of cycles at least one instruction of specific type issued
+system.cpu1.CUs0.ExecStage.num_cycles_with_instrtype_issue::ALU1 29 # Number of cycles at least one instruction of specific type issued
+system.cpu1.CUs0.ExecStage.num_cycles_with_instrtype_issue::ALU2 29 # Number of cycles at least one instruction of specific type issued
+system.cpu1.CUs0.ExecStage.num_cycles_with_instrtype_issue::ALU3 29 # Number of cycles at least one instruction of specific type issued
+system.cpu1.CUs0.ExecStage.num_cycles_with_instrtype_issue::GM 18 # Number of cycles at least one instruction of specific type issued
+system.cpu1.CUs0.ExecStage.num_cycles_with_instrtype_issue::LM 6 # Number of cycles at least one instruction of specific type issued
+system.cpu1.CUs0.ExecStage.num_cycles_with_instr_type_no_issue::ALU0 1359 # Number of cycles no instruction of specific type issued
+system.cpu1.CUs0.ExecStage.num_cycles_with_instr_type_no_issue::ALU1 382 # Number of cycles no instruction of specific type issued
+system.cpu1.CUs0.ExecStage.num_cycles_with_instr_type_no_issue::ALU2 338 # Number of cycles no instruction of specific type issued
+system.cpu1.CUs0.ExecStage.num_cycles_with_instr_type_no_issue::ALU3 302 # Number of cycles no instruction of specific type issued
+system.cpu1.CUs0.ExecStage.num_cycles_with_instr_type_no_issue::GM 373 # Number of cycles no instruction of specific type issued
+system.cpu1.CUs0.ExecStage.num_cycles_with_instr_type_no_issue::LM 26 # Number of cycles no instruction of specific type issued
+system.cpu1.CUs0.ExecStage.spc::samples 4236 # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::mean 0.033286 # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::stdev 0.190882 # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::underflows 0 0.00% 0.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::0 4103 96.86% 96.86% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::1 126 2.97% 99.83% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::2 6 0.14% 99.98% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::3 1 0.02% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::4 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::5 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::6 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::overflows 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::min_value 0 # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::max_value 3 # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.spc::total 4236 # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs0.ExecStage.num_transitions_active_to_idle 68 # number of CU transitions from active to idle
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::samples 68 # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::mean 53.455882 # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::stdev 203.558231 # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::underflows 0 0.00% 0.00% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::0-4 48 70.59% 70.59% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::5-9 8 11.76% 82.35% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::10-14 1 1.47% 83.82% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::15-19 1 1.47% 85.29% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::20-24 2 2.94% 88.24% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::25-29 1 1.47% 89.71% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::30-34 0 0.00% 89.71% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::35-39 0 0.00% 89.71% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::40-44 0 0.00% 89.71% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::45-49 0 0.00% 89.71% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::50-54 0 0.00% 89.71% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::55-59 0 0.00% 89.71% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::60-64 0 0.00% 89.71% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::65-69 0 0.00% 89.71% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::70-74 0 0.00% 89.71% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::75 0 0.00% 89.71% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::overflows 7 10.29% 100.00% # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::min_value 1 # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::max_value 1317 # duration of idle periods in cycles
+system.cpu1.CUs0.ExecStage.idle_duration_in_cycles::total 68 # duration of idle periods in cycles
+system.cpu1.CUs0.GlobalMemPipeline.load_vrf_bank_conflict_cycles 0 # total number of cycles GM data are delayed before updating the VRF
+system.cpu1.CUs0.LocalMemPipeline.load_vrf_bank_conflict_cycles 0 # total number of cycles LDS data are delayed before updating the VRF
+system.cpu1.CUs0.tlb_requests 769 # number of uncoalesced requests
+system.cpu1.CUs0.tlb_cycles -318202403000 # total number of cycles for all uncoalesced requests
+system.cpu1.CUs0.avg_translation_latency -413787260.078023 # Avg. translation latency for data translations
+system.cpu1.CUs0.TLB_hits_distribution::page_table 769 # TLB hits distribution (0 for page table, x for Lx-TLB
+system.cpu1.CUs0.TLB_hits_distribution::L1_TLB 0 # TLB hits distribution (0 for page table, x for Lx-TLB
+system.cpu1.CUs0.TLB_hits_distribution::L2_TLB 0 # TLB hits distribution (0 for page table, x for Lx-TLB
+system.cpu1.CUs0.TLB_hits_distribution::L3_TLB 0 # TLB hits distribution (0 for page table, x for Lx-TLB
+system.cpu1.CUs0.lds_bank_access_cnt 54 # Total number of LDS bank accesses
+system.cpu1.CUs0.lds_bank_conflicts::samples 6 # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::mean 8 # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::stdev 6.196773 # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::underflows 0 0.00% 0.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::0-1 2 33.33% 33.33% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::2-3 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::4-5 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::6-7 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::8-9 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::10-11 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::12-13 4 66.67% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::14-15 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::16-17 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::18-19 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::20-21 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::22-23 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::24-25 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::26-27 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::28-29 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::30-31 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::32-33 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::34-35 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::36-37 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::38-39 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::40-41 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::42-43 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::44-45 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::46-47 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::48-49 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::50-51 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::52-53 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::54-55 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::56-57 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::58-59 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::60-61 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::62-63 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::64 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::overflows 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::min_value 0 # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::max_value 12 # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.lds_bank_conflicts::total 6 # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs0.page_divergence_dist::samples 17 # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::mean 1 # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::stdev 0 # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::underflows 0 0.00% 0.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::1-4 17 100.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::5-8 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::9-12 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::13-16 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::17-20 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::21-24 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::25-28 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::29-32 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::33-36 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::37-40 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::41-44 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::45-48 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::49-52 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::53-56 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::57-60 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::61-64 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::overflows 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::min_value 1 # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::max_value 1 # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.page_divergence_dist::total 17 # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs0.global_mem_instr_cnt 17 # dynamic global memory instructions count
+system.cpu1.CUs0.local_mem_instr_cnt 6 # dynamic local memory intruction count
+system.cpu1.CUs0.wg_blocked_due_lds_alloc 0 # Workgroup blocked due to LDS capacity
+system.cpu1.CUs0.num_instr_executed 141 # number of instructions executed
+system.cpu1.CUs0.inst_exec_rate::samples 141 # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::mean 84.978723 # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::stdev 240.114362 # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::underflows 0 0.00% 0.00% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::0-1 1 0.71% 0.71% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::2-3 12 8.51% 9.22% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::4-5 53 37.59% 46.81% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::6-7 31 21.99% 68.79% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::8-9 3 2.13% 70.92% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::10 1 0.71% 71.63% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::overflows 40 28.37% 100.00% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::min_value 1 # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::max_value 1320 # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.inst_exec_rate::total 141 # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs0.num_vec_ops_executed 6769 # number of vec ops executed (e.g. VSZ/inst)
+system.cpu1.CUs0.num_total_cycles 4236 # number of cycles the CU ran for
+system.cpu1.CUs0.vpc 1.597970 # Vector Operations per cycle (this CU only)
+system.cpu1.CUs0.ipc 0.033286 # Instructions per cycle (this CU only)
+system.cpu1.CUs0.warp_execution_dist::samples 141 # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::mean 48.007092 # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::stdev 23.719942 # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::underflows 0 0.00% 0.00% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::1-4 5 3.55% 3.55% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::5-8 0 0.00% 3.55% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::9-12 0 0.00% 3.55% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::13-16 36 25.53% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::17-20 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::21-24 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::25-28 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::29-32 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::33-36 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::37-40 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::41-44 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::45-48 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::49-52 8 5.67% 34.75% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::53-56 0 0.00% 34.75% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::57-60 0 0.00% 34.75% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::61-64 92 65.25% 100.00% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::overflows 0 0.00% 100.00% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::min_value 1 # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::max_value 64 # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.warp_execution_dist::total 141 # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs0.gmem_lanes_execution_dist::samples 18 # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::mean 37.833333 # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::stdev 27.064737 # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::underflows 0 0.00% 0.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::1-4 1 5.56% 5.56% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::5-8 0 0.00% 5.56% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::9-12 0 0.00% 5.56% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::13-16 8 44.44% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::17-20 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::21-24 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::25-28 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::29-32 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::33-36 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::37-40 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::41-44 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::45-48 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::49-52 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::53-56 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::57-60 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::61-64 9 50.00% 100.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::overflows 0 0.00% 100.00% # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::min_value 1 # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::max_value 64 # number of active lanes per global memory instruction
+system.cpu1.CUs0.gmem_lanes_execution_dist::total 18 # number of active lanes per global memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::samples 6 # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::mean 19.500000 # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::stdev 22.322634 # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::underflows 0 0.00% 0.00% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::1-4 1 16.67% 16.67% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::5-8 0 0.00% 16.67% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::9-12 0 0.00% 16.67% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::13-16 4 66.67% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::17-20 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::21-24 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::25-28 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::29-32 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::33-36 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::37-40 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::41-44 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::45-48 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::49-52 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::53-56 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::57-60 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::61-64 1 16.67% 100.00% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::overflows 0 0.00% 100.00% # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::min_value 1 # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::max_value 64 # number of active lanes per local memory instruction
+system.cpu1.CUs0.lmem_lanes_execution_dist::total 6 # number of active lanes per local memory instruction
+system.cpu1.CUs0.num_alu_insts_executed 118 # Number of dynamic non-GM memory insts executed
+system.cpu1.CUs0.times_wg_blocked_due_vgpr_alloc 0 # Number of times WGs are blocked due to VGPR allocation per SIMD
+system.cpu1.CUs0.num_CAS_ops 0 # number of compare and swap operations
+system.cpu1.CUs0.num_failed_CAS_ops 0 # number of compare and swap operations that failed
+system.cpu1.CUs0.num_completed_wfs 4 # number of completed wavefronts
+system.cpu1.CUs1.wavefronts00.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts00.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts00.timesBlockedDueRAWDependencies 276 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::samples 39 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::mean 0.794872 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::stdev 0.863880 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::0-1 28 71.79% 71.79% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::2-3 11 28.21% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts00.src_reg_operand_dist::total 39 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::samples 39 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::mean 0.589744 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::stdev 0.498310 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::0-1 39 100.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts00.dst_reg_operand_dist::total 39 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts01.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts01.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts01.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts01.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts01.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts02.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts02.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts02.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts02.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts02.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts03.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts03.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts03.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts03.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts03.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts04.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts04.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts04.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts04.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts04.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts05.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts05.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts05.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts05.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts05.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts06.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts06.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts06.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts06.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts06.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts07.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts07.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts07.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts07.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts07.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts08.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts08.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts08.timesBlockedDueRAWDependencies 254 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::samples 34 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::mean 0.852941 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::stdev 0.857493 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::0-1 24 70.59% 70.59% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::2-3 10 29.41% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts08.src_reg_operand_dist::total 34 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::samples 34 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::mean 0.617647 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::stdev 0.493270 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::0-1 34 100.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts08.dst_reg_operand_dist::total 34 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts09.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts09.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts09.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts09.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts09.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts10.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts10.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts10.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts10.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts10.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts11.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts11.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts11.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts11.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts11.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts12.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts12.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts12.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts12.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts12.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts13.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts13.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts13.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts13.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts13.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts14.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts14.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts14.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts14.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts14.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts15.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts15.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts15.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts15.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts15.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts16.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts16.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts16.timesBlockedDueRAWDependencies 251 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::samples 34 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::mean 0.852941 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::stdev 0.857493 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::0-1 24 70.59% 70.59% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::2-3 10 29.41% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts16.src_reg_operand_dist::total 34 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::samples 34 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::mean 0.617647 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::stdev 0.493270 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::0-1 34 100.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts16.dst_reg_operand_dist::total 34 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts17.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts17.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts17.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts17.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts17.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts18.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts18.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts18.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts18.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts18.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts19.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts19.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts19.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts19.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts19.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts20.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts20.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts20.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts20.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts20.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts21.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts21.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts21.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts21.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts21.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts22.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts22.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts22.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts22.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts22.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts23.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts23.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts23.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts23.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts23.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts24.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts24.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts24.timesBlockedDueRAWDependencies 236 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::samples 34 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::mean 0.852941 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::stdev 0.857493 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::0-1 24 70.59% 70.59% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::2-3 10 29.41% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::4 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::max_value 2 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts24.src_reg_operand_dist::total 34 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::samples 34 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::mean 0.617647 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::stdev 0.493270 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::underflows 0 0.00% 0.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::0-1 34 100.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::2-3 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::overflows 0 0.00% 100.00% # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::max_value 1 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts24.dst_reg_operand_dist::total 34 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts25.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts25.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts25.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts25.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts25.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts26.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts26.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts26.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts26.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts26.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts27.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts27.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts27.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts27.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts27.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts28.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts28.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts28.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts28.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts28.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts29.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts29.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts29.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts29.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts29.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts30.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts30.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts30.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts30.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts30.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts31.timesBlockedDueVrfPortAvail 0 # number of times instructions are blocked due to VRF port availability
+system.cpu1.CUs1.wavefronts31.timesBlockedDueWAXDependencies 0 # number of times the wf's instructions are blocked due to WAW or WAR dependencies
+system.cpu1.CUs1.wavefronts31.timesBlockedDueRAWDependencies 0 # number of times the wf's instructions are blocked due to RAW dependencies
+system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::samples 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::mean nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::stdev nan # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::underflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::0-1 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::2-3 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::4 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::overflows 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::min_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::max_value 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts31.src_reg_operand_dist::total 0 # number of executed instructions with N source register operands
+system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::samples 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::mean nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::stdev nan # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::underflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::0-1 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::2-3 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::overflows 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::min_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::max_value 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.wavefronts31.dst_reg_operand_dist::total 0 # number of executed instructions with N destination register operands
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::samples 43 # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::mean 5.813953 # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::stdev 2.683777 # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::underflows 0 0.00% 0.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::1 0 0.00% 0.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::2 8 18.60% 18.60% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::3 8 18.60% 37.21% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::4 1 2.33% 39.53% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::5 0 0.00% 39.53% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::6 1 2.33% 41.86% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::7 0 0.00% 41.86% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::8 25 58.14% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::9 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::10 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::11 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::12 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::13 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::14 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::15 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::16 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::17 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::18 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::19 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::20 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::21 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::22 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::23 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::24 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::25 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::26 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::27 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::28 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::29 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::30 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::31 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::32 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::overflows 0 0.00% 100.00% # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::min_value 2 # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::max_value 8 # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.FetchStage.inst_fetch_instr_returned::total 43 # For each instruction fetch request recieved record how many instructions you got from it
+system.cpu1.CUs1.ExecStage.num_cycles_with_no_issue 4105 # number of cycles the CU issues nothing
+system.cpu1.CUs1.ExecStage.num_cycles_with_instr_issued 131 # number of cycles the CU issued at least one instruction
+system.cpu1.CUs1.ExecStage.num_cycles_with_instrtype_issue::ALU0 30 # Number of cycles at least one instruction of specific type issued
+system.cpu1.CUs1.ExecStage.num_cycles_with_instrtype_issue::ALU1 29 # Number of cycles at least one instruction of specific type issued
+system.cpu1.CUs1.ExecStage.num_cycles_with_instrtype_issue::ALU2 29 # Number of cycles at least one instruction of specific type issued
+system.cpu1.CUs1.ExecStage.num_cycles_with_instrtype_issue::ALU3 29 # Number of cycles at least one instruction of specific type issued
+system.cpu1.CUs1.ExecStage.num_cycles_with_instrtype_issue::GM 18 # Number of cycles at least one instruction of specific type issued
+system.cpu1.CUs1.ExecStage.num_cycles_with_instrtype_issue::LM 6 # Number of cycles at least one instruction of specific type issued
+system.cpu1.CUs1.ExecStage.num_cycles_with_instr_type_no_issue::ALU0 1525 # Number of cycles no instruction of specific type issued
+system.cpu1.CUs1.ExecStage.num_cycles_with_instr_type_no_issue::ALU1 346 # Number of cycles no instruction of specific type issued
+system.cpu1.CUs1.ExecStage.num_cycles_with_instr_type_no_issue::ALU2 363 # Number of cycles no instruction of specific type issued
+system.cpu1.CUs1.ExecStage.num_cycles_with_instr_type_no_issue::ALU3 363 # Number of cycles no instruction of specific type issued
+system.cpu1.CUs1.ExecStage.num_cycles_with_instr_type_no_issue::GM 363 # Number of cycles no instruction of specific type issued
+system.cpu1.CUs1.ExecStage.num_cycles_with_instr_type_no_issue::LM 33 # Number of cycles no instruction of specific type issued
+system.cpu1.CUs1.ExecStage.spc::samples 4236 # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::mean 0.033286 # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::stdev 0.194558 # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::underflows 0 0.00% 0.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::0 4105 96.91% 96.91% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::1 123 2.90% 99.81% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::2 6 0.14% 99.95% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::3 2 0.05% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::4 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::5 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::6 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::overflows 0 0.00% 100.00% # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::min_value 0 # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::max_value 3 # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.spc::total 4236 # Execution units active per cycle (Exec unit=SIMD,MemPipe)
+system.cpu1.CUs1.ExecStage.num_transitions_active_to_idle 74 # number of CU transitions from active to idle
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::samples 74 # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::mean 51.891892 # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::stdev 210.095188 # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::underflows 0 0.00% 0.00% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::0-4 56 75.68% 75.68% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::5-9 7 9.46% 85.14% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::10-14 0 0.00% 85.14% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::15-19 2 2.70% 87.84% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::20-24 1 1.35% 89.19% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::25-29 1 1.35% 90.54% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::30-34 0 0.00% 90.54% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::35-39 0 0.00% 90.54% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::40-44 0 0.00% 90.54% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::45-49 0 0.00% 90.54% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::50-54 0 0.00% 90.54% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::55-59 0 0.00% 90.54% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::60-64 0 0.00% 90.54% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::65-69 0 0.00% 90.54% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::70-74 0 0.00% 90.54% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::75 0 0.00% 90.54% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::overflows 7 9.46% 100.00% # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::min_value 1 # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::max_value 1321 # duration of idle periods in cycles
+system.cpu1.CUs1.ExecStage.idle_duration_in_cycles::total 74 # duration of idle periods in cycles
+system.cpu1.CUs1.GlobalMemPipeline.load_vrf_bank_conflict_cycles 0 # total number of cycles GM data are delayed before updating the VRF
+system.cpu1.CUs1.LocalMemPipeline.load_vrf_bank_conflict_cycles 0 # total number of cycles LDS data are delayed before updating the VRF
+system.cpu1.CUs1.tlb_requests 769 # number of uncoalesced requests
+system.cpu1.CUs1.tlb_cycles -318199598000 # total number of cycles for all uncoalesced requests
+system.cpu1.CUs1.avg_translation_latency -413783612.483745 # Avg. translation latency for data translations
+system.cpu1.CUs1.TLB_hits_distribution::page_table 769 # TLB hits distribution (0 for page table, x for Lx-TLB
+system.cpu1.CUs1.TLB_hits_distribution::L1_TLB 0 # TLB hits distribution (0 for page table, x for Lx-TLB
+system.cpu1.CUs1.TLB_hits_distribution::L2_TLB 0 # TLB hits distribution (0 for page table, x for Lx-TLB
+system.cpu1.CUs1.TLB_hits_distribution::L3_TLB 0 # TLB hits distribution (0 for page table, x for Lx-TLB
+system.cpu1.CUs1.lds_bank_access_cnt 53 # Total number of LDS bank accesses
+system.cpu1.CUs1.lds_bank_conflicts::samples 6 # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::mean 7.833333 # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::stdev 6.080022 # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::underflows 0 0.00% 0.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::0-1 2 33.33% 33.33% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::2-3 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::4-5 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::6-7 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::8-9 0 0.00% 33.33% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::10-11 1 16.67% 50.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::12-13 3 50.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::14-15 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::16-17 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::18-19 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::20-21 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::22-23 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::24-25 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::26-27 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::28-29 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::30-31 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::32-33 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::34-35 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::36-37 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::38-39 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::40-41 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::42-43 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::44-45 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::46-47 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::48-49 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::50-51 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::52-53 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::54-55 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::56-57 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::58-59 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::60-61 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::62-63 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::64 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::overflows 0 0.00% 100.00% # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::min_value 0 # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::max_value 12 # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.lds_bank_conflicts::total 6 # Number of bank conflicts per LDS memory packet
+system.cpu1.CUs1.page_divergence_dist::samples 17 # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::mean 1 # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::stdev 0 # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::underflows 0 0.00% 0.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::1-4 17 100.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::5-8 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::9-12 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::13-16 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::17-20 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::21-24 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::25-28 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::29-32 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::33-36 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::37-40 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::41-44 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::45-48 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::49-52 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::53-56 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::57-60 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::61-64 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::overflows 0 0.00% 100.00% # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::min_value 1 # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::max_value 1 # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.page_divergence_dist::total 17 # pages touched per wf (over all mem. instr.)
+system.cpu1.CUs1.global_mem_instr_cnt 17 # dynamic global memory instructions count
+system.cpu1.CUs1.local_mem_instr_cnt 6 # dynamic local memory intruction count
+system.cpu1.CUs1.wg_blocked_due_lds_alloc 0 # Workgroup blocked due to LDS capacity
+system.cpu1.CUs1.num_instr_executed 141 # number of instructions executed
+system.cpu1.CUs1.inst_exec_rate::samples 141 # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::mean 86.326241 # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::stdev 246.713874 # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::underflows 0 0.00% 0.00% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::0-1 1 0.71% 0.71% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::2-3 12 8.51% 9.22% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::4-5 53 37.59% 46.81% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::6-7 29 20.57% 67.38% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::8-9 5 3.55% 70.92% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::10 1 0.71% 71.63% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::overflows 40 28.37% 100.00% # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::min_value 1 # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::max_value 1324 # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.inst_exec_rate::total 141 # Instruction Execution Rate: Number of executed vector instructions per cycle
+system.cpu1.CUs1.num_vec_ops_executed 6762 # number of vec ops executed (e.g. VSZ/inst)
+system.cpu1.CUs1.num_total_cycles 4236 # number of cycles the CU ran for
+system.cpu1.CUs1.vpc 1.596317 # Vector Operations per cycle (this CU only)
+system.cpu1.CUs1.ipc 0.033286 # Instructions per cycle (this CU only)
+system.cpu1.CUs1.warp_execution_dist::samples 141 # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::mean 47.957447 # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::stdev 23.818022 # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::underflows 0 0.00% 0.00% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::1-4 5 3.55% 3.55% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::5-8 0 0.00% 3.55% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::9-12 9 6.38% 9.93% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::13-16 27 19.15% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::17-20 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::21-24 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::25-28 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::29-32 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::33-36 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::37-40 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::41-44 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::45-48 0 0.00% 29.08% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::49-52 8 5.67% 34.75% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::53-56 0 0.00% 34.75% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::57-60 0 0.00% 34.75% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::61-64 92 65.25% 100.00% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::overflows 0 0.00% 100.00% # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::min_value 1 # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::max_value 64 # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.warp_execution_dist::total 141 # number of lanes active per instruction (oval all instructions)
+system.cpu1.CUs1.gmem_lanes_execution_dist::samples 18 # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::mean 37.722222 # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::stdev 27.174394 # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::underflows 0 0.00% 0.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::1-4 1 5.56% 5.56% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::5-8 0 0.00% 5.56% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::9-12 2 11.11% 16.67% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::13-16 6 33.33% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::17-20 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::21-24 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::25-28 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::29-32 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::33-36 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::37-40 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::41-44 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::45-48 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::49-52 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::53-56 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::57-60 0 0.00% 50.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::61-64 9 50.00% 100.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::overflows 0 0.00% 100.00% # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::min_value 1 # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::max_value 64 # number of active lanes per global memory instruction
+system.cpu1.CUs1.gmem_lanes_execution_dist::total 18 # number of active lanes per global memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::samples 6 # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::mean 19.333333 # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::stdev 22.384518 # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::underflows 0 0.00% 0.00% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::1-4 1 16.67% 16.67% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::5-8 0 0.00% 16.67% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::9-12 1 16.67% 33.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::13-16 3 50.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::17-20 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::21-24 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::25-28 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::29-32 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::33-36 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::37-40 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::41-44 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::45-48 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::49-52 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::53-56 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::57-60 0 0.00% 83.33% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::61-64 1 16.67% 100.00% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::overflows 0 0.00% 100.00% # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::min_value 1 # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::max_value 64 # number of active lanes per local memory instruction
+system.cpu1.CUs1.lmem_lanes_execution_dist::total 6 # number of active lanes per local memory instruction
+system.cpu1.CUs1.num_alu_insts_executed 118 # Number of dynamic non-GM memory insts executed
+system.cpu1.CUs1.times_wg_blocked_due_vgpr_alloc 0 # Number of times WGs are blocked due to VGPR allocation per SIMD
+system.cpu1.CUs1.num_CAS_ops 0 # number of compare and swap operations
+system.cpu1.CUs1.num_failed_CAS_ops 0 # number of compare and swap operations that failed
+system.cpu1.CUs1.num_completed_wfs 4 # number of completed wavefronts
+system.cpu2.num_kernel_launched 1 # number of kernel launched
+system.dir_cntrl0.L3CacheMemory.demand_hits 0 # Number of cache demand hits
+system.dir_cntrl0.L3CacheMemory.demand_misses 0 # Number of cache demand misses
+system.dir_cntrl0.L3CacheMemory.demand_accesses 0 # Number of cache demand accesses
+system.dir_cntrl0.L3CacheMemory.num_data_array_writes 1600 # number of data array writes
+system.dir_cntrl0.L3CacheMemory.num_tag_array_reads 1602 # number of tag array reads
+system.dir_cntrl0.L3CacheMemory.num_tag_array_writes 1572 # number of tag array writes
+system.dispatcher_coalescer.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.dispatcher_coalescer.clk_domain.clock 1000 # Clock period in ticks
+system.dispatcher_coalescer.uncoalesced_accesses 0 # Number of uncoalesced TLB accesses
+system.dispatcher_coalescer.coalesced_accesses 0 # Number of coalesced TLB accesses
+system.dispatcher_coalescer.queuing_cycles 0 # Number of cycles spent in queue
+system.dispatcher_coalescer.local_queuing_cycles 0 # Number of cycles spent in queue for all incoming reqs
+system.dispatcher_coalescer.local_latency nan # Avg. latency over all incoming pkts
+system.dispatcher_tlb.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.dispatcher_tlb.clk_domain.clock 1000 # Clock period in ticks
+system.dispatcher_tlb.local_TLB_accesses 0 # Number of TLB accesses
+system.dispatcher_tlb.local_TLB_hits 0 # Number of TLB hits
+system.dispatcher_tlb.local_TLB_misses 0 # Number of TLB misses
+system.dispatcher_tlb.local_TLB_miss_rate nan # TLB miss rate
+system.dispatcher_tlb.global_TLB_accesses 0 # Number of TLB accesses
+system.dispatcher_tlb.global_TLB_hits 0 # Number of TLB hits
+system.dispatcher_tlb.global_TLB_misses 0 # Number of TLB misses
+system.dispatcher_tlb.global_TLB_miss_rate nan # TLB miss rate
+system.dispatcher_tlb.access_cycles 0 # Cycles spent accessing this TLB level
+system.dispatcher_tlb.page_table_cycles 0 # Cycles spent accessing the page table
+system.dispatcher_tlb.unique_pages 0 # Number of unique pages touched
+system.dispatcher_tlb.local_cycles 0 # Number of cycles spent in queue for all incoming reqs
+system.dispatcher_tlb.local_latency nan # Avg. latency over incoming coalesced reqs
+system.dispatcher_tlb.avg_reuse_distance 0 # avg. reuse distance over all pages (in ticks)
+system.l1_coalescer0.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.l1_coalescer0.clk_domain.clock 1000 # Clock period in ticks
+system.l1_coalescer0.uncoalesced_accesses 778 # Number of uncoalesced TLB accesses
+system.l1_coalescer0.coalesced_accesses 0 # Number of coalesced TLB accesses
+system.l1_coalescer0.queuing_cycles 0 # Number of cycles spent in queue
+system.l1_coalescer0.local_queuing_cycles 0 # Number of cycles spent in queue for all incoming reqs
+system.l1_coalescer0.local_latency 0 # Avg. latency over all incoming pkts
+system.l1_coalescer1.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.l1_coalescer1.clk_domain.clock 1000 # Clock period in ticks
+system.l1_coalescer1.uncoalesced_accesses 769 # Number of uncoalesced TLB accesses
+system.l1_coalescer1.coalesced_accesses 0 # Number of coalesced TLB accesses
+system.l1_coalescer1.queuing_cycles 0 # Number of cycles spent in queue
+system.l1_coalescer1.local_queuing_cycles 0 # Number of cycles spent in queue for all incoming reqs
+system.l1_coalescer1.local_latency 0 # Avg. latency over all incoming pkts
+system.l1_tlb0.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.l1_tlb0.clk_domain.clock 1000 # Clock period in ticks
+system.l1_tlb0.local_TLB_accesses 778 # Number of TLB accesses
+system.l1_tlb0.local_TLB_hits 774 # Number of TLB hits
+system.l1_tlb0.local_TLB_misses 4 # Number of TLB misses
+system.l1_tlb0.local_TLB_miss_rate 0.514139 # TLB miss rate
+system.l1_tlb0.global_TLB_accesses 778 # Number of TLB accesses
+system.l1_tlb0.global_TLB_hits 774 # Number of TLB hits
+system.l1_tlb0.global_TLB_misses 4 # Number of TLB misses
+system.l1_tlb0.global_TLB_miss_rate 0.514139 # TLB miss rate
+system.l1_tlb0.access_cycles 0 # Cycles spent accessing this TLB level
+system.l1_tlb0.page_table_cycles 0 # Cycles spent accessing the page table
+system.l1_tlb0.unique_pages 4 # Number of unique pages touched
+system.l1_tlb0.local_cycles 0 # Number of cycles spent in queue for all incoming reqs
+system.l1_tlb0.local_latency 0 # Avg. latency over incoming coalesced reqs
+system.l1_tlb0.avg_reuse_distance 0 # avg. reuse distance over all pages (in ticks)
+system.l1_tlb1.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.l1_tlb1.clk_domain.clock 1000 # Clock period in ticks
+system.l1_tlb1.local_TLB_accesses 769 # Number of TLB accesses
+system.l1_tlb1.local_TLB_hits 766 # Number of TLB hits
+system.l1_tlb1.local_TLB_misses 3 # Number of TLB misses
+system.l1_tlb1.local_TLB_miss_rate 0.390117 # TLB miss rate
+system.l1_tlb1.global_TLB_accesses 769 # Number of TLB accesses
+system.l1_tlb1.global_TLB_hits 766 # Number of TLB hits
+system.l1_tlb1.global_TLB_misses 3 # Number of TLB misses
+system.l1_tlb1.global_TLB_miss_rate 0.390117 # TLB miss rate
+system.l1_tlb1.access_cycles 0 # Cycles spent accessing this TLB level
+system.l1_tlb1.page_table_cycles 0 # Cycles spent accessing the page table
+system.l1_tlb1.unique_pages 3 # Number of unique pages touched
+system.l1_tlb1.local_cycles 0 # Number of cycles spent in queue for all incoming reqs
+system.l1_tlb1.local_latency 0 # Avg. latency over incoming coalesced reqs
+system.l1_tlb1.avg_reuse_distance 0 # avg. reuse distance over all pages (in ticks)
+system.l2_coalescer.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.l2_coalescer.clk_domain.clock 1000 # Clock period in ticks
+system.l2_coalescer.uncoalesced_accesses 8 # Number of uncoalesced TLB accesses
+system.l2_coalescer.coalesced_accesses 1 # Number of coalesced TLB accesses
+system.l2_coalescer.queuing_cycles 8000 # Number of cycles spent in queue
+system.l2_coalescer.local_queuing_cycles 1000 # Number of cycles spent in queue for all incoming reqs
+system.l2_coalescer.local_latency 125 # Avg. latency over all incoming pkts
+system.l2_tlb.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.l2_tlb.clk_domain.clock 1000 # Clock period in ticks
+system.l2_tlb.local_TLB_accesses 8 # Number of TLB accesses
+system.l2_tlb.local_TLB_hits 3 # Number of TLB hits
+system.l2_tlb.local_TLB_misses 5 # Number of TLB misses
+system.l2_tlb.local_TLB_miss_rate 62.500000 # TLB miss rate
+system.l2_tlb.global_TLB_accesses 15 # Number of TLB accesses
+system.l2_tlb.global_TLB_hits 3 # Number of TLB hits
+system.l2_tlb.global_TLB_misses 12 # Number of TLB misses
+system.l2_tlb.global_TLB_miss_rate 80 # TLB miss rate
+system.l2_tlb.access_cycles 552008 # Cycles spent accessing this TLB level
+system.l2_tlb.page_table_cycles 0 # Cycles spent accessing the page table
+system.l2_tlb.unique_pages 5 # Number of unique pages touched
+system.l2_tlb.local_cycles 69001 # Number of cycles spent in queue for all incoming reqs
+system.l2_tlb.local_latency 8625.125000 # Avg. latency over incoming coalesced reqs
+system.l2_tlb.avg_reuse_distance 0 # avg. reuse distance over all pages (in ticks)
+system.l3_coalescer.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.l3_coalescer.clk_domain.clock 1000 # Clock period in ticks
+system.l3_coalescer.uncoalesced_accesses 5 # Number of uncoalesced TLB accesses
+system.l3_coalescer.coalesced_accesses 1 # Number of coalesced TLB accesses
+system.l3_coalescer.queuing_cycles 8000 # Number of cycles spent in queue
+system.l3_coalescer.local_queuing_cycles 1000 # Number of cycles spent in queue for all incoming reqs
+system.l3_coalescer.local_latency 200 # Avg. latency over all incoming pkts
+system.l3_tlb.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.l3_tlb.clk_domain.clock 1000 # Clock period in ticks
+system.l3_tlb.local_TLB_accesses 5 # Number of TLB accesses
+system.l3_tlb.local_TLB_hits 0 # Number of TLB hits
+system.l3_tlb.local_TLB_misses 5 # Number of TLB misses
+system.l3_tlb.local_TLB_miss_rate 100 # TLB miss rate
+system.l3_tlb.global_TLB_accesses 12 # Number of TLB accesses
+system.l3_tlb.global_TLB_hits 0 # Number of TLB hits
+system.l3_tlb.global_TLB_misses 12 # Number of TLB misses
+system.l3_tlb.global_TLB_miss_rate 100 # TLB miss rate
+system.l3_tlb.access_cycles 1200000 # Cycles spent accessing this TLB level
+system.l3_tlb.page_table_cycles 6000000 # Cycles spent accessing the page table
+system.l3_tlb.unique_pages 5 # Number of unique pages touched
+system.l3_tlb.local_cycles 150000 # Number of cycles spent in queue for all incoming reqs
+system.l3_tlb.local_latency 30000 # Avg. latency over incoming coalesced reqs
+system.l3_tlb.avg_reuse_distance 0 # avg. reuse distance over all pages (in ticks)
+system.piobus.trans_dist::WriteReq 94 # Transaction distribution
+system.piobus.trans_dist::WriteResp 94 # Transaction distribution
+system.piobus.pkt_count_system.cp_cntrl0.sequencer.mem-master-port::system.cpu2.pio 188 # Packet count per connected master and slave (bytes)
+system.piobus.pkt_count::total 188 # Packet count per connected master and slave (bytes)
+system.piobus.pkt_size_system.cp_cntrl0.sequencer.mem-master-port::system.cpu2.pio 748 # Cumulative packet size per connected master and slave (bytes)
+system.piobus.pkt_size::total 748 # Cumulative packet size per connected master and slave (bytes)
+system.piobus.reqLayer0.occupancy 188000 # Layer occupancy (ticks)
+system.piobus.reqLayer0.utilization 0.0 # Layer utilization (%)
+system.piobus.respLayer0.occupancy 94000 # Layer occupancy (ticks)
+system.piobus.respLayer0.utilization 0.0 # Layer utilization (%)
+system.rb_cntrl0.cacheMemory.demand_hits 0 # Number of cache demand hits
+system.rb_cntrl0.cacheMemory.demand_misses 0 # Number of cache demand misses
+system.rb_cntrl0.cacheMemory.demand_accesses 0 # Number of cache demand accesses
+system.rb_cntrl0.cacheMemory.num_tag_array_reads 1553 # number of tag array reads
+system.rb_cntrl0.cacheMemory.num_tag_array_writes 3123 # number of tag array writes
+system.reg_cntrl0.cacheMemory.demand_hits 0 # Number of cache demand hits
+system.reg_cntrl0.cacheMemory.demand_misses 0 # Number of cache demand misses
+system.reg_cntrl0.cacheMemory.demand_accesses 0 # Number of cache demand accesses
+system.reg_cntrl0.cacheMemory.num_tag_array_reads 279 # number of tag array reads
+system.reg_cntrl0.cacheMemory.num_tag_array_writes 279 # number of tag array writes
+system.ruby.network.ext_links0.int_node.percent_links_utilized 0.122493
+system.ruby.network.ext_links0.int_node.msg_count.Data::0 16
+system.ruby.network.ext_links0.int_node.msg_count.Request_Control::0 1558
+system.ruby.network.ext_links0.int_node.msg_count.Request_Control::5 279
+system.ruby.network.ext_links0.int_node.msg_count.Request_Control::7 279
+system.ruby.network.ext_links0.int_node.msg_count.Request_Control::8 8
+system.ruby.network.ext_links0.int_node.msg_count.Response_Data::2 1577
+system.ruby.network.ext_links0.int_node.msg_count.Response_Control::2 303
+system.ruby.network.ext_links0.int_node.msg_count.Response_Control::4 34
+system.ruby.network.ext_links0.int_node.msg_count.Writeback_Control::2 24
+system.ruby.network.ext_links0.int_node.msg_count.Unblock_Control::4 1556
+system.ruby.network.ext_links0.int_node.msg_bytes.Data::0 1152
+system.ruby.network.ext_links0.int_node.msg_bytes.Request_Control::0 12464
+system.ruby.network.ext_links0.int_node.msg_bytes.Request_Control::5 2232
+system.ruby.network.ext_links0.int_node.msg_bytes.Request_Control::7 2232
+system.ruby.network.ext_links0.int_node.msg_bytes.Request_Control::8 64
+system.ruby.network.ext_links0.int_node.msg_bytes.Response_Data::2 113544
+system.ruby.network.ext_links0.int_node.msg_bytes.Response_Control::2 2424
+system.ruby.network.ext_links0.int_node.msg_bytes.Response_Control::4 272
+system.ruby.network.ext_links0.int_node.msg_bytes.Writeback_Control::2 192
+system.ruby.network.ext_links0.int_node.msg_bytes.Unblock_Control::4 12448
+system.ruby.network.ext_links2.int_node.percent_links_utilized 0.185852
+system.ruby.network.ext_links2.int_node.msg_count.Control::0 23
+system.ruby.network.ext_links2.int_node.msg_count.Request_Control::0 3098
+system.ruby.network.ext_links2.int_node.msg_count.Request_Control::7 274
+system.ruby.network.ext_links2.int_node.msg_count.Request_Control::8 4
+system.ruby.network.ext_links2.int_node.msg_count.Response_Data::2 1568
+system.ruby.network.ext_links2.int_node.msg_count.Response_Control::2 281
+system.ruby.network.ext_links2.int_node.msg_count.Response_Control::4 23
+system.ruby.network.ext_links2.int_node.msg_count.Unblock_Control::4 3098
+system.ruby.network.ext_links2.int_node.msg_bytes.Control::0 184
+system.ruby.network.ext_links2.int_node.msg_bytes.Request_Control::0 24784
+system.ruby.network.ext_links2.int_node.msg_bytes.Request_Control::7 2192
+system.ruby.network.ext_links2.int_node.msg_bytes.Request_Control::8 32
+system.ruby.network.ext_links2.int_node.msg_bytes.Response_Data::2 112896
+system.ruby.network.ext_links2.int_node.msg_bytes.Response_Control::2 2248
+system.ruby.network.ext_links2.int_node.msg_bytes.Response_Control::4 184
+system.ruby.network.ext_links2.int_node.msg_bytes.Unblock_Control::4 24784
+system.tcp_cntrl0.L1cache.demand_hits 0 # Number of cache demand hits
+system.tcp_cntrl0.L1cache.demand_misses 0 # Number of cache demand misses
+system.tcp_cntrl0.L1cache.demand_accesses 0 # Number of cache demand accesses
+system.tcp_cntrl0.L1cache.num_data_array_reads 6 # number of data array reads
+system.tcp_cntrl0.L1cache.num_data_array_writes 11 # number of data array writes
+system.tcp_cntrl0.L1cache.num_tag_array_reads 1297 # number of tag array reads
+system.tcp_cntrl0.L1cache.num_tag_array_writes 11 # number of tag array writes
+system.tcp_cntrl0.L1cache.num_tag_array_stalls 1271 # number of stalls caused by tag array
+system.tcp_cntrl0.L1cache.num_data_array_stalls 2 # number of stalls caused by data array
+system.tcp_cntrl0.coalescer.gpu_tcp_ld_hits 0 # loads that hit in the TCP
+system.tcp_cntrl0.coalescer.gpu_tcp_ld_transfers 0 # TCP to TCP load transfers
+system.tcp_cntrl0.coalescer.gpu_tcc_ld_hits 0 # loads that hit in the TCC
+system.tcp_cntrl0.coalescer.gpu_ld_misses 5 # loads that miss in the GPU
+system.tcp_cntrl0.coalescer.gpu_tcp_st_hits 0 # stores that hit in the TCP
+system.tcp_cntrl0.coalescer.gpu_tcp_st_transfers 0 # TCP to TCP store transfers
+system.tcp_cntrl0.coalescer.gpu_tcc_st_hits 0 # stores that hit in the TCC
+system.tcp_cntrl0.coalescer.gpu_st_misses 9 # stores that miss in the GPU
+system.tcp_cntrl0.coalescer.cp_tcp_ld_hits 0 # loads that hit in the TCP
+system.tcp_cntrl0.coalescer.cp_tcp_ld_transfers 0 # TCP to TCP load transfers
+system.tcp_cntrl0.coalescer.cp_tcc_ld_hits 0 # loads that hit in the TCC
+system.tcp_cntrl0.coalescer.cp_ld_misses 0 # loads that miss in the GPU
+system.tcp_cntrl0.coalescer.cp_tcp_st_hits 0 # stores that hit in the TCP
+system.tcp_cntrl0.coalescer.cp_tcp_st_transfers 0 # TCP to TCP store transfers
+system.tcp_cntrl0.coalescer.cp_tcc_st_hits 0 # stores that hit in the TCC
+system.tcp_cntrl0.coalescer.cp_st_misses 0 # stores that miss in the GPU
+system.ruby.network.ext_links4.int_node.percent_links_utilized 0.003510
+system.ruby.network.ext_links4.int_node.msg_count.Control::0 11
+system.ruby.network.ext_links4.int_node.msg_count.Data::0 34
+system.ruby.network.ext_links4.int_node.msg_count.Data::1 18
+system.ruby.network.ext_links4.int_node.msg_count.Request_Control::0 16
+system.ruby.network.ext_links4.int_node.msg_count.Request_Control::1 9
+system.ruby.network.ext_links4.int_node.msg_count.Request_Control::7 5
+system.ruby.network.ext_links4.int_node.msg_count.Request_Control::8 4
+system.ruby.network.ext_links4.int_node.msg_count.Response_Data::2 9
+system.ruby.network.ext_links4.int_node.msg_count.Response_Data::3 11
+system.ruby.network.ext_links4.int_node.msg_count.Response_Control::2 22
+system.ruby.network.ext_links4.int_node.msg_count.Response_Control::4 11
+system.ruby.network.ext_links4.int_node.msg_count.Writeback_Control::2 16
+system.ruby.network.ext_links4.int_node.msg_count.Writeback_Control::3 16
+system.ruby.network.ext_links4.int_node.msg_count.Unblock_Control::4 32
+system.ruby.network.ext_links4.int_node.msg_bytes.Control::0 88
+system.ruby.network.ext_links4.int_node.msg_bytes.Data::0 2448
+system.ruby.network.ext_links4.int_node.msg_bytes.Data::1 1296
+system.ruby.network.ext_links4.int_node.msg_bytes.Request_Control::0 128
+system.ruby.network.ext_links4.int_node.msg_bytes.Request_Control::1 72
+system.ruby.network.ext_links4.int_node.msg_bytes.Request_Control::7 40
+system.ruby.network.ext_links4.int_node.msg_bytes.Request_Control::8 32
+system.ruby.network.ext_links4.int_node.msg_bytes.Response_Data::2 648
+system.ruby.network.ext_links4.int_node.msg_bytes.Response_Data::3 792
+system.ruby.network.ext_links4.int_node.msg_bytes.Response_Control::2 176
+system.ruby.network.ext_links4.int_node.msg_bytes.Response_Control::4 88
+system.ruby.network.ext_links4.int_node.msg_bytes.Writeback_Control::2 128
+system.ruby.network.ext_links4.int_node.msg_bytes.Writeback_Control::3 128
+system.ruby.network.ext_links4.int_node.msg_bytes.Unblock_Control::4 256
+system.tcp_cntrl1.L1cache.demand_hits 0 # Number of cache demand hits
+system.tcp_cntrl1.L1cache.demand_misses 0 # Number of cache demand misses
+system.tcp_cntrl1.L1cache.demand_accesses 0 # Number of cache demand accesses
+system.tcp_cntrl1.L1cache.num_data_array_reads 6 # number of data array reads
+system.tcp_cntrl1.L1cache.num_data_array_writes 11 # number of data array writes
+system.tcp_cntrl1.L1cache.num_tag_array_reads 1297 # number of tag array reads
+system.tcp_cntrl1.L1cache.num_tag_array_writes 11 # number of tag array writes
+system.tcp_cntrl1.L1cache.num_tag_array_stalls 1271 # number of stalls caused by tag array
+system.tcp_cntrl1.L1cache.num_data_array_stalls 2 # number of stalls caused by data array
+system.tcp_cntrl1.coalescer.gpu_tcp_ld_hits 0 # loads that hit in the TCP
+system.tcp_cntrl1.coalescer.gpu_tcp_ld_transfers 0 # TCP to TCP load transfers
+system.tcp_cntrl1.coalescer.gpu_tcc_ld_hits 0 # loads that hit in the TCC
+system.tcp_cntrl1.coalescer.gpu_ld_misses 5 # loads that miss in the GPU
+system.tcp_cntrl1.coalescer.gpu_tcp_st_hits 0 # stores that hit in the TCP
+system.tcp_cntrl1.coalescer.gpu_tcp_st_transfers 0 # TCP to TCP store transfers
+system.tcp_cntrl1.coalescer.gpu_tcc_st_hits 0 # stores that hit in the TCC
+system.tcp_cntrl1.coalescer.gpu_st_misses 9 # stores that miss in the GPU
+system.tcp_cntrl1.coalescer.cp_tcp_ld_hits 0 # loads that hit in the TCP
+system.tcp_cntrl1.coalescer.cp_tcp_ld_transfers 0 # TCP to TCP load transfers
+system.tcp_cntrl1.coalescer.cp_tcc_ld_hits 0 # loads that hit in the TCC
+system.tcp_cntrl1.coalescer.cp_ld_misses 0 # loads that miss in the GPU
+system.tcp_cntrl1.coalescer.cp_tcp_st_hits 0 # stores that hit in the TCP
+system.tcp_cntrl1.coalescer.cp_tcp_st_transfers 0 # TCP to TCP store transfers
+system.tcp_cntrl1.coalescer.cp_tcc_st_hits 0 # stores that hit in the TCC
+system.tcp_cntrl1.coalescer.cp_st_misses 0 # stores that miss in the GPU
+system.sqc_cntrl0.L1cache.demand_hits 0 # Number of cache demand hits
+system.sqc_cntrl0.L1cache.demand_misses 0 # Number of cache demand misses
+system.sqc_cntrl0.L1cache.demand_accesses 0 # Number of cache demand accesses
+system.sqc_cntrl0.L1cache.num_data_array_reads 86 # number of data array reads
+system.sqc_cntrl0.L1cache.num_tag_array_reads 91 # number of tag array reads
+system.sqc_cntrl0.L1cache.num_tag_array_writes 10 # number of tag array writes
+system.sqc_cntrl0.sequencer.load_waiting_on_load 98 # Number of times a load aliased with a pending load
+system.tcc_cntrl0.L2cache.demand_hits 0 # Number of cache demand hits
+system.tcc_cntrl0.L2cache.demand_misses 0 # Number of cache demand misses
+system.tcc_cntrl0.L2cache.demand_accesses 0 # Number of cache demand accesses
+system.tcc_cntrl0.L2cache.num_data_array_writes 9 # number of data array writes
+system.tcc_cntrl0.L2cache.num_tag_array_reads 45 # number of tag array reads
+system.tcc_cntrl0.L2cache.num_tag_array_writes 21 # number of tag array writes
+system.tcc_rb_cntrl0.cacheMemory.demand_hits 0 # Number of cache demand hits
+system.tcc_rb_cntrl0.cacheMemory.demand_misses 0 # Number of cache demand misses
+system.tcc_rb_cntrl0.cacheMemory.demand_accesses 0 # Number of cache demand accesses
+system.tcc_rb_cntrl0.cacheMemory.num_tag_array_reads 29 # number of tag array reads
+system.tcc_rb_cntrl0.cacheMemory.num_tag_array_writes 89 # number of tag array writes
+system.tcc_rb_cntrl0.cacheMemory.num_tag_array_stalls 20 # number of stalls caused by tag array
+system.ruby.network.msg_count.Control 34
+system.ruby.network.msg_count.Data 68
+system.ruby.network.msg_count.Request_Control 5534
+system.ruby.network.msg_count.Response_Data 3165
+system.ruby.network.msg_count.Response_Control 674
+system.ruby.network.msg_count.Writeback_Control 56
+system.ruby.network.msg_count.Unblock_Control 4686
+system.ruby.network.msg_byte.Control 272
+system.ruby.network.msg_byte.Data 4896
+system.ruby.network.msg_byte.Request_Control 44272
+system.ruby.network.msg_byte.Response_Data 227880
+system.ruby.network.msg_byte.Response_Control 5392
+system.ruby.network.msg_byte.Writeback_Control 448
+system.ruby.network.msg_byte.Unblock_Control 37488
+system.sqc_coalescer.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.sqc_coalescer.clk_domain.clock 1000 # Clock period in ticks
+system.sqc_coalescer.uncoalesced_accesses 86 # Number of uncoalesced TLB accesses
+system.sqc_coalescer.coalesced_accesses 66 # Number of coalesced TLB accesses
+system.sqc_coalescer.queuing_cycles 288000 # Number of cycles spent in queue
+system.sqc_coalescer.local_queuing_cycles 288000 # Number of cycles spent in queue for all incoming reqs
+system.sqc_coalescer.local_latency 3348.837209 # Avg. latency over all incoming pkts
+system.sqc_tlb.clk_domain.voltage_domain.voltage 1 # Voltage in Volts
+system.sqc_tlb.clk_domain.clock 1000 # Clock period in ticks
+system.sqc_tlb.local_TLB_accesses 66 # Number of TLB accesses
+system.sqc_tlb.local_TLB_hits 65 # Number of TLB hits
+system.sqc_tlb.local_TLB_misses 1 # Number of TLB misses
+system.sqc_tlb.local_TLB_miss_rate 1.515152 # TLB miss rate
+system.sqc_tlb.global_TLB_accesses 86 # Number of TLB accesses
+system.sqc_tlb.global_TLB_hits 78 # Number of TLB hits
+system.sqc_tlb.global_TLB_misses 8 # Number of TLB misses
+system.sqc_tlb.global_TLB_miss_rate 9.302326 # TLB miss rate
+system.sqc_tlb.access_cycles 86008 # Cycles spent accessing this TLB level
+system.sqc_tlb.page_table_cycles 0 # Cycles spent accessing the page table
+system.sqc_tlb.unique_pages 1 # Number of unique pages touched
+system.sqc_tlb.local_cycles 66001 # Number of cycles spent in queue for all incoming reqs
+system.sqc_tlb.local_latency 1000.015152 # Avg. latency over incoming coalesced reqs
+system.sqc_tlb.avg_reuse_distance 0 # avg. reuse distance over all pages (in ticks)
+system.ruby.network.ext_links0.int_node.throttle0.link_utilization 0.091873
+system.ruby.network.ext_links0.int_node.throttle0.msg_count.Data::0 16
+system.ruby.network.ext_links0.int_node.throttle0.msg_count.Request_Control::0 1279
+system.ruby.network.ext_links0.int_node.throttle0.msg_count.Request_Control::5 279
+system.ruby.network.ext_links0.int_node.throttle0.msg_count.Response_Data::2 19
+system.ruby.network.ext_links0.int_node.throttle0.msg_count.Response_Control::2 17
+system.ruby.network.ext_links0.int_node.throttle0.msg_count.Unblock_Control::4 1556
+system.ruby.network.ext_links0.int_node.throttle0.msg_bytes.Data::0 1152
+system.ruby.network.ext_links0.int_node.throttle0.msg_bytes.Request_Control::0 10232
+system.ruby.network.ext_links0.int_node.throttle0.msg_bytes.Request_Control::5 2232
+system.ruby.network.ext_links0.int_node.throttle0.msg_bytes.Response_Data::2 1368
+system.ruby.network.ext_links0.int_node.throttle0.msg_bytes.Response_Control::2 136
+system.ruby.network.ext_links0.int_node.throttle0.msg_bytes.Unblock_Control::4 12448
+system.ruby.network.ext_links0.int_node.throttle1.link_utilization 0.015277
+system.ruby.network.ext_links0.int_node.throttle1.msg_count.Request_Control::0 279
+system.ruby.network.ext_links0.int_node.throttle1.msg_count.Response_Control::2 286
+system.ruby.network.ext_links0.int_node.throttle1.msg_count.Writeback_Control::2 8
+system.ruby.network.ext_links0.int_node.throttle1.msg_bytes.Request_Control::0 2232
+system.ruby.network.ext_links0.int_node.throttle1.msg_bytes.Response_Control::2 2288
+system.ruby.network.ext_links0.int_node.throttle1.msg_bytes.Writeback_Control::2 64
+system.ruby.network.ext_links0.int_node.throttle2.link_utilization 0.379702
+system.ruby.network.ext_links0.int_node.throttle2.msg_count.Request_Control::7 274
+system.ruby.network.ext_links0.int_node.throttle2.msg_count.Request_Control::8 4
+system.ruby.network.ext_links0.int_node.throttle2.msg_count.Response_Data::2 1549
+system.ruby.network.ext_links0.int_node.throttle2.msg_count.Response_Control::4 23
+system.ruby.network.ext_links0.int_node.throttle2.msg_bytes.Request_Control::7 2192
+system.ruby.network.ext_links0.int_node.throttle2.msg_bytes.Request_Control::8 32
+system.ruby.network.ext_links0.int_node.throttle2.msg_bytes.Response_Data::2 111528
+system.ruby.network.ext_links0.int_node.throttle2.msg_bytes.Response_Control::4 184
+system.ruby.network.ext_links0.int_node.throttle3.link_utilization 0.003119
+system.ruby.network.ext_links0.int_node.throttle3.msg_count.Request_Control::7 5
+system.ruby.network.ext_links0.int_node.throttle3.msg_count.Request_Control::8 4
+system.ruby.network.ext_links0.int_node.throttle3.msg_count.Response_Data::2 9
+system.ruby.network.ext_links0.int_node.throttle3.msg_count.Response_Control::4 11
+system.ruby.network.ext_links0.int_node.throttle3.msg_count.Writeback_Control::2 16
+system.ruby.network.ext_links0.int_node.throttle3.msg_bytes.Request_Control::7 40
+system.ruby.network.ext_links0.int_node.throttle3.msg_bytes.Request_Control::8 32
+system.ruby.network.ext_links0.int_node.throttle3.msg_bytes.Response_Data::2 648
+system.ruby.network.ext_links0.int_node.throttle3.msg_bytes.Response_Control::4 88
+system.ruby.network.ext_links0.int_node.throttle3.msg_bytes.Writeback_Control::2 128
+system.ruby.network.ext_links2.int_node.throttle0.link_utilization 0.372290
+system.ruby.network.ext_links2.int_node.throttle0.msg_count.Control::0 23
+system.ruby.network.ext_links2.int_node.throttle0.msg_count.Response_Data::2 1549
+system.ruby.network.ext_links2.int_node.throttle0.msg_bytes.Control::0 184
+system.ruby.network.ext_links2.int_node.throttle0.msg_bytes.Response_Data::2 111528
+system.ruby.network.ext_links2.int_node.throttle1.link_utilization 0.090620
+system.ruby.network.ext_links2.int_node.throttle1.msg_count.Request_Control::0 1549
+system.ruby.network.ext_links2.int_node.throttle1.msg_count.Request_Control::7 274
+system.ruby.network.ext_links2.int_node.throttle1.msg_count.Request_Control::8 4
+system.ruby.network.ext_links2.int_node.throttle1.msg_count.Response_Control::4 23
+system.ruby.network.ext_links2.int_node.throttle1.msg_count.Unblock_Control::4 1549
+system.ruby.network.ext_links2.int_node.throttle1.msg_bytes.Request_Control::0 12392
+system.ruby.network.ext_links2.int_node.throttle1.msg_bytes.Request_Control::7 2192
+system.ruby.network.ext_links2.int_node.throttle1.msg_bytes.Request_Control::8 32
+system.ruby.network.ext_links2.int_node.throttle1.msg_bytes.Response_Control::4 184
+system.ruby.network.ext_links2.int_node.throttle1.msg_bytes.Unblock_Control::4 12392
+system.ruby.network.ext_links2.int_node.throttle2.link_utilization 0.094646
+system.ruby.network.ext_links2.int_node.throttle2.msg_count.Request_Control::0 1549
+system.ruby.network.ext_links2.int_node.throttle2.msg_count.Response_Data::2 19
+system.ruby.network.ext_links2.int_node.throttle2.msg_count.Response_Control::2 281
+system.ruby.network.ext_links2.int_node.throttle2.msg_count.Unblock_Control::4 1549
+system.ruby.network.ext_links2.int_node.throttle2.msg_bytes.Request_Control::0 12392
+system.ruby.network.ext_links2.int_node.throttle2.msg_bytes.Response_Data::2 1368
+system.ruby.network.ext_links2.int_node.throttle2.msg_bytes.Response_Control::2 2248
+system.ruby.network.ext_links2.int_node.throttle2.msg_bytes.Unblock_Control::4 12392
+system.ruby.network.ext_links4.int_node.throttle0.link_utilization 0.000933
+system.ruby.network.ext_links4.int_node.throttle0.msg_count.Response_Data::3 3
+system.ruby.network.ext_links4.int_node.throttle0.msg_count.Writeback_Control::3 8
+system.ruby.network.ext_links4.int_node.throttle0.msg_bytes.Response_Data::3 216
+system.ruby.network.ext_links4.int_node.throttle0.msg_bytes.Writeback_Control::3 64
+system.ruby.network.ext_links4.int_node.throttle1.link_utilization 0.000933
+system.ruby.network.ext_links4.int_node.throttle1.msg_count.Response_Data::3 3
+system.ruby.network.ext_links4.int_node.throttle1.msg_count.Writeback_Control::3 8
+system.ruby.network.ext_links4.int_node.throttle1.msg_bytes.Response_Data::3 216
+system.ruby.network.ext_links4.int_node.throttle1.msg_bytes.Writeback_Control::3 64
+system.ruby.network.ext_links4.int_node.throttle2.link_utilization 0.007438
+system.ruby.network.ext_links4.int_node.throttle2.msg_count.Control::0 11
+system.ruby.network.ext_links4.int_node.throttle2.msg_count.Data::1 18
+system.ruby.network.ext_links4.int_node.throttle2.msg_count.Request_Control::1 9
+system.ruby.network.ext_links4.int_node.throttle2.msg_count.Response_Data::2 9
+system.ruby.network.ext_links4.int_node.throttle2.msg_count.Writeback_Control::2 16
+system.ruby.network.ext_links4.int_node.throttle2.msg_bytes.Control::0 88
+system.ruby.network.ext_links4.int_node.throttle2.msg_bytes.Data::1 1296
+system.ruby.network.ext_links4.int_node.throttle2.msg_bytes.Request_Control::1 72
+system.ruby.network.ext_links4.int_node.throttle2.msg_bytes.Response_Data::2 648
+system.ruby.network.ext_links4.int_node.throttle2.msg_bytes.Writeback_Control::2 128
+system.ruby.network.ext_links4.int_node.throttle3.link_utilization 0.001200
+system.ruby.network.ext_links4.int_node.throttle3.msg_count.Response_Data::3 5
+system.ruby.network.ext_links4.int_node.throttle3.msg_bytes.Response_Data::3 360
+system.ruby.network.ext_links4.int_node.throttle4.link_utilization 0.005705
+system.ruby.network.ext_links4.int_node.throttle4.msg_count.Data::0 18
+system.ruby.network.ext_links4.int_node.throttle4.msg_count.Request_Control::0 7
+system.ruby.network.ext_links4.int_node.throttle4.msg_count.Request_Control::7 5
+system.ruby.network.ext_links4.int_node.throttle4.msg_count.Request_Control::8 4
+system.ruby.network.ext_links4.int_node.throttle4.msg_count.Response_Control::4 11
+system.ruby.network.ext_links4.int_node.throttle4.msg_count.Unblock_Control::4 25
+system.ruby.network.ext_links4.int_node.throttle4.msg_bytes.Data::0 1296
+system.ruby.network.ext_links4.int_node.throttle4.msg_bytes.Request_Control::0 56
+system.ruby.network.ext_links4.int_node.throttle4.msg_bytes.Request_Control::7 40
+system.ruby.network.ext_links4.int_node.throttle4.msg_bytes.Request_Control::8 32
+system.ruby.network.ext_links4.int_node.throttle4.msg_bytes.Response_Control::4 88
+system.ruby.network.ext_links4.int_node.throttle4.msg_bytes.Unblock_Control::4 200
+system.ruby.network.ext_links4.int_node.throttle5.link_utilization 0.004852
+system.ruby.network.ext_links4.int_node.throttle5.msg_count.Data::0 16
+system.ruby.network.ext_links4.int_node.throttle5.msg_count.Request_Control::0 9
+system.ruby.network.ext_links4.int_node.throttle5.msg_count.Response_Control::2 22
+system.ruby.network.ext_links4.int_node.throttle5.msg_count.Unblock_Control::4 7
+system.ruby.network.ext_links4.int_node.throttle5.msg_bytes.Data::0 1152
+system.ruby.network.ext_links4.int_node.throttle5.msg_bytes.Request_Control::0 72
+system.ruby.network.ext_links4.int_node.throttle5.msg_bytes.Response_Control::2 176
+system.ruby.network.ext_links4.int_node.throttle5.msg_bytes.Unblock_Control::4 56
+system.ruby.CorePair_Controller.C0_Load_L1miss 193 0.00% 0.00%
+system.ruby.CorePair_Controller.C0_Load_L1hit 16142 0.00% 0.00%
+system.ruby.CorePair_Controller.Ifetch0_L1hit 85994 0.00% 0.00%
+system.ruby.CorePair_Controller.Ifetch0_L1miss 1101 0.00% 0.00%
+system.ruby.CorePair_Controller.C0_Store_L1miss 327 0.00% 0.00%
+system.ruby.CorePair_Controller.C0_Store_L1hit 10446 0.00% 0.00%
+system.ruby.CorePair_Controller.NB_AckS 1047 0.00% 0.00%
+system.ruby.CorePair_Controller.NB_AckM 329 0.00% 0.00%
+system.ruby.CorePair_Controller.NB_AckE 173 0.00% 0.00%
+system.ruby.CorePair_Controller.L1I_Repl 602 0.00% 0.00%
+system.ruby.CorePair_Controller.L1D0_Repl 28 0.00% 0.00%
+system.ruby.CorePair_Controller.L2_to_L1D0 7 0.00% 0.00%
+system.ruby.CorePair_Controller.L2_to_L1I 67 0.00% 0.00%
+system.ruby.CorePair_Controller.PrbInvData 15 0.00% 0.00%
+system.ruby.CorePair_Controller.PrbInvDataDemand 2 0.00% 0.00%
+system.ruby.CorePair_Controller.PrbShrData 4 0.00% 0.00%
+system.ruby.CorePair_Controller.PrbShrDataDemand 2 0.00% 0.00%
+system.ruby.CorePair_Controller.I.C0_Load_L1miss 186 0.00% 0.00%
+system.ruby.CorePair_Controller.I.Ifetch0_L1miss 1034 0.00% 0.00%
+system.ruby.CorePair_Controller.I.C0_Store_L1miss 325 0.00% 0.00%
+system.ruby.CorePair_Controller.I.PrbInvDataDemand 1 0.00% 0.00%
+system.ruby.CorePair_Controller.S.C0_Load_L1hit 643 0.00% 0.00%
+system.ruby.CorePair_Controller.S.Ifetch0_L1hit 85994 0.00% 0.00%
+system.ruby.CorePair_Controller.S.Ifetch0_L1miss 67 0.00% 0.00%
+system.ruby.CorePair_Controller.S.C0_Store_L1hit 4 0.00% 0.00%
+system.ruby.CorePair_Controller.S.L1I_Repl 602 0.00% 0.00%
+system.ruby.CorePair_Controller.E0.C0_Load_L1miss 2 0.00% 0.00%
+system.ruby.CorePair_Controller.E0.C0_Load_L1hit 2728 0.00% 0.00%
+system.ruby.CorePair_Controller.E0.C0_Store_L1hit 50 0.00% 0.00%
+system.ruby.CorePair_Controller.E0.L1D0_Repl 16 0.00% 0.00%
+system.ruby.CorePair_Controller.E0.PrbInvData 1 0.00% 0.00%
+system.ruby.CorePair_Controller.E0.PrbShrData 1 0.00% 0.00%
+system.ruby.CorePair_Controller.E0.PrbShrDataDemand 1 0.00% 0.00%
+system.ruby.CorePair_Controller.O.PrbInvData 4 0.00% 0.00%
+system.ruby.CorePair_Controller.M0.C0_Load_L1miss 5 0.00% 0.00%
+system.ruby.CorePair_Controller.M0.C0_Load_L1hit 12771 0.00% 0.00%
+system.ruby.CorePair_Controller.M0.C0_Store_L1miss 2 0.00% 0.00%
+system.ruby.CorePair_Controller.M0.C0_Store_L1hit 10392 0.00% 0.00%
+system.ruby.CorePair_Controller.M0.L1D0_Repl 12 0.00% 0.00%
+system.ruby.CorePair_Controller.M0.PrbInvData 10 0.00% 0.00%
+system.ruby.CorePair_Controller.M0.PrbInvDataDemand 1 0.00% 0.00%
+system.ruby.CorePair_Controller.M0.PrbShrData 3 0.00% 0.00%
+system.ruby.CorePair_Controller.M0.PrbShrDataDemand 1 0.00% 0.00%
+system.ruby.CorePair_Controller.I_M0.NB_AckM 325 0.00% 0.00%
+system.ruby.CorePair_Controller.I_E0S.NB_AckS 13 0.00% 0.00%
+system.ruby.CorePair_Controller.I_E0S.NB_AckE 173 0.00% 0.00%
+system.ruby.CorePair_Controller.Si_F0.L2_to_L1I 67 0.00% 0.00%
+system.ruby.CorePair_Controller.S_M0.NB_AckM 4 0.00% 0.00%
+system.ruby.CorePair_Controller.S0.NB_AckS 1034 0.00% 0.00%
+system.ruby.CorePair_Controller.E0_F.L2_to_L1D0 2 0.00% 0.00%
+system.ruby.CorePair_Controller.M0_F.L2_to_L1D0 5 0.00% 0.00%
+system.ruby.Directory_Controller.RdBlkS 190 0.00% 0.00%
+system.ruby.Directory_Controller.RdBlkM 31 0.00% 0.00%
+system.ruby.Directory_Controller.RdBlk 56 0.00% 0.00%
+system.ruby.Directory_Controller.WriteThrough 1 0.00% 0.00%
+system.ruby.Directory_Controller.Atomic 1 0.00% 0.00%
+system.ruby.Directory_Controller.RdBlkSP 844 0.00% 0.00%
+system.ruby.Directory_Controller.RdBlkMP 298 0.00% 0.00%
+system.ruby.Directory_Controller.RdBlkP 137 0.00% 0.00%
+system.ruby.Directory_Controller.WriteThroughP 15 0.00% 0.00%
+system.ruby.Directory_Controller.AtomicP 1 0.00% 0.00%
+system.ruby.Directory_Controller.CPUPrbResp 28 0.00% 0.00%
+system.ruby.Directory_Controller.LastCPUPrbResp 8 0.00% 0.00%
+system.ruby.Directory_Controller.ProbeAcksComplete 271 0.00% 0.00%
+system.ruby.Directory_Controller.L3Hit 11 0.00% 0.00%
+system.ruby.Directory_Controller.MemData 1563 0.00% 0.00%
+system.ruby.Directory_Controller.CoreUnblock 1556 0.00% 0.00%
+system.ruby.Directory_Controller.UnblockWriteThrough 18 0.00% 0.00%
+system.ruby.Directory_Controller.U.RdBlkS 190 0.00% 0.00%
+system.ruby.Directory_Controller.U.RdBlkM 31 0.00% 0.00%
+system.ruby.Directory_Controller.U.RdBlk 56 0.00% 0.00%
+system.ruby.Directory_Controller.U.WriteThrough 1 0.00% 0.00%
+system.ruby.Directory_Controller.U.Atomic 1 0.00% 0.00%
+system.ruby.Directory_Controller.U.RdBlkSP 844 0.00% 0.00%
+system.ruby.Directory_Controller.U.RdBlkMP 298 0.00% 0.00%
+system.ruby.Directory_Controller.U.RdBlkP 137 0.00% 0.00%
+system.ruby.Directory_Controller.U.WriteThroughP 15 0.00% 0.00%
+system.ruby.Directory_Controller.U.AtomicP 1 0.00% 0.00%
+system.ruby.Directory_Controller.U.CPUPrbResp 28 0.00% 0.00%
+system.ruby.Directory_Controller.BS_M.MemData 1034 0.00% 0.00%
+system.ruby.Directory_Controller.BM_M.MemData 347 0.00% 0.00%
+system.ruby.Directory_Controller.B_M.L3Hit 11 0.00% 0.00%
+system.ruby.Directory_Controller.B_M.MemData 180 0.00% 0.00%
+system.ruby.Directory_Controller.BS_PM.ProbeAcksComplete 190 0.00% 0.00%
+system.ruby.Directory_Controller.BM_PM.LastCPUPrbResp 4 0.00% 0.00%
+system.ruby.Directory_Controller.BM_PM.ProbeAcksComplete 29 0.00% 0.00%
+system.ruby.Directory_Controller.B_PM.LastCPUPrbResp 2 0.00% 0.00%
+system.ruby.Directory_Controller.B_PM.ProbeAcksComplete 52 0.00% 0.00%
+system.ruby.Directory_Controller.B_PM.MemData 2 0.00% 0.00%
+system.ruby.Directory_Controller.B_Pm.LastCPUPrbResp 2 0.00% 0.00%
+system.ruby.Directory_Controller.B.CoreUnblock 1556 0.00% 0.00%
+system.ruby.Directory_Controller.B.UnblockWriteThrough 18 0.00% 0.00%
+system.ruby.RegionBuffer_Controller.CPURead | 1220 99.43% 99.43% | 7 0.57% 100.00%
+system.ruby.RegionBuffer_Controller.CPURead::total 1227
+system.ruby.RegionBuffer_Controller.CPUWrite | 331 89.95% 89.95% | 37 10.05% 100.00%
+system.ruby.RegionBuffer_Controller.CPUWrite::total 368
+system.ruby.RegionBuffer_Controller.PrivateNotify | 272 98.91% 98.91% | 3 1.09% 100.00%
+system.ruby.RegionBuffer_Controller.PrivateNotify::total 275
+system.ruby.RegionBuffer_Controller.SharedNotify | 2 50.00% 50.00% | 2 50.00% 100.00%
+system.ruby.RegionBuffer_Controller.SharedNotify::total 4
+system.ruby.RegionBuffer_Controller.InvRegion | 2 50.00% 50.00% | 2 50.00% 100.00%
+system.ruby.RegionBuffer_Controller.InvRegion::total 4
+system.ruby.RegionBuffer_Controller.DowngradeRegion | 2 50.00% 50.00% | 2 50.00% 100.00%
+system.ruby.RegionBuffer_Controller.DowngradeRegion::total 4
+system.ruby.RegionBuffer_Controller.InvAck | 23 67.65% 67.65% | 11 32.35% 100.00%
+system.ruby.RegionBuffer_Controller.InvAck::total 34
+system.ruby.RegionBuffer_Controller.DoneAck | 1572 96.26% 96.26% | 61 3.74% 100.00%
+system.ruby.RegionBuffer_Controller.DoneAck::total 1633
+system.ruby.RegionBuffer_Controller.AllOutstanding | 6 54.55% 54.55% | 5 45.45% 100.00%
+system.ruby.RegionBuffer_Controller.AllOutstanding::total 11
+system.ruby.RegionBuffer_Controller.Evict | 64 66.67% 66.67% | 32 33.33% 100.00%
+system.ruby.RegionBuffer_Controller.Evict::total 96
+system.ruby.RegionBuffer_Controller.LastAck_PrbResp | 4 50.00% 50.00% | 4 50.00% 100.00%
+system.ruby.RegionBuffer_Controller.LastAck_PrbResp::total 8
+system.ruby.RegionBuffer_Controller.StallAccess | 0 0.00% 0.00% | 16 100.00% 100.00%
+system.ruby.RegionBuffer_Controller.StallAccess::total 16
+system.ruby.RegionBuffer_Controller.NP.CPURead | 243 98.78% 98.78% | 3 1.22% 100.00%
+system.ruby.RegionBuffer_Controller.NP.CPURead::total 246
+system.ruby.RegionBuffer_Controller.NP.CPUWrite | 29 96.67% 96.67% | 1 3.33% 100.00%
+system.ruby.RegionBuffer_Controller.NP.CPUWrite::total 30
+system.ruby.RegionBuffer_Controller.P.CPURead | 965 99.59% 99.59% | 4 0.41% 100.00%
+system.ruby.RegionBuffer_Controller.P.CPURead::total 969
+system.ruby.RegionBuffer_Controller.P.CPUWrite | 298 94.90% 94.90% | 16 5.10% 100.00%
+system.ruby.RegionBuffer_Controller.P.CPUWrite::total 314
+system.ruby.RegionBuffer_Controller.P.InvRegion | 1 100.00% 100.00% | 0 0.00% 100.00%
+system.ruby.RegionBuffer_Controller.P.InvRegion::total 1
+system.ruby.RegionBuffer_Controller.P.DowngradeRegion | 2 50.00% 50.00% | 2 50.00% 100.00%
+system.ruby.RegionBuffer_Controller.P.DowngradeRegion::total 4
+system.ruby.RegionBuffer_Controller.P.DoneAck | 1535 98.52% 98.52% | 23 1.48% 100.00%
+system.ruby.RegionBuffer_Controller.P.DoneAck::total 1558
+system.ruby.RegionBuffer_Controller.P.StallAccess | 0 0.00% 0.00% | 15 100.00% 100.00%
+system.ruby.RegionBuffer_Controller.P.StallAccess::total 15
+system.ruby.RegionBuffer_Controller.S.CPURead | 12 100.00% 100.00% | 0 0.00% 100.00%
+system.ruby.RegionBuffer_Controller.S.CPURead::total 12
+system.ruby.RegionBuffer_Controller.S.CPUWrite | 2 66.67% 66.67% | 1 33.33% 100.00%
+system.ruby.RegionBuffer_Controller.S.CPUWrite::total 3
+system.ruby.RegionBuffer_Controller.S.InvRegion | 1 33.33% 33.33% | 2 66.67% 100.00%
+system.ruby.RegionBuffer_Controller.S.InvRegion::total 3
+system.ruby.RegionBuffer_Controller.S.DoneAck | 14 87.50% 87.50% | 2 12.50% 100.00%
+system.ruby.RegionBuffer_Controller.S.DoneAck::total 16
+system.ruby.RegionBuffer_Controller.NP_PS.PrivateNotify | 270 99.26% 99.26% | 2 0.74% 100.00%
+system.ruby.RegionBuffer_Controller.NP_PS.PrivateNotify::total 272
+system.ruby.RegionBuffer_Controller.NP_PS.SharedNotify | 2 50.00% 50.00% | 2 50.00% 100.00%
+system.ruby.RegionBuffer_Controller.NP_PS.SharedNotify::total 4
+system.ruby.RegionBuffer_Controller.NP_PS.DoneAck | 8 25.81% 25.81% | 23 74.19% 100.00%
+system.ruby.RegionBuffer_Controller.NP_PS.DoneAck::total 31
+system.ruby.RegionBuffer_Controller.NP_PS.StallAccess | 0 0.00% 0.00% | 1 100.00% 100.00%
+system.ruby.RegionBuffer_Controller.NP_PS.StallAccess::total 1
+system.ruby.RegionBuffer_Controller.S_P.CPUWrite | 0 0.00% 0.00% | 18 100.00% 100.00%
+system.ruby.RegionBuffer_Controller.S_P.CPUWrite::total 18
+system.ruby.RegionBuffer_Controller.S_P.PrivateNotify | 2 66.67% 66.67% | 1 33.33% 100.00%
+system.ruby.RegionBuffer_Controller.S_P.PrivateNotify::total 3
+system.ruby.RegionBuffer_Controller.S_P.DoneAck | 15 53.57% 53.57% | 13 46.43% 100.00%
+system.ruby.RegionBuffer_Controller.S_P.DoneAck::total 28
+system.ruby.RegionBuffer_Controller.P_NP.InvAck | 17 60.71% 60.71% | 11 39.29% 100.00%
+system.ruby.RegionBuffer_Controller.P_NP.InvAck::total 28
+system.ruby.RegionBuffer_Controller.P_NP.Evict | 32 50.00% 50.00% | 32 50.00% 100.00%
+system.ruby.RegionBuffer_Controller.P_NP.Evict::total 64
+system.ruby.RegionBuffer_Controller.P_NP.LastAck_PrbResp | 2 50.00% 50.00% | 2 50.00% 100.00%
+system.ruby.RegionBuffer_Controller.P_NP.LastAck_PrbResp::total 4
+system.ruby.RegionBuffer_Controller.P_S.InvAck | 6 100.00% 100.00% | 0 0.00% 100.00%
+system.ruby.RegionBuffer_Controller.P_S.InvAck::total 6
+system.ruby.RegionBuffer_Controller.P_S.Evict | 32 100.00% 100.00% | 0 0.00% 100.00%
+system.ruby.RegionBuffer_Controller.P_S.Evict::total 32
+system.ruby.RegionBuffer_Controller.P_S.LastAck_PrbResp | 2 50.00% 50.00% | 2 50.00% 100.00%
+system.ruby.RegionBuffer_Controller.P_S.LastAck_PrbResp::total 4
+system.ruby.RegionBuffer_Controller.P_NP_O.AllOutstanding | 2 50.00% 50.00% | 2 50.00% 100.00%
+system.ruby.RegionBuffer_Controller.P_NP_O.AllOutstanding::total 4
+system.ruby.RegionBuffer_Controller.P_S_O.AllOutstanding | 2 50.00% 50.00% | 2 50.00% 100.00%
+system.ruby.RegionBuffer_Controller.P_S_O.AllOutstanding::total 4
+system.ruby.RegionBuffer_Controller.S_O.AllOutstanding | 2 66.67% 66.67% | 1 33.33% 100.00%
+system.ruby.RegionBuffer_Controller.S_O.AllOutstanding::total 3
+system.ruby.RegionBuffer_Controller.SS_P.CPUWrite | 2 66.67% 66.67% | 1 33.33% 100.00%
+system.ruby.RegionBuffer_Controller.SS_P.CPUWrite::total 3
+system.ruby.RegionDir_Controller.SendInv 1 0.00% 0.00%
+system.ruby.RegionDir_Controller.SendUpgrade 3 0.00% 0.00%
+system.ruby.RegionDir_Controller.SendDowngrade 4 0.00% 0.00%
+system.ruby.RegionDir_Controller.PrivateRequest 271 0.00% 0.00%
+system.ruby.RegionDir_Controller.InvAckCore 4 0.00% 0.00%
+system.ruby.RegionDir_Controller.InvAckCoreNoShare 4 0.00% 0.00%
+system.ruby.RegionDir_Controller.CPUPrivateAck 278 0.00% 0.00%
+system.ruby.RegionDir_Controller.LastAck 8 0.00% 0.00%
+system.ruby.RegionDir_Controller.DirReadyAck 8 0.00% 0.00%
+system.ruby.RegionDir_Controller.TriggerInv 4 0.00% 0.00%
+system.ruby.RegionDir_Controller.TriggerDowngrade 4 0.00% 0.00%
+system.ruby.RegionDir_Controller.NP.PrivateRequest 271 0.00% 0.00%
+system.ruby.RegionDir_Controller.P.SendInv 1 0.00% 0.00%
+system.ruby.RegionDir_Controller.P.SendDowngrade 4 0.00% 0.00%
+system.ruby.RegionDir_Controller.S.SendUpgrade 3 0.00% 0.00%
+system.ruby.RegionDir_Controller.NP_P.CPUPrivateAck 270 0.00% 0.00%
+system.ruby.RegionDir_Controller.P_P.CPUPrivateAck 1 0.00% 0.00%
+system.ruby.RegionDir_Controller.P_S.CPUPrivateAck 4 0.00% 0.00%
+system.ruby.RegionDir_Controller.S_P.CPUPrivateAck 3 0.00% 0.00%
+system.ruby.RegionDir_Controller.P_AS.InvAckCore 4 0.00% 0.00%
+system.ruby.RegionDir_Controller.P_AS.LastAck 4 0.00% 0.00%
+system.ruby.RegionDir_Controller.S_AP.InvAckCoreNoShare 3 0.00% 0.00%
+system.ruby.RegionDir_Controller.S_AP.LastAck 3 0.00% 0.00%
+system.ruby.RegionDir_Controller.P_AP.InvAckCoreNoShare 1 0.00% 0.00%
+system.ruby.RegionDir_Controller.P_AP.LastAck 1 0.00% 0.00%
+system.ruby.RegionDir_Controller.P_AP_W.DirReadyAck 1 0.00% 0.00%
+system.ruby.RegionDir_Controller.P_AP_W.TriggerInv 1 0.00% 0.00%
+system.ruby.RegionDir_Controller.P_AS_W.DirReadyAck 4 0.00% 0.00%
+system.ruby.RegionDir_Controller.P_AS_W.TriggerDowngrade 4 0.00% 0.00%
+system.ruby.RegionDir_Controller.S_AP_W.DirReadyAck 3 0.00% 0.00%
+system.ruby.RegionDir_Controller.S_AP_W.TriggerInv 3 0.00% 0.00%
+system.ruby.LD.latency_hist::bucket_size 64
+system.ruby.LD.latency_hist::max_bucket 639
+system.ruby.LD.latency_hist::samples 16335
+system.ruby.LD.latency_hist::mean 2.844751
+system.ruby.LD.latency_hist::gmean 1.060634
+system.ruby.LD.latency_hist::stdev 17.742972
+system.ruby.LD.latency_hist | 16149 98.86% 98.86% | 11 0.07% 98.93% | 119 0.73% 99.66% | 52 0.32% 99.98% | 2 0.01% 99.99% | 1 0.01% 99.99% | 1 0.01% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.LD.latency_hist::total 16335
+system.ruby.LD.hit_latency_hist::bucket_size 64
+system.ruby.LD.hit_latency_hist::max_bucket 639
+system.ruby.LD.hit_latency_hist::samples 186
+system.ruby.LD.hit_latency_hist::mean 162.333333
+system.ruby.LD.hit_latency_hist::gmean 157.431876
+system.ruby.LD.hit_latency_hist::stdev 43.755298
+system.ruby.LD.hit_latency_hist | 0 0.00% 0.00% | 11 5.91% 5.91% | 119 63.98% 69.89% | 52 27.96% 97.85% | 2 1.08% 98.92% | 1 0.54% 99.46% | 1 0.54% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.LD.hit_latency_hist::total 186
+system.ruby.LD.miss_latency_hist::bucket_size 2
+system.ruby.LD.miss_latency_hist::max_bucket 19
+system.ruby.LD.miss_latency_hist::samples 16149
+system.ruby.LD.miss_latency_hist::mean 1.007802
+system.ruby.LD.miss_latency_hist::gmean 1.001277
+system.ruby.LD.miss_latency_hist::stdev 0.374686
+system.ruby.LD.miss_latency_hist | 16142 99.96% 99.96% | 0 0.00% 99.96% | 0 0.00% 99.96% | 0 0.00% 99.96% | 0 0.00% 99.96% | 0 0.00% 99.96% | 0 0.00% 99.96% | 0 0.00% 99.96% | 0 0.00% 99.96% | 7 0.04% 100.00%
+system.ruby.LD.miss_latency_hist::total 16149
+system.ruby.ST.latency_hist::bucket_size 64
+system.ruby.ST.latency_hist::max_bucket 639
+system.ruby.ST.latency_hist::samples 10412
+system.ruby.ST.latency_hist::mean 5.551287
+system.ruby.ST.latency_hist::gmean 1.167783
+system.ruby.ST.latency_hist::stdev 26.172531
+system.ruby.ST.latency_hist | 10087 96.88% 96.88% | 0 0.00% 96.88% | 289 2.78% 99.65% | 29 0.28% 99.93% | 4 0.04% 99.97% | 2 0.02% 99.99% | 0 0.00% 99.99% | 1 0.01% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.ST.latency_hist::total 10412
+system.ruby.ST.hit_latency_hist::bucket_size 64
+system.ruby.ST.hit_latency_hist::max_bucket 639
+system.ruby.ST.hit_latency_hist::samples 325
+system.ruby.ST.hit_latency_hist::mean 146.809231
+system.ruby.ST.hit_latency_hist::gmean 143.903653
+system.ruby.ST.hit_latency_hist::stdev 36.751508
+system.ruby.ST.hit_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 289 88.92% 88.92% | 29 8.92% 97.85% | 4 1.23% 99.08% | 2 0.62% 99.69% | 0 0.00% 99.69% | 1 0.31% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.ST.hit_latency_hist::total 325
+system.ruby.ST.miss_latency_hist::bucket_size 1
+system.ruby.ST.miss_latency_hist::max_bucket 9
+system.ruby.ST.miss_latency_hist::samples 10087
+system.ruby.ST.miss_latency_hist::mean 1
+system.ruby.ST.miss_latency_hist::gmean 1
+system.ruby.ST.miss_latency_hist | 0 0.00% 0.00% | 10087 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.ST.miss_latency_hist::total 10087
+system.ruby.IFETCH.latency_hist::bucket_size 64
+system.ruby.IFETCH.latency_hist::max_bucket 639
+system.ruby.IFETCH.latency_hist::samples 87095
+system.ruby.IFETCH.latency_hist::mean 2.818945
+system.ruby.IFETCH.latency_hist::gmean 1.063630
+system.ruby.IFETCH.latency_hist::stdev 17.067789
+system.ruby.IFETCH.latency_hist | 86061 98.81% 98.81% | 0 0.00% 98.81% | 826 0.95% 99.76% | 185 0.21% 99.97% | 8 0.01% 99.98% | 9 0.01% 99.99% | 6 0.01% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.IFETCH.latency_hist::total 87095
+system.ruby.IFETCH.hit_latency_hist::bucket_size 64
+system.ruby.IFETCH.hit_latency_hist::max_bucket 639
+system.ruby.IFETCH.hit_latency_hist::samples 1034
+system.ruby.IFETCH.hit_latency_hist::mean 153.045455
+system.ruby.IFETCH.hit_latency_hist::gmean 149.192268
+system.ruby.IFETCH.hit_latency_hist::stdev 40.969954
+system.ruby.IFETCH.hit_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 826 79.88% 79.88% | 185 17.89% 97.78% | 8 0.77% 98.55% | 9 0.87% 99.42% | 6 0.58% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.IFETCH.hit_latency_hist::total 1034
+system.ruby.IFETCH.miss_latency_hist::bucket_size 2
+system.ruby.IFETCH.miss_latency_hist::max_bucket 19
+system.ruby.IFETCH.miss_latency_hist::samples 86061
+system.ruby.IFETCH.miss_latency_hist::mean 1.014013
+system.ruby.IFETCH.miss_latency_hist::gmean 1.002295
+system.ruby.IFETCH.miss_latency_hist::stdev 0.502042
+system.ruby.IFETCH.miss_latency_hist | 85994 99.92% 99.92% | 0 0.00% 99.92% | 0 0.00% 99.92% | 0 0.00% 99.92% | 0 0.00% 99.92% | 0 0.00% 99.92% | 0 0.00% 99.92% | 0 0.00% 99.92% | 0 0.00% 99.92% | 67 0.08% 100.00%
+system.ruby.IFETCH.miss_latency_hist::total 86061
+system.ruby.RMW_Read.latency_hist::bucket_size 32
+system.ruby.RMW_Read.latency_hist::max_bucket 319
+system.ruby.RMW_Read.latency_hist::samples 341
+system.ruby.RMW_Read.latency_hist::mean 2.671554
+system.ruby.RMW_Read.latency_hist::gmean 1.059947
+system.ruby.RMW_Read.latency_hist::stdev 15.416875
+system.ruby.RMW_Read.latency_hist | 337 98.83% 98.83% | 0 0.00% 98.83% | 0 0.00% 98.83% | 0 0.00% 98.83% | 3 0.88% 99.71% | 1 0.29% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.RMW_Read.latency_hist::total 341
+system.ruby.RMW_Read.hit_latency_hist::bucket_size 32
+system.ruby.RMW_Read.hit_latency_hist::max_bucket 319
+system.ruby.RMW_Read.hit_latency_hist::samples 4
+system.ruby.RMW_Read.hit_latency_hist::mean 143.500000
+system.ruby.RMW_Read.hit_latency_hist::gmean 143.041358
+system.ruby.RMW_Read.hit_latency_hist::stdev 13.403980
+system.ruby.RMW_Read.hit_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 3 75.00% 75.00% | 1 25.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.RMW_Read.hit_latency_hist::total 4
+system.ruby.RMW_Read.miss_latency_hist::bucket_size 1
+system.ruby.RMW_Read.miss_latency_hist::max_bucket 9
+system.ruby.RMW_Read.miss_latency_hist::samples 337
+system.ruby.RMW_Read.miss_latency_hist::mean 1
+system.ruby.RMW_Read.miss_latency_hist::gmean 1
+system.ruby.RMW_Read.miss_latency_hist | 0 0.00% 0.00% | 337 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.RMW_Read.miss_latency_hist::total 337
+system.ruby.Locked_RMW_Read.latency_hist::bucket_size 1
+system.ruby.Locked_RMW_Read.latency_hist::max_bucket 9
+system.ruby.Locked_RMW_Read.latency_hist::samples 10
+system.ruby.Locked_RMW_Read.latency_hist::mean 1
+system.ruby.Locked_RMW_Read.latency_hist::gmean 1
+system.ruby.Locked_RMW_Read.latency_hist | 0 0.00% 0.00% | 10 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.Locked_RMW_Read.latency_hist::total 10
+system.ruby.Locked_RMW_Read.miss_latency_hist::bucket_size 1
+system.ruby.Locked_RMW_Read.miss_latency_hist::max_bucket 9
+system.ruby.Locked_RMW_Read.miss_latency_hist::samples 10
+system.ruby.Locked_RMW_Read.miss_latency_hist::mean 1
+system.ruby.Locked_RMW_Read.miss_latency_hist::gmean 1
+system.ruby.Locked_RMW_Read.miss_latency_hist | 0 0.00% 0.00% | 10 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.Locked_RMW_Read.miss_latency_hist::total 10
+system.ruby.Locked_RMW_Write.latency_hist::bucket_size 1
+system.ruby.Locked_RMW_Write.latency_hist::max_bucket 9
+system.ruby.Locked_RMW_Write.latency_hist::samples 10
+system.ruby.Locked_RMW_Write.latency_hist::mean 1
+system.ruby.Locked_RMW_Write.latency_hist::gmean 1
+system.ruby.Locked_RMW_Write.latency_hist | 0 0.00% 0.00% | 10 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.Locked_RMW_Write.latency_hist::total 10
+system.ruby.Locked_RMW_Write.miss_latency_hist::bucket_size 1
+system.ruby.Locked_RMW_Write.miss_latency_hist::max_bucket 9
+system.ruby.Locked_RMW_Write.miss_latency_hist::samples 10
+system.ruby.Locked_RMW_Write.miss_latency_hist::mean 1
+system.ruby.Locked_RMW_Write.miss_latency_hist::gmean 1
+system.ruby.Locked_RMW_Write.miss_latency_hist | 0 0.00% 0.00% | 10 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.Locked_RMW_Write.miss_latency_hist::total 10
+system.ruby.L1Cache.miss_mach_latency_hist::bucket_size 1
+system.ruby.L1Cache.miss_mach_latency_hist::max_bucket 9
+system.ruby.L1Cache.miss_mach_latency_hist::samples 112580
+system.ruby.L1Cache.miss_mach_latency_hist::mean 1
+system.ruby.L1Cache.miss_mach_latency_hist::gmean 1
+system.ruby.L1Cache.miss_mach_latency_hist | 0 0.00% 0.00% | 112580 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.L1Cache.miss_mach_latency_hist::total 112580
+system.ruby.L2Cache.miss_mach_latency_hist::bucket_size 2
+system.ruby.L2Cache.miss_mach_latency_hist::max_bucket 19
+system.ruby.L2Cache.miss_mach_latency_hist::samples 74
+system.ruby.L2Cache.miss_mach_latency_hist::mean 19
+system.ruby.L2Cache.miss_mach_latency_hist::gmean 19.000000
+system.ruby.L2Cache.miss_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 74 100.00% 100.00%
+system.ruby.L2Cache.miss_mach_latency_hist::total 74
+system.ruby.L3Cache.hit_mach_latency_hist::bucket_size 16
+system.ruby.L3Cache.hit_mach_latency_hist::max_bucket 159
+system.ruby.L3Cache.hit_mach_latency_hist::samples 11
+system.ruby.L3Cache.hit_mach_latency_hist::mean 107
+system.ruby.L3Cache.hit_mach_latency_hist::gmean 107.000000
+system.ruby.L3Cache.hit_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 11 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.L3Cache.hit_mach_latency_hist::total 11
+system.ruby.Directory.hit_mach_latency_hist::bucket_size 64
+system.ruby.Directory.hit_mach_latency_hist::max_bucket 639
+system.ruby.Directory.hit_mach_latency_hist::samples 1538
+system.ruby.Directory.hit_mach_latency_hist::mean 153.155397
+system.ruby.Directory.hit_mach_latency_hist::gmean 149.362802
+system.ruby.Directory.hit_mach_latency_hist::stdev 40.587599
+system.ruby.Directory.hit_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 1238 80.49% 80.49% | 266 17.30% 97.79% | 14 0.91% 98.70% | 12 0.78% 99.48% | 7 0.46% 99.93% | 1 0.07% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.Directory.hit_mach_latency_hist::total 1538
+system.ruby.LD.L1Cache.miss_type_mach_latency_hist::bucket_size 1
+system.ruby.LD.L1Cache.miss_type_mach_latency_hist::max_bucket 9
+system.ruby.LD.L1Cache.miss_type_mach_latency_hist::samples 16142
+system.ruby.LD.L1Cache.miss_type_mach_latency_hist::mean 1
+system.ruby.LD.L1Cache.miss_type_mach_latency_hist::gmean 1
+system.ruby.LD.L1Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 16142 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.LD.L1Cache.miss_type_mach_latency_hist::total 16142
+system.ruby.LD.L2Cache.miss_type_mach_latency_hist::bucket_size 2
+system.ruby.LD.L2Cache.miss_type_mach_latency_hist::max_bucket 19
+system.ruby.LD.L2Cache.miss_type_mach_latency_hist::samples 7
+system.ruby.LD.L2Cache.miss_type_mach_latency_hist::mean 19
+system.ruby.LD.L2Cache.miss_type_mach_latency_hist::gmean 19.000000
+system.ruby.LD.L2Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 7 100.00% 100.00%
+system.ruby.LD.L2Cache.miss_type_mach_latency_hist::total 7
+system.ruby.LD.L3Cache.hit_type_mach_latency_hist::bucket_size 16
+system.ruby.LD.L3Cache.hit_type_mach_latency_hist::max_bucket 159
+system.ruby.LD.L3Cache.hit_type_mach_latency_hist::samples 11
+system.ruby.LD.L3Cache.hit_type_mach_latency_hist::mean 107
+system.ruby.LD.L3Cache.hit_type_mach_latency_hist::gmean 107.000000
+system.ruby.LD.L3Cache.hit_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 11 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.LD.L3Cache.hit_type_mach_latency_hist::total 11
+system.ruby.LD.Directory.hit_type_mach_latency_hist::bucket_size 64
+system.ruby.LD.Directory.hit_type_mach_latency_hist::max_bucket 639
+system.ruby.LD.Directory.hit_type_mach_latency_hist::samples 175
+system.ruby.LD.Directory.hit_type_mach_latency_hist::mean 165.811429
+system.ruby.LD.Directory.hit_type_mach_latency_hist::gmean 161.300002
+system.ruby.LD.Directory.hit_type_mach_latency_hist::stdev 42.776536
+system.ruby.LD.Directory.hit_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 119 68.00% 68.00% | 52 29.71% 97.71% | 2 1.14% 98.86% | 1 0.57% 99.43% | 1 0.57% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.LD.Directory.hit_type_mach_latency_hist::total 175
+system.ruby.ST.L1Cache.miss_type_mach_latency_hist::bucket_size 1
+system.ruby.ST.L1Cache.miss_type_mach_latency_hist::max_bucket 9
+system.ruby.ST.L1Cache.miss_type_mach_latency_hist::samples 10087
+system.ruby.ST.L1Cache.miss_type_mach_latency_hist::mean 1
+system.ruby.ST.L1Cache.miss_type_mach_latency_hist::gmean 1
+system.ruby.ST.L1Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 10087 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.ST.L1Cache.miss_type_mach_latency_hist::total 10087
+system.ruby.ST.Directory.hit_type_mach_latency_hist::bucket_size 64
+system.ruby.ST.Directory.hit_type_mach_latency_hist::max_bucket 639
+system.ruby.ST.Directory.hit_type_mach_latency_hist::samples 325
+system.ruby.ST.Directory.hit_type_mach_latency_hist::mean 146.809231
+system.ruby.ST.Directory.hit_type_mach_latency_hist::gmean 143.903653
+system.ruby.ST.Directory.hit_type_mach_latency_hist::stdev 36.751508
+system.ruby.ST.Directory.hit_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 289 88.92% 88.92% | 29 8.92% 97.85% | 4 1.23% 99.08% | 2 0.62% 99.69% | 0 0.00% 99.69% | 1 0.31% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.ST.Directory.hit_type_mach_latency_hist::total 325
+system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist::bucket_size 1
+system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist::max_bucket 9
+system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist::samples 85994
+system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist::mean 1
+system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist::gmean 1
+system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 85994 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.IFETCH.L1Cache.miss_type_mach_latency_hist::total 85994
+system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist::bucket_size 2
+system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist::max_bucket 19
+system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist::samples 67
+system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist::mean 19
+system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist::gmean 19.000000
+system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 67 100.00% 100.00%
+system.ruby.IFETCH.L2Cache.miss_type_mach_latency_hist::total 67
+system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::bucket_size 64
+system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::max_bucket 639
+system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::samples 1034
+system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::mean 153.045455
+system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::gmean 149.192268
+system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::stdev 40.969954
+system.ruby.IFETCH.Directory.hit_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 826 79.88% 79.88% | 185 17.89% 97.78% | 8 0.77% 98.55% | 9 0.87% 99.42% | 6 0.58% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::total 1034
+system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist::bucket_size 1
+system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist::max_bucket 9
+system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist::samples 337
+system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist::mean 1
+system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist::gmean 1
+system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 337 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.RMW_Read.L1Cache.miss_type_mach_latency_hist::total 337
+system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::bucket_size 32
+system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::max_bucket 319
+system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::samples 4
+system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::mean 143.500000
+system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::gmean 143.041358
+system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::stdev 13.403980
+system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 3 75.00% 75.00% | 1 25.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.RMW_Read.Directory.hit_type_mach_latency_hist::total 4
+system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist::bucket_size 1
+system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist::max_bucket 9
+system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist::samples 10
+system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist::mean 1
+system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist::gmean 1
+system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 10 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.Locked_RMW_Read.L1Cache.miss_type_mach_latency_hist::total 10
+system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist::bucket_size 1
+system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist::max_bucket 9
+system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist::samples 10
+system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist::mean 1
+system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist::gmean 1
+system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist | 0 0.00% 0.00% | 10 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.Locked_RMW_Write.L1Cache.miss_type_mach_latency_hist::total 10
+system.ruby.SQC_Controller.Fetch 86 0.00% 0.00%
+system.ruby.SQC_Controller.Data 5 0.00% 0.00%
+system.ruby.SQC_Controller.I.Fetch 5 0.00% 0.00%
+system.ruby.SQC_Controller.I.Data 5 0.00% 0.00%
+system.ruby.SQC_Controller.V.Fetch 81 0.00% 0.00%
+system.ruby.TCC_Controller.RdBlk 9 0.00% 0.00%
+system.ruby.TCC_Controller.WrVicBlk 16 0.00% 0.00%
+system.ruby.TCC_Controller.Atomic 2 0.00% 0.00%
+system.ruby.TCC_Controller.AtomicDone 1 0.00% 0.00%
+system.ruby.TCC_Controller.Data 9 0.00% 0.00%
+system.ruby.TCC_Controller.PrbInv 11 0.00% 0.00%
+system.ruby.TCC_Controller.WBAck 16 0.00% 0.00%
+system.ruby.TCC_Controller.V.PrbInv 1 0.00% 0.00%
+system.ruby.TCC_Controller.I.RdBlk 7 0.00% 0.00%
+system.ruby.TCC_Controller.I.WrVicBlk 16 0.00% 0.00%
+system.ruby.TCC_Controller.I.Atomic 1 0.00% 0.00%
+system.ruby.TCC_Controller.I.PrbInv 10 0.00% 0.00%
+system.ruby.TCC_Controller.I.WBAck 16 0.00% 0.00%
+system.ruby.TCC_Controller.IV.RdBlk 2 0.00% 0.00%
+system.ruby.TCC_Controller.IV.Data 7 0.00% 0.00%
+system.ruby.TCC_Controller.A.Atomic 1 0.00% 0.00%
+system.ruby.TCC_Controller.A.AtomicDone 1 0.00% 0.00%
+system.ruby.TCC_Controller.A.Data 2 0.00% 0.00%
+system.ruby.TCP_Controller.Load | 5 50.00% 50.00% | 5 50.00% 100.00%
+system.ruby.TCP_Controller.Load::total 10
+system.ruby.TCP_Controller.StoreThrough | 8 50.00% 50.00% | 8 50.00% 100.00%
+system.ruby.TCP_Controller.StoreThrough::total 16
+system.ruby.TCP_Controller.Atomic | 1 50.00% 50.00% | 1 50.00% 100.00%
+system.ruby.TCP_Controller.Atomic::total 2
+system.ruby.TCP_Controller.Flush | 768 50.00% 50.00% | 768 50.00% 100.00%
+system.ruby.TCP_Controller.Flush::total 1536
+system.ruby.TCP_Controller.Evict | 512 50.00% 50.00% | 512 50.00% 100.00%
+system.ruby.TCP_Controller.Evict::total 1024
+system.ruby.TCP_Controller.TCC_Ack | 3 50.00% 50.00% | 3 50.00% 100.00%
+system.ruby.TCP_Controller.TCC_Ack::total 6
+system.ruby.TCP_Controller.TCC_AckWB | 8 50.00% 50.00% | 8 50.00% 100.00%
+system.ruby.TCP_Controller.TCC_AckWB::total 16
+system.ruby.TCP_Controller.I.Load | 2 50.00% 50.00% | 2 50.00% 100.00%
+system.ruby.TCP_Controller.I.Load::total 4
+system.ruby.TCP_Controller.I.StoreThrough | 8 50.00% 50.00% | 8 50.00% 100.00%
+system.ruby.TCP_Controller.I.StoreThrough::total 16
+system.ruby.TCP_Controller.I.Atomic | 1 50.00% 50.00% | 1 50.00% 100.00%
+system.ruby.TCP_Controller.I.Atomic::total 2
+system.ruby.TCP_Controller.I.Flush | 766 50.00% 50.00% | 766 50.00% 100.00%
+system.ruby.TCP_Controller.I.Flush::total 1532
+system.ruby.TCP_Controller.I.Evict | 510 50.00% 50.00% | 510 50.00% 100.00%
+system.ruby.TCP_Controller.I.Evict::total 1020
+system.ruby.TCP_Controller.I.TCC_Ack | 2 50.00% 50.00% | 2 50.00% 100.00%
+system.ruby.TCP_Controller.I.TCC_Ack::total 4
+system.ruby.TCP_Controller.I.TCC_AckWB | 8 50.00% 50.00% | 8 50.00% 100.00%
+system.ruby.TCP_Controller.I.TCC_AckWB::total 16
+system.ruby.TCP_Controller.V.Load | 3 50.00% 50.00% | 3 50.00% 100.00%
+system.ruby.TCP_Controller.V.Load::total 6
+system.ruby.TCP_Controller.V.Flush | 2 50.00% 50.00% | 2 50.00% 100.00%
+system.ruby.TCP_Controller.V.Flush::total 4
+system.ruby.TCP_Controller.V.Evict | 2 50.00% 50.00% | 2 50.00% 100.00%
+system.ruby.TCP_Controller.V.Evict::total 4
+system.ruby.TCP_Controller.A.TCC_Ack | 1 50.00% 50.00% | 1 50.00% 100.00%
+system.ruby.TCP_Controller.A.TCC_Ack::total 2
+
+---------- End Simulation Statistics ----------
diff --git a/tests/quick/se/04.gpu/test.py b/tests/quick/se/04.gpu/test.py
new file mode 100644
index 000000000..a074a8144
--- /dev/null
+++ b/tests/quick/se/04.gpu/test.py
@@ -0,0 +1,48 @@
+#
+# Copyright (c) 2015 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Author: Brad Beckmann
+#
+executable = binpath('gpu-hello')
+kernel_path = os.path.dirname(executable)
+kernel_files = glob.glob(os.path.join(kernel_path, '*.asm'))
+if kernel_files:
+ print "Using GPU kernel code file(s)", ",".join(kernel_files)
+else:
+ fatal("Can't locate kernel code (.asm) in " + kernel_path)
+
+driver = ClDriver(filename="hsa", codefile=kernel_files)
+root.system.cpu[2].cl_driver = driver
+root.system.cpu[0].workload = LiveProcess(cmd = 'gpu-hello',
+ executable = binpath('gpu-hello'),
+ drivers = [driver])
+
diff --git a/tests/quick/se/60.gpu-randomtest/ref/x86/linux/gpu-randomtest-ruby-GPU_RfO/config.ini b/tests/quick/se/60.gpu-randomtest/ref/x86/linux/gpu-randomtest-ruby-GPU_RfO/config.ini
new file mode 100644
index 000000000..06da5f023
--- /dev/null
+++ b/tests/quick/se/60.gpu-randomtest/ref/x86/linux/gpu-randomtest-ruby-GPU_RfO/config.ini
@@ -0,0 +1,5862 @@
+[root]
+type=Root
+children=system
+eventq_index=0
+full_system=false
+sim_quantum=0
+time_sync_enable=false
+time_sync_period=100000000
+time_sync_spin_threshold=100000
+
+[system]
+type=System
+children=clk_domain cp_cntrl0 cpu dir_cntrl0 dvfs_handler mem_ctrls ruby sqc_cntrl0 sqc_cntrl1 sys_port_proxy tcc_cntrl0 tccdir_cntrl0 tcp_cntrl0 tcp_cntrl1 tcp_cntrl2 tcp_cntrl3 tcp_cntrl4 tcp_cntrl5 tcp_cntrl6 tcp_cntrl7 voltage_domain
+boot_osflags=a
+cache_line_size=64
+clk_domain=system.clk_domain
+eventq_index=0
+exit_on_work_items=false
+init_param=0
+kernel=
+kernel_addr_check=true
+load_addr_mask=1099511627775
+load_offset=0
+mem_mode=timing
+mem_ranges=0:268435455
+memories=system.mem_ctrls
+mmap_using_noreserve=false
+multi_thread=false
+num_work_ids=16
+readfile=
+symbolfile=
+work_begin_ckpt_count=0
+work_begin_cpu_id_exit=-1
+work_begin_exit_count=0
+work_cpus_ckpt_count=0
+work_end_ckpt_count=0
+work_end_exit_count=0
+work_item_id=-1
+system_port=system.sys_port_proxy.slave[0]
+
+[system.clk_domain]
+type=SrcClockDomain
+clock=1
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.voltage_domain
+
+[system.cp_cntrl0]
+type=CorePair_Controller
+children=L1D0cache L1D1cache L1Icache L2cache mandatoryQueue probeToCore requestFromCore responseFromCore responseToCore sequencer sequencer1 triggerQueue unblockFromCore
+L1D0cache=system.cp_cntrl0.L1D0cache
+L1D1cache=system.cp_cntrl0.L1D1cache
+L1Icache=system.cp_cntrl0.L1Icache
+L2cache=system.cp_cntrl0.L2cache
+buffer_size=0
+clk_domain=system.clk_domain
+cluster_id=0
+eventq_index=0
+issue_latency=15
+l2_hit_latency=18
+mandatoryQueue=system.cp_cntrl0.mandatoryQueue
+number_of_TBEs=256
+probeToCore=system.cp_cntrl0.probeToCore
+recycle_latency=10
+requestFromCore=system.cp_cntrl0.requestFromCore
+responseFromCore=system.cp_cntrl0.responseFromCore
+responseToCore=system.cp_cntrl0.responseToCore
+ruby_system=system.ruby
+send_evictions=true
+sequencer=system.cp_cntrl0.sequencer
+sequencer1=system.cp_cntrl0.sequencer1
+system=system
+transitions_per_cycle=32
+triggerQueue=system.cp_cntrl0.triggerQueue
+unblockFromCore=system.cp_cntrl0.unblockFromCore
+version=0
+
+[system.cp_cntrl0.L1D0cache]
+type=RubyCache
+children=replacement_policy
+assoc=2
+block_size=0
+dataAccessLatency=1
+dataArrayBanks=1
+eventq_index=0
+is_icache=false
+replacement_policy=system.cp_cntrl0.L1D0cache.replacement_policy
+resourceStalls=false
+ruby_system=system.ruby
+size=256
+start_index_bit=6
+tagAccessLatency=1
+tagArrayBanks=1
+
+[system.cp_cntrl0.L1D0cache.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=2
+block_size=64
+eventq_index=0
+size=256
+
+[system.cp_cntrl0.L1D1cache]
+type=RubyCache
+children=replacement_policy
+assoc=2
+block_size=0
+dataAccessLatency=1
+dataArrayBanks=1
+eventq_index=0
+is_icache=false
+replacement_policy=system.cp_cntrl0.L1D1cache.replacement_policy
+resourceStalls=false
+ruby_system=system.ruby
+size=256
+start_index_bit=6
+tagAccessLatency=1
+tagArrayBanks=1
+
+[system.cp_cntrl0.L1D1cache.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=2
+block_size=64
+eventq_index=0
+size=256
+
+[system.cp_cntrl0.L1Icache]
+type=RubyCache
+children=replacement_policy
+assoc=2
+block_size=0
+dataAccessLatency=1
+dataArrayBanks=1
+eventq_index=0
+is_icache=false
+replacement_policy=system.cp_cntrl0.L1Icache.replacement_policy
+resourceStalls=false
+ruby_system=system.ruby
+size=256
+start_index_bit=6
+tagAccessLatency=1
+tagArrayBanks=1
+
+[system.cp_cntrl0.L1Icache.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=2
+block_size=64
+eventq_index=0
+size=256
+
+[system.cp_cntrl0.L2cache]
+type=RubyCache
+children=replacement_policy
+assoc=2
+block_size=0
+dataAccessLatency=1
+dataArrayBanks=1
+eventq_index=0
+is_icache=false
+replacement_policy=system.cp_cntrl0.L2cache.replacement_policy
+resourceStalls=false
+ruby_system=system.ruby
+size=512
+start_index_bit=6
+tagAccessLatency=1
+tagArrayBanks=1
+
+[system.cp_cntrl0.L2cache.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=2
+block_size=64
+eventq_index=0
+size=512
+
+[system.cp_cntrl0.mandatoryQueue]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+
+[system.cp_cntrl0.probeToCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+slave=system.ruby.network.master[3]
+
+[system.cp_cntrl0.requestFromCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+master=system.ruby.network.slave[2]
+
+[system.cp_cntrl0.responseFromCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+master=system.ruby.network.slave[3]
+
+[system.cp_cntrl0.responseToCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+slave=system.ruby.network.master[4]
+
+[system.cp_cntrl0.sequencer]
+type=RubySequencer
+clk_domain=system.clk_domain
+coreid=0
+dcache=system.cp_cntrl0.L1D0cache
+dcache_hit_latency=2
+deadlock_threshold=500000
+eventq_index=0
+icache=system.cp_cntrl0.L1Icache
+icache_hit_latency=2
+is_cpu_sequencer=true
+max_outstanding_requests=16
+no_retry_on_stall=true
+ruby_system=system.ruby
+support_data_reqs=true
+support_inst_reqs=true
+system=system
+using_network_tester=false
+using_ruby_tester=true
+version=0
+slave=system.cpu.cpuInstDataPort[0]
+
+[system.cp_cntrl0.sequencer1]
+type=RubySequencer
+clk_domain=system.clk_domain
+coreid=1
+dcache=system.cp_cntrl0.L1D1cache
+dcache_hit_latency=2
+deadlock_threshold=500000
+eventq_index=0
+icache=system.cp_cntrl0.L1Icache
+icache_hit_latency=2
+is_cpu_sequencer=true
+max_outstanding_requests=16
+no_retry_on_stall=true
+ruby_system=system.ruby
+support_data_reqs=true
+support_inst_reqs=true
+system=system
+using_network_tester=false
+using_ruby_tester=true
+version=1
+slave=system.cpu.cpuInstDataPort[1]
+
+[system.cp_cntrl0.triggerQueue]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.cp_cntrl0.unblockFromCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+master=system.ruby.network.slave[4]
+
+[system.cpu]
+type=RubyTester
+check_flush=false
+checks_to_complete=100
+clk_domain=system.clk_domain
+deadlock_threshold=50000
+eventq_index=0
+num_cpus=12
+system=system
+wakeup_frequency=10
+cpuDataPort=system.tcp_cntrl0.coalescer.slave[0] system.tcp_cntrl1.coalescer.slave[0] system.tcp_cntrl2.coalescer.slave[0] system.tcp_cntrl3.coalescer.slave[0] system.tcp_cntrl4.coalescer.slave[0] system.tcp_cntrl5.coalescer.slave[0] system.tcp_cntrl6.coalescer.slave[0] system.tcp_cntrl7.coalescer.slave[0]
+cpuInstDataPort=system.cp_cntrl0.sequencer.slave[0] system.cp_cntrl0.sequencer1.slave[0]
+cpuInstPort=system.sqc_cntrl0.sequencer.slave[0] system.sqc_cntrl1.sequencer.slave[0]
+
+[system.dir_cntrl0]
+type=Directory_Controller
+children=L3CacheMemory L3triggerQueue directory probeToCore requestFromCores responseFromCores responseFromMemory responseToCore triggerQueue unblockFromCores
+CPUonly=false
+L3CacheMemory=system.dir_cntrl0.L3CacheMemory
+L3triggerQueue=system.dir_cntrl0.L3triggerQueue
+TCC_select_num_bits=0
+buffer_size=0
+clk_domain=system.clk_domain
+cluster_id=0
+directory=system.dir_cntrl0.directory
+eventq_index=0
+l3_hit_latency=15
+noTCCdir=false
+number_of_TBEs=20480
+probeToCore=system.dir_cntrl0.probeToCore
+recycle_latency=10
+requestFromCores=system.dir_cntrl0.requestFromCores
+responseFromCores=system.dir_cntrl0.responseFromCores
+responseFromMemory=system.dir_cntrl0.responseFromMemory
+responseToCore=system.dir_cntrl0.responseToCore
+response_latency=30
+ruby_system=system.ruby
+system=system
+to_memory_controller_latency=1
+transitions_per_cycle=32
+triggerQueue=system.dir_cntrl0.triggerQueue
+unblockFromCores=system.dir_cntrl0.unblockFromCores
+useL3OnWT=false
+version=0
+memory=system.mem_ctrls.port
+
+[system.dir_cntrl0.L3CacheMemory]
+type=RubyCache
+children=replacement_policy
+assoc=8
+block_size=0
+dataAccessLatency=20
+dataArrayBanks=256.0
+eventq_index=0
+is_icache=false
+replacement_policy=system.dir_cntrl0.L3CacheMemory.replacement_policy
+resourceStalls=true
+ruby_system=system.ruby
+size=1024
+start_index_bit=6
+tagAccessLatency=15
+tagArrayBanks=256.0
+
+[system.dir_cntrl0.L3CacheMemory.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=8
+block_size=64
+eventq_index=0
+size=1024
+
+[system.dir_cntrl0.L3triggerQueue]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.dir_cntrl0.directory]
+type=RubyDirectoryMemory
+eventq_index=0
+numa_high_bit=5
+size=536870912
+version=0
+
+[system.dir_cntrl0.probeToCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+master=system.ruby.network.slave[0]
+
+[system.dir_cntrl0.requestFromCores]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[0]
+
+[system.dir_cntrl0.responseFromCores]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+slave=system.ruby.network.master[1]
+
+[system.dir_cntrl0.responseFromMemory]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+
+[system.dir_cntrl0.responseToCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+master=system.ruby.network.slave[1]
+
+[system.dir_cntrl0.triggerQueue]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.dir_cntrl0.unblockFromCores]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+slave=system.ruby.network.master[2]
+
+[system.dvfs_handler]
+type=DVFSHandler
+domains=
+enable=false
+eventq_index=0
+sys_clk_domain=system.clk_domain
+transition_latency=100000
+
+[system.mem_ctrls]
+type=DRAMCtrl
+IDD0=0.075000
+IDD02=0.000000
+IDD2N=0.050000
+IDD2N2=0.000000
+IDD2P0=0.000000
+IDD2P02=0.000000
+IDD2P1=0.000000
+IDD2P12=0.000000
+IDD3N=0.057000
+IDD3N2=0.000000
+IDD3P0=0.000000
+IDD3P02=0.000000
+IDD3P1=0.000000
+IDD3P12=0.000000
+IDD4R=0.187000
+IDD4R2=0.000000
+IDD4W=0.165000
+IDD4W2=0.000000
+IDD5=0.220000
+IDD52=0.000000
+IDD6=0.000000
+IDD62=0.000000
+VDD=1.500000
+VDD2=0.000000
+activation_limit=4
+addr_mapping=RoRaBaCoCh
+bank_groups_per_rank=0
+banks_per_rank=8
+burst_length=8
+channels=1
+clk_domain=system.clk_domain
+conf_table_reported=true
+device_bus_width=8
+device_rowbuffer_size=1024
+device_size=536870912
+devices_per_rank=8
+dll=true
+eventq_index=0
+in_addr_map=true
+max_accesses_per_row=16
+mem_sched_policy=frfcfs
+min_writes_per_switch=16
+null=false
+page_policy=open_adaptive
+range=0:268435455
+ranks_per_channel=2
+read_buffer_size=32
+static_backend_latency=10
+static_frontend_latency=10
+tBURST=5
+tCCD_L=0
+tCK=1
+tCL=14
+tCS=3
+tRAS=35
+tRCD=14
+tREFI=7800
+tRFC=260
+tRP=14
+tRRD=6
+tRRD_L=0
+tRTP=8
+tRTW=3
+tWR=15
+tWTR=8
+tXAW=30
+tXP=0
+tXPDLL=0
+tXS=0
+tXSDLL=0
+write_buffer_size=64
+write_high_thresh_perc=85
+write_low_thresh_perc=50
+port=system.dir_cntrl0.memory
+
+[system.ruby]
+type=RubySystem
+children=clk_domain network
+access_backing_store=false
+all_instructions=false
+block_size_bytes=64
+clk_domain=system.ruby.clk_domain
+eventq_index=0
+hot_lines=false
+memory_size_bits=48
+num_of_sequencers=12
+number_of_virtual_networks=10
+phys_mem=Null
+randomization=true
+
+[system.ruby.clk_domain]
+type=SrcClockDomain
+clock=1
+domain_id=-1
+eventq_index=0
+init_perf_level=0
+voltage_domain=system.voltage_domain
+
+[system.ruby.network]
+type=SimpleNetwork
+children=ext_links00 ext_links01 ext_links02 ext_links03 ext_links04 ext_links05 ext_links06 ext_links07 ext_links08 ext_links09 ext_links10 ext_links11 ext_links12 ext_links13 int_link_buffers00 int_link_buffers01 int_link_buffers02 int_link_buffers03 int_link_buffers04 int_link_buffers05 int_link_buffers06 int_link_buffers07 int_link_buffers08 int_link_buffers09 int_link_buffers10 int_link_buffers11 int_link_buffers12 int_link_buffers13 int_link_buffers14 int_link_buffers15 int_link_buffers16 int_link_buffers17 int_link_buffers18 int_link_buffers19 int_link_buffers20 int_link_buffers21 int_link_buffers22 int_link_buffers23 int_link_buffers24 int_link_buffers25 int_link_buffers26 int_link_buffers27 int_link_buffers28 int_link_buffers29 int_link_buffers30 int_link_buffers31 int_link_buffers32 int_link_buffers33 int_link_buffers34 int_link_buffers35 int_link_buffers36 int_link_buffers37 int_link_buffers38 int_link_buffers39 int_links0 int_links1
+adaptive_routing=false
+buffer_size=0
+clk_domain=system.ruby.clk_domain
+control_msg_size=8
+endpoint_bandwidth=1000
+eventq_index=0
+ext_links=system.ruby.network.ext_links00 system.ruby.network.ext_links01 system.ruby.network.ext_links02 system.ruby.network.ext_links03 system.ruby.network.ext_links04 system.ruby.network.ext_links05 system.ruby.network.ext_links06 system.ruby.network.ext_links07 system.ruby.network.ext_links08 system.ruby.network.ext_links09 system.ruby.network.ext_links10 system.ruby.network.ext_links11 system.ruby.network.ext_links12 system.ruby.network.ext_links13
+int_link_buffers=system.ruby.network.int_link_buffers00 system.ruby.network.int_link_buffers01 system.ruby.network.int_link_buffers02 system.ruby.network.int_link_buffers03 system.ruby.network.int_link_buffers04 system.ruby.network.int_link_buffers05 system.ruby.network.int_link_buffers06 system.ruby.network.int_link_buffers07 system.ruby.network.int_link_buffers08 system.ruby.network.int_link_buffers09 system.ruby.network.int_link_buffers10 system.ruby.network.int_link_buffers11 system.ruby.network.int_link_buffers12 system.ruby.network.int_link_buffers13 system.ruby.network.int_link_buffers14 system.ruby.network.int_link_buffers15 system.ruby.network.int_link_buffers16 system.ruby.network.int_link_buffers17 system.ruby.network.int_link_buffers18 system.ruby.network.int_link_buffers19 system.ruby.network.int_link_buffers20 system.ruby.network.int_link_buffers21 system.ruby.network.int_link_buffers22 system.ruby.network.int_link_buffers23 system.ruby.network.int_link_buffers24 system.ruby.network.int_link_buffers25 system.ruby.network.int_link_buffers26 system.ruby.network.int_link_buffers27 system.ruby.network.int_link_buffers28 system.ruby.network.int_link_buffers29 system.ruby.network.int_link_buffers30 system.ruby.network.int_link_buffers31 system.ruby.network.int_link_buffers32 system.ruby.network.int_link_buffers33 system.ruby.network.int_link_buffers34 system.ruby.network.int_link_buffers35 system.ruby.network.int_link_buffers36 system.ruby.network.int_link_buffers37 system.ruby.network.int_link_buffers38 system.ruby.network.int_link_buffers39
+int_links=system.ruby.network.int_links0 system.ruby.network.int_links1
+netifs=
+number_of_virtual_networks=10
+routers=system.ruby.network.ext_links00.int_node system.ruby.network.ext_links01.int_node system.ruby.network.ext_links02.int_node
+ruby_system=system.ruby
+topology=Crossbar
+master=system.dir_cntrl0.requestFromCores.slave system.dir_cntrl0.responseFromCores.slave system.dir_cntrl0.unblockFromCores.slave system.cp_cntrl0.probeToCore.slave system.cp_cntrl0.responseToCore.slave system.tcp_cntrl0.probeToTCP.slave system.tcp_cntrl0.responseToTCP.slave system.tcp_cntrl1.probeToTCP.slave system.tcp_cntrl1.responseToTCP.slave system.tcp_cntrl2.probeToTCP.slave system.tcp_cntrl2.responseToTCP.slave system.tcp_cntrl3.probeToTCP.slave system.tcp_cntrl3.responseToTCP.slave system.tcp_cntrl4.probeToTCP.slave system.tcp_cntrl4.responseToTCP.slave system.tcp_cntrl5.probeToTCP.slave system.tcp_cntrl5.responseToTCP.slave system.tcp_cntrl6.probeToTCP.slave system.tcp_cntrl6.responseToTCP.slave system.tcp_cntrl7.probeToTCP.slave system.tcp_cntrl7.responseToTCP.slave system.sqc_cntrl0.probeToSQC.slave system.sqc_cntrl0.responseToSQC.slave system.sqc_cntrl1.probeToSQC.slave system.sqc_cntrl1.responseToSQC.slave system.tcc_cntrl0.responseToTCC.slave system.tccdir_cntrl0.requestFromTCP.slave system.tccdir_cntrl0.responseFromTCP.slave system.tccdir_cntrl0.unblockFromTCP.slave system.tccdir_cntrl0.probeFromNB.slave system.tccdir_cntrl0.responseFromNB.slave
+slave=system.dir_cntrl0.probeToCore.master system.dir_cntrl0.responseToCore.master system.cp_cntrl0.requestFromCore.master system.cp_cntrl0.responseFromCore.master system.cp_cntrl0.unblockFromCore.master system.tcp_cntrl0.requestFromTCP.master system.tcp_cntrl0.responseFromTCP.master system.tcp_cntrl0.unblockFromCore.master system.tcp_cntrl1.requestFromTCP.master system.tcp_cntrl1.responseFromTCP.master system.tcp_cntrl1.unblockFromCore.master system.tcp_cntrl2.requestFromTCP.master system.tcp_cntrl2.responseFromTCP.master system.tcp_cntrl2.unblockFromCore.master system.tcp_cntrl3.requestFromTCP.master system.tcp_cntrl3.responseFromTCP.master system.tcp_cntrl3.unblockFromCore.master system.tcp_cntrl4.requestFromTCP.master system.tcp_cntrl4.responseFromTCP.master system.tcp_cntrl4.unblockFromCore.master system.tcp_cntrl5.requestFromTCP.master system.tcp_cntrl5.responseFromTCP.master system.tcp_cntrl5.unblockFromCore.master system.tcp_cntrl6.requestFromTCP.master system.tcp_cntrl6.responseFromTCP.master system.tcp_cntrl6.unblockFromCore.master system.tcp_cntrl7.requestFromTCP.master system.tcp_cntrl7.responseFromTCP.master system.tcp_cntrl7.unblockFromCore.master system.sqc_cntrl0.requestFromSQC.master system.sqc_cntrl0.responseFromSQC.master system.sqc_cntrl0.unblockFromCore.master system.sqc_cntrl1.requestFromSQC.master system.sqc_cntrl1.responseFromSQC.master system.sqc_cntrl1.unblockFromCore.master system.tcc_cntrl0.responseFromTCC.master system.tccdir_cntrl0.probeToCore.master system.tccdir_cntrl0.responseToCore.master system.tccdir_cntrl0.requestToNB.master system.tccdir_cntrl0.responseToNB.master system.tccdir_cntrl0.unblockToNB.master
+
+[system.ruby.network.ext_links00]
+type=SimpleExtLink
+children=int_node
+bandwidth_factor=512
+eventq_index=0
+ext_node=system.dir_cntrl0
+int_node=system.ruby.network.ext_links00.int_node
+latency=1
+link_id=0
+weight=1
+
+[system.ruby.network.ext_links00.int_node]
+type=Switch
+children=port_buffers000 port_buffers001 port_buffers002 port_buffers003 port_buffers004 port_buffers005 port_buffers006 port_buffers007 port_buffers008 port_buffers009 port_buffers010 port_buffers011 port_buffers012 port_buffers013 port_buffers014 port_buffers015 port_buffers016 port_buffers017 port_buffers018 port_buffers019 port_buffers020 port_buffers021 port_buffers022 port_buffers023 port_buffers024 port_buffers025 port_buffers026 port_buffers027 port_buffers028 port_buffers029 port_buffers030 port_buffers031 port_buffers032 port_buffers033 port_buffers034 port_buffers035 port_buffers036 port_buffers037 port_buffers038 port_buffers039 port_buffers040 port_buffers041 port_buffers042 port_buffers043 port_buffers044 port_buffers045 port_buffers046 port_buffers047 port_buffers048 port_buffers049 port_buffers050 port_buffers051 port_buffers052 port_buffers053 port_buffers054 port_buffers055 port_buffers056 port_buffers057 port_buffers058 port_buffers059 port_buffers060 port_buffers061 port_buffers062 port_buffers063 port_buffers064 port_buffers065 port_buffers066 port_buffers067 port_buffers068 port_buffers069 port_buffers070 port_buffers071 port_buffers072 port_buffers073 port_buffers074 port_buffers075 port_buffers076 port_buffers077 port_buffers078 port_buffers079 port_buffers080 port_buffers081 port_buffers082 port_buffers083 port_buffers084 port_buffers085 port_buffers086 port_buffers087 port_buffers088 port_buffers089 port_buffers090 port_buffers091 port_buffers092 port_buffers093 port_buffers094 port_buffers095 port_buffers096 port_buffers097 port_buffers098 port_buffers099 port_buffers100 port_buffers101 port_buffers102 port_buffers103 port_buffers104 port_buffers105 port_buffers106 port_buffers107 port_buffers108 port_buffers109 port_buffers110 port_buffers111 port_buffers112 port_buffers113 port_buffers114 port_buffers115 port_buffers116 port_buffers117 port_buffers118 port_buffers119 port_buffers120 port_buffers121 port_buffers122 port_buffers123 
port_buffers124 port_buffers125 port_buffers126 port_buffers127 port_buffers128 port_buffers129 port_buffers130 port_buffers131 port_buffers132 port_buffers133 port_buffers134 port_buffers135 port_buffers136 port_buffers137 port_buffers138 port_buffers139 port_buffers140 port_buffers141 port_buffers142 port_buffers143 port_buffers144 port_buffers145 port_buffers146 port_buffers147 port_buffers148 port_buffers149 port_buffers150 port_buffers151 port_buffers152 port_buffers153 port_buffers154 port_buffers155 port_buffers156 port_buffers157 port_buffers158 port_buffers159
+clk_domain=system.ruby.clk_domain
+eventq_index=0
+port_buffers=system.ruby.network.ext_links00.int_node.port_buffers000 system.ruby.network.ext_links00.int_node.port_buffers001 system.ruby.network.ext_links00.int_node.port_buffers002 system.ruby.network.ext_links00.int_node.port_buffers003 system.ruby.network.ext_links00.int_node.port_buffers004 system.ruby.network.ext_links00.int_node.port_buffers005 system.ruby.network.ext_links00.int_node.port_buffers006 system.ruby.network.ext_links00.int_node.port_buffers007 system.ruby.network.ext_links00.int_node.port_buffers008 system.ruby.network.ext_links00.int_node.port_buffers009 system.ruby.network.ext_links00.int_node.port_buffers010 system.ruby.network.ext_links00.int_node.port_buffers011 system.ruby.network.ext_links00.int_node.port_buffers012 system.ruby.network.ext_links00.int_node.port_buffers013 system.ruby.network.ext_links00.int_node.port_buffers014 system.ruby.network.ext_links00.int_node.port_buffers015 system.ruby.network.ext_links00.int_node.port_buffers016 system.ruby.network.ext_links00.int_node.port_buffers017 system.ruby.network.ext_links00.int_node.port_buffers018 system.ruby.network.ext_links00.int_node.port_buffers019 system.ruby.network.ext_links00.int_node.port_buffers020 system.ruby.network.ext_links00.int_node.port_buffers021 system.ruby.network.ext_links00.int_node.port_buffers022 system.ruby.network.ext_links00.int_node.port_buffers023 system.ruby.network.ext_links00.int_node.port_buffers024 system.ruby.network.ext_links00.int_node.port_buffers025 system.ruby.network.ext_links00.int_node.port_buffers026 system.ruby.network.ext_links00.int_node.port_buffers027 system.ruby.network.ext_links00.int_node.port_buffers028 system.ruby.network.ext_links00.int_node.port_buffers029 system.ruby.network.ext_links00.int_node.port_buffers030 system.ruby.network.ext_links00.int_node.port_buffers031 system.ruby.network.ext_links00.int_node.port_buffers032 system.ruby.network.ext_links00.int_node.port_buffers033 
system.ruby.network.ext_links00.int_node.port_buffers034 system.ruby.network.ext_links00.int_node.port_buffers035 system.ruby.network.ext_links00.int_node.port_buffers036 system.ruby.network.ext_links00.int_node.port_buffers037 system.ruby.network.ext_links00.int_node.port_buffers038 system.ruby.network.ext_links00.int_node.port_buffers039 system.ruby.network.ext_links00.int_node.port_buffers040 system.ruby.network.ext_links00.int_node.port_buffers041 system.ruby.network.ext_links00.int_node.port_buffers042 system.ruby.network.ext_links00.int_node.port_buffers043 system.ruby.network.ext_links00.int_node.port_buffers044 system.ruby.network.ext_links00.int_node.port_buffers045 system.ruby.network.ext_links00.int_node.port_buffers046 system.ruby.network.ext_links00.int_node.port_buffers047 system.ruby.network.ext_links00.int_node.port_buffers048 system.ruby.network.ext_links00.int_node.port_buffers049 system.ruby.network.ext_links00.int_node.port_buffers050 system.ruby.network.ext_links00.int_node.port_buffers051 system.ruby.network.ext_links00.int_node.port_buffers052 system.ruby.network.ext_links00.int_node.port_buffers053 system.ruby.network.ext_links00.int_node.port_buffers054 system.ruby.network.ext_links00.int_node.port_buffers055 system.ruby.network.ext_links00.int_node.port_buffers056 system.ruby.network.ext_links00.int_node.port_buffers057 system.ruby.network.ext_links00.int_node.port_buffers058 system.ruby.network.ext_links00.int_node.port_buffers059 system.ruby.network.ext_links00.int_node.port_buffers060 system.ruby.network.ext_links00.int_node.port_buffers061 system.ruby.network.ext_links00.int_node.port_buffers062 system.ruby.network.ext_links00.int_node.port_buffers063 system.ruby.network.ext_links00.int_node.port_buffers064 system.ruby.network.ext_links00.int_node.port_buffers065 system.ruby.network.ext_links00.int_node.port_buffers066 system.ruby.network.ext_links00.int_node.port_buffers067 system.ruby.network.ext_links00.int_node.port_buffers068 
system.ruby.network.ext_links00.int_node.port_buffers069 system.ruby.network.ext_links00.int_node.port_buffers070 system.ruby.network.ext_links00.int_node.port_buffers071 system.ruby.network.ext_links00.int_node.port_buffers072 system.ruby.network.ext_links00.int_node.port_buffers073 system.ruby.network.ext_links00.int_node.port_buffers074 system.ruby.network.ext_links00.int_node.port_buffers075 system.ruby.network.ext_links00.int_node.port_buffers076 system.ruby.network.ext_links00.int_node.port_buffers077 system.ruby.network.ext_links00.int_node.port_buffers078 system.ruby.network.ext_links00.int_node.port_buffers079 system.ruby.network.ext_links00.int_node.port_buffers080 system.ruby.network.ext_links00.int_node.port_buffers081 system.ruby.network.ext_links00.int_node.port_buffers082 system.ruby.network.ext_links00.int_node.port_buffers083 system.ruby.network.ext_links00.int_node.port_buffers084 system.ruby.network.ext_links00.int_node.port_buffers085 system.ruby.network.ext_links00.int_node.port_buffers086 system.ruby.network.ext_links00.int_node.port_buffers087 system.ruby.network.ext_links00.int_node.port_buffers088 system.ruby.network.ext_links00.int_node.port_buffers089 system.ruby.network.ext_links00.int_node.port_buffers090 system.ruby.network.ext_links00.int_node.port_buffers091 system.ruby.network.ext_links00.int_node.port_buffers092 system.ruby.network.ext_links00.int_node.port_buffers093 system.ruby.network.ext_links00.int_node.port_buffers094 system.ruby.network.ext_links00.int_node.port_buffers095 system.ruby.network.ext_links00.int_node.port_buffers096 system.ruby.network.ext_links00.int_node.port_buffers097 system.ruby.network.ext_links00.int_node.port_buffers098 system.ruby.network.ext_links00.int_node.port_buffers099 system.ruby.network.ext_links00.int_node.port_buffers100 system.ruby.network.ext_links00.int_node.port_buffers101 system.ruby.network.ext_links00.int_node.port_buffers102 system.ruby.network.ext_links00.int_node.port_buffers103 
system.ruby.network.ext_links00.int_node.port_buffers104 system.ruby.network.ext_links00.int_node.port_buffers105 system.ruby.network.ext_links00.int_node.port_buffers106 system.ruby.network.ext_links00.int_node.port_buffers107 system.ruby.network.ext_links00.int_node.port_buffers108 system.ruby.network.ext_links00.int_node.port_buffers109 system.ruby.network.ext_links00.int_node.port_buffers110 system.ruby.network.ext_links00.int_node.port_buffers111 system.ruby.network.ext_links00.int_node.port_buffers112 system.ruby.network.ext_links00.int_node.port_buffers113 system.ruby.network.ext_links00.int_node.port_buffers114 system.ruby.network.ext_links00.int_node.port_buffers115 system.ruby.network.ext_links00.int_node.port_buffers116 system.ruby.network.ext_links00.int_node.port_buffers117 system.ruby.network.ext_links00.int_node.port_buffers118 system.ruby.network.ext_links00.int_node.port_buffers119 system.ruby.network.ext_links00.int_node.port_buffers120 system.ruby.network.ext_links00.int_node.port_buffers121 system.ruby.network.ext_links00.int_node.port_buffers122 system.ruby.network.ext_links00.int_node.port_buffers123 system.ruby.network.ext_links00.int_node.port_buffers124 system.ruby.network.ext_links00.int_node.port_buffers125 system.ruby.network.ext_links00.int_node.port_buffers126 system.ruby.network.ext_links00.int_node.port_buffers127 system.ruby.network.ext_links00.int_node.port_buffers128 system.ruby.network.ext_links00.int_node.port_buffers129 system.ruby.network.ext_links00.int_node.port_buffers130 system.ruby.network.ext_links00.int_node.port_buffers131 system.ruby.network.ext_links00.int_node.port_buffers132 system.ruby.network.ext_links00.int_node.port_buffers133 system.ruby.network.ext_links00.int_node.port_buffers134 system.ruby.network.ext_links00.int_node.port_buffers135 system.ruby.network.ext_links00.int_node.port_buffers136 system.ruby.network.ext_links00.int_node.port_buffers137 system.ruby.network.ext_links00.int_node.port_buffers138 
system.ruby.network.ext_links00.int_node.port_buffers139 system.ruby.network.ext_links00.int_node.port_buffers140 system.ruby.network.ext_links00.int_node.port_buffers141 system.ruby.network.ext_links00.int_node.port_buffers142 system.ruby.network.ext_links00.int_node.port_buffers143 system.ruby.network.ext_links00.int_node.port_buffers144 system.ruby.network.ext_links00.int_node.port_buffers145 system.ruby.network.ext_links00.int_node.port_buffers146 system.ruby.network.ext_links00.int_node.port_buffers147 system.ruby.network.ext_links00.int_node.port_buffers148 system.ruby.network.ext_links00.int_node.port_buffers149 system.ruby.network.ext_links00.int_node.port_buffers150 system.ruby.network.ext_links00.int_node.port_buffers151 system.ruby.network.ext_links00.int_node.port_buffers152 system.ruby.network.ext_links00.int_node.port_buffers153 system.ruby.network.ext_links00.int_node.port_buffers154 system.ruby.network.ext_links00.int_node.port_buffers155 system.ruby.network.ext_links00.int_node.port_buffers156 system.ruby.network.ext_links00.int_node.port_buffers157 system.ruby.network.ext_links00.int_node.port_buffers158 system.ruby.network.ext_links00.int_node.port_buffers159
+router_id=0
+virt_nets=10
+
+[system.ruby.network.ext_links00.int_node.port_buffers000]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers001]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers002]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers003]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers004]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers005]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers006]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers007]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers008]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers009]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers010]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers011]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers012]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers013]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers014]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers015]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers016]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers017]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers018]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers019]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers020]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers021]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers022]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers023]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers024]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers025]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers026]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers027]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers028]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers029]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers030]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers031]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers032]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers033]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers034]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers035]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers036]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers037]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers038]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers039]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers040]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers041]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers042]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers043]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers044]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers045]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers046]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers047]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers048]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers049]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers050]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers051]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers052]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers053]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers054]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers055]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers056]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers057]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers058]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers059]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers060]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers061]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers062]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers063]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers064]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers065]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers066]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers067]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers068]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers069]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers070]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers071]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers072]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers073]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers074]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers075]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers076]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers077]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers078]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers079]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers080]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers081]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers082]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers083]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers084]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers085]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers086]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers087]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers088]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers089]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers090]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers091]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers092]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers093]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers094]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers095]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers096]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers097]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers098]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers099]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers100]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers101]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers102]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers103]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers104]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers105]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers106]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers107]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers108]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers109]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers110]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers111]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers112]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers113]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers114]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers115]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers116]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers117]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers118]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers119]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers120]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers121]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers122]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers123]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers124]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers125]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers126]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers127]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers128]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers129]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers130]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers131]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers132]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers133]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers134]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers135]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers136]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers137]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers138]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers139]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers140]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers141]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers142]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers143]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers144]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers145]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers146]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers147]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers148]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers149]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers150]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers151]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers152]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers153]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers154]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers155]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers156]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers157]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers158]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links00.int_node.port_buffers159]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01]
+type=SimpleExtLink
+children=int_node
+bandwidth_factor=512
+eventq_index=0
+ext_node=system.cp_cntrl0
+int_node=system.ruby.network.ext_links01.int_node
+latency=1
+link_id=1
+weight=1
+
+[system.ruby.network.ext_links01.int_node]
+type=Switch
+children=port_buffers000 port_buffers001 port_buffers002 port_buffers003 port_buffers004 port_buffers005 port_buffers006 port_buffers007 port_buffers008 port_buffers009 port_buffers010 port_buffers011 port_buffers012 port_buffers013 port_buffers014 port_buffers015 port_buffers016 port_buffers017 port_buffers018 port_buffers019 port_buffers020 port_buffers021 port_buffers022 port_buffers023 port_buffers024 port_buffers025 port_buffers026 port_buffers027 port_buffers028 port_buffers029 port_buffers030 port_buffers031 port_buffers032 port_buffers033 port_buffers034 port_buffers035 port_buffers036 port_buffers037 port_buffers038 port_buffers039 port_buffers040 port_buffers041 port_buffers042 port_buffers043 port_buffers044 port_buffers045 port_buffers046 port_buffers047 port_buffers048 port_buffers049 port_buffers050 port_buffers051 port_buffers052 port_buffers053 port_buffers054 port_buffers055 port_buffers056 port_buffers057 port_buffers058 port_buffers059 port_buffers060 port_buffers061 port_buffers062 port_buffers063 port_buffers064 port_buffers065 port_buffers066 port_buffers067 port_buffers068 port_buffers069 port_buffers070 port_buffers071 port_buffers072 port_buffers073 port_buffers074 port_buffers075 port_buffers076 port_buffers077 port_buffers078 port_buffers079 port_buffers080 port_buffers081 port_buffers082 port_buffers083 port_buffers084 port_buffers085 port_buffers086 port_buffers087 port_buffers088 port_buffers089 port_buffers090 port_buffers091 port_buffers092 port_buffers093 port_buffers094 port_buffers095 port_buffers096 port_buffers097 port_buffers098 port_buffers099 port_buffers100 port_buffers101 port_buffers102 port_buffers103 port_buffers104 port_buffers105 port_buffers106 port_buffers107 port_buffers108 port_buffers109 port_buffers110 port_buffers111 port_buffers112 port_buffers113 port_buffers114 port_buffers115 port_buffers116 port_buffers117 port_buffers118 port_buffers119 port_buffers120 port_buffers121 port_buffers122 port_buffers123 
port_buffers124 port_buffers125 port_buffers126 port_buffers127 port_buffers128 port_buffers129 port_buffers130 port_buffers131 port_buffers132 port_buffers133 port_buffers134 port_buffers135 port_buffers136 port_buffers137 port_buffers138 port_buffers139 port_buffers140 port_buffers141 port_buffers142 port_buffers143 port_buffers144 port_buffers145 port_buffers146 port_buffers147 port_buffers148 port_buffers149
+clk_domain=system.ruby.clk_domain
+eventq_index=0
+port_buffers=system.ruby.network.ext_links01.int_node.port_buffers000 system.ruby.network.ext_links01.int_node.port_buffers001 system.ruby.network.ext_links01.int_node.port_buffers002 system.ruby.network.ext_links01.int_node.port_buffers003 system.ruby.network.ext_links01.int_node.port_buffers004 system.ruby.network.ext_links01.int_node.port_buffers005 system.ruby.network.ext_links01.int_node.port_buffers006 system.ruby.network.ext_links01.int_node.port_buffers007 system.ruby.network.ext_links01.int_node.port_buffers008 system.ruby.network.ext_links01.int_node.port_buffers009 system.ruby.network.ext_links01.int_node.port_buffers010 system.ruby.network.ext_links01.int_node.port_buffers011 system.ruby.network.ext_links01.int_node.port_buffers012 system.ruby.network.ext_links01.int_node.port_buffers013 system.ruby.network.ext_links01.int_node.port_buffers014 system.ruby.network.ext_links01.int_node.port_buffers015 system.ruby.network.ext_links01.int_node.port_buffers016 system.ruby.network.ext_links01.int_node.port_buffers017 system.ruby.network.ext_links01.int_node.port_buffers018 system.ruby.network.ext_links01.int_node.port_buffers019 system.ruby.network.ext_links01.int_node.port_buffers020 system.ruby.network.ext_links01.int_node.port_buffers021 system.ruby.network.ext_links01.int_node.port_buffers022 system.ruby.network.ext_links01.int_node.port_buffers023 system.ruby.network.ext_links01.int_node.port_buffers024 system.ruby.network.ext_links01.int_node.port_buffers025 system.ruby.network.ext_links01.int_node.port_buffers026 system.ruby.network.ext_links01.int_node.port_buffers027 system.ruby.network.ext_links01.int_node.port_buffers028 system.ruby.network.ext_links01.int_node.port_buffers029 system.ruby.network.ext_links01.int_node.port_buffers030 system.ruby.network.ext_links01.int_node.port_buffers031 system.ruby.network.ext_links01.int_node.port_buffers032 system.ruby.network.ext_links01.int_node.port_buffers033 
system.ruby.network.ext_links01.int_node.port_buffers034 system.ruby.network.ext_links01.int_node.port_buffers035 system.ruby.network.ext_links01.int_node.port_buffers036 system.ruby.network.ext_links01.int_node.port_buffers037 system.ruby.network.ext_links01.int_node.port_buffers038 system.ruby.network.ext_links01.int_node.port_buffers039 system.ruby.network.ext_links01.int_node.port_buffers040 system.ruby.network.ext_links01.int_node.port_buffers041 system.ruby.network.ext_links01.int_node.port_buffers042 system.ruby.network.ext_links01.int_node.port_buffers043 system.ruby.network.ext_links01.int_node.port_buffers044 system.ruby.network.ext_links01.int_node.port_buffers045 system.ruby.network.ext_links01.int_node.port_buffers046 system.ruby.network.ext_links01.int_node.port_buffers047 system.ruby.network.ext_links01.int_node.port_buffers048 system.ruby.network.ext_links01.int_node.port_buffers049 system.ruby.network.ext_links01.int_node.port_buffers050 system.ruby.network.ext_links01.int_node.port_buffers051 system.ruby.network.ext_links01.int_node.port_buffers052 system.ruby.network.ext_links01.int_node.port_buffers053 system.ruby.network.ext_links01.int_node.port_buffers054 system.ruby.network.ext_links01.int_node.port_buffers055 system.ruby.network.ext_links01.int_node.port_buffers056 system.ruby.network.ext_links01.int_node.port_buffers057 system.ruby.network.ext_links01.int_node.port_buffers058 system.ruby.network.ext_links01.int_node.port_buffers059 system.ruby.network.ext_links01.int_node.port_buffers060 system.ruby.network.ext_links01.int_node.port_buffers061 system.ruby.network.ext_links01.int_node.port_buffers062 system.ruby.network.ext_links01.int_node.port_buffers063 system.ruby.network.ext_links01.int_node.port_buffers064 system.ruby.network.ext_links01.int_node.port_buffers065 system.ruby.network.ext_links01.int_node.port_buffers066 system.ruby.network.ext_links01.int_node.port_buffers067 system.ruby.network.ext_links01.int_node.port_buffers068 
system.ruby.network.ext_links01.int_node.port_buffers069 system.ruby.network.ext_links01.int_node.port_buffers070 system.ruby.network.ext_links01.int_node.port_buffers071 system.ruby.network.ext_links01.int_node.port_buffers072 system.ruby.network.ext_links01.int_node.port_buffers073 system.ruby.network.ext_links01.int_node.port_buffers074 system.ruby.network.ext_links01.int_node.port_buffers075 system.ruby.network.ext_links01.int_node.port_buffers076 system.ruby.network.ext_links01.int_node.port_buffers077 system.ruby.network.ext_links01.int_node.port_buffers078 system.ruby.network.ext_links01.int_node.port_buffers079 system.ruby.network.ext_links01.int_node.port_buffers080 system.ruby.network.ext_links01.int_node.port_buffers081 system.ruby.network.ext_links01.int_node.port_buffers082 system.ruby.network.ext_links01.int_node.port_buffers083 system.ruby.network.ext_links01.int_node.port_buffers084 system.ruby.network.ext_links01.int_node.port_buffers085 system.ruby.network.ext_links01.int_node.port_buffers086 system.ruby.network.ext_links01.int_node.port_buffers087 system.ruby.network.ext_links01.int_node.port_buffers088 system.ruby.network.ext_links01.int_node.port_buffers089 system.ruby.network.ext_links01.int_node.port_buffers090 system.ruby.network.ext_links01.int_node.port_buffers091 system.ruby.network.ext_links01.int_node.port_buffers092 system.ruby.network.ext_links01.int_node.port_buffers093 system.ruby.network.ext_links01.int_node.port_buffers094 system.ruby.network.ext_links01.int_node.port_buffers095 system.ruby.network.ext_links01.int_node.port_buffers096 system.ruby.network.ext_links01.int_node.port_buffers097 system.ruby.network.ext_links01.int_node.port_buffers098 system.ruby.network.ext_links01.int_node.port_buffers099 system.ruby.network.ext_links01.int_node.port_buffers100 system.ruby.network.ext_links01.int_node.port_buffers101 system.ruby.network.ext_links01.int_node.port_buffers102 system.ruby.network.ext_links01.int_node.port_buffers103 
system.ruby.network.ext_links01.int_node.port_buffers104 system.ruby.network.ext_links01.int_node.port_buffers105 system.ruby.network.ext_links01.int_node.port_buffers106 system.ruby.network.ext_links01.int_node.port_buffers107 system.ruby.network.ext_links01.int_node.port_buffers108 system.ruby.network.ext_links01.int_node.port_buffers109 system.ruby.network.ext_links01.int_node.port_buffers110 system.ruby.network.ext_links01.int_node.port_buffers111 system.ruby.network.ext_links01.int_node.port_buffers112 system.ruby.network.ext_links01.int_node.port_buffers113 system.ruby.network.ext_links01.int_node.port_buffers114 system.ruby.network.ext_links01.int_node.port_buffers115 system.ruby.network.ext_links01.int_node.port_buffers116 system.ruby.network.ext_links01.int_node.port_buffers117 system.ruby.network.ext_links01.int_node.port_buffers118 system.ruby.network.ext_links01.int_node.port_buffers119 system.ruby.network.ext_links01.int_node.port_buffers120 system.ruby.network.ext_links01.int_node.port_buffers121 system.ruby.network.ext_links01.int_node.port_buffers122 system.ruby.network.ext_links01.int_node.port_buffers123 system.ruby.network.ext_links01.int_node.port_buffers124 system.ruby.network.ext_links01.int_node.port_buffers125 system.ruby.network.ext_links01.int_node.port_buffers126 system.ruby.network.ext_links01.int_node.port_buffers127 system.ruby.network.ext_links01.int_node.port_buffers128 system.ruby.network.ext_links01.int_node.port_buffers129 system.ruby.network.ext_links01.int_node.port_buffers130 system.ruby.network.ext_links01.int_node.port_buffers131 system.ruby.network.ext_links01.int_node.port_buffers132 system.ruby.network.ext_links01.int_node.port_buffers133 system.ruby.network.ext_links01.int_node.port_buffers134 system.ruby.network.ext_links01.int_node.port_buffers135 system.ruby.network.ext_links01.int_node.port_buffers136 system.ruby.network.ext_links01.int_node.port_buffers137 system.ruby.network.ext_links01.int_node.port_buffers138 
system.ruby.network.ext_links01.int_node.port_buffers139 system.ruby.network.ext_links01.int_node.port_buffers140 system.ruby.network.ext_links01.int_node.port_buffers141 system.ruby.network.ext_links01.int_node.port_buffers142 system.ruby.network.ext_links01.int_node.port_buffers143 system.ruby.network.ext_links01.int_node.port_buffers144 system.ruby.network.ext_links01.int_node.port_buffers145 system.ruby.network.ext_links01.int_node.port_buffers146 system.ruby.network.ext_links01.int_node.port_buffers147 system.ruby.network.ext_links01.int_node.port_buffers148 system.ruby.network.ext_links01.int_node.port_buffers149
+router_id=1
+virt_nets=10
+
+[system.ruby.network.ext_links01.int_node.port_buffers000]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers001]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers002]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers003]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers004]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers005]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers006]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers007]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers008]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers009]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers010]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers011]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers012]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers013]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers014]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers015]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers016]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers017]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers018]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers019]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers020]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers021]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers022]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers023]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers024]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers025]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers026]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers027]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers028]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers029]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers030]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers031]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers032]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers033]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers034]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers035]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers036]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers037]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers038]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers039]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers040]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers041]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers042]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers043]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers044]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers045]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers046]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers047]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers048]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers049]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers050]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers051]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers052]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers053]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers054]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers055]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers056]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers057]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers058]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers059]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers060]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers061]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers062]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers063]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers064]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers065]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers066]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers067]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers068]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers069]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers070]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers071]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers072]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers073]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers074]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers075]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers076]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers077]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers078]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers079]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers080]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers081]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers082]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers083]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers084]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers085]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers086]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers087]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers088]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers089]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers090]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers091]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers092]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers093]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers094]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers095]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers096]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers097]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers098]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers099]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers100]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers101]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers102]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers103]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers104]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers105]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers106]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers107]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers108]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers109]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers110]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers111]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers112]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers113]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers114]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers115]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers116]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers117]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers118]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers119]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers120]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers121]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers122]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers123]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers124]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers125]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers126]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers127]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers128]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers129]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers130]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers131]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers132]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers133]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers134]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers135]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers136]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers137]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers138]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers139]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers140]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers141]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers142]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers143]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers144]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers145]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers146]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers147]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers148]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links01.int_node.port_buffers149]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02]
+type=SimpleExtLink
+children=int_node
+bandwidth_factor=512
+eventq_index=0
+ext_node=system.tcp_cntrl0
+int_node=system.ruby.network.ext_links02.int_node
+latency=1
+link_id=2
+weight=1
+
+[system.ruby.network.ext_links02.int_node]
+type=Switch
+children=port_buffers000 port_buffers001 port_buffers002 port_buffers003 port_buffers004 port_buffers005 port_buffers006 port_buffers007 port_buffers008 port_buffers009 port_buffers010 port_buffers011 port_buffers012 port_buffers013 port_buffers014 port_buffers015 port_buffers016 port_buffers017 port_buffers018 port_buffers019 port_buffers020 port_buffers021 port_buffers022 port_buffers023 port_buffers024 port_buffers025 port_buffers026 port_buffers027 port_buffers028 port_buffers029 port_buffers030 port_buffers031 port_buffers032 port_buffers033 port_buffers034 port_buffers035 port_buffers036 port_buffers037 port_buffers038 port_buffers039 port_buffers040 port_buffers041 port_buffers042 port_buffers043 port_buffers044 port_buffers045 port_buffers046 port_buffers047 port_buffers048 port_buffers049 port_buffers050 port_buffers051 port_buffers052 port_buffers053 port_buffers054 port_buffers055 port_buffers056 port_buffers057 port_buffers058 port_buffers059 port_buffers060 port_buffers061 port_buffers062 port_buffers063 port_buffers064 port_buffers065 port_buffers066 port_buffers067 port_buffers068 port_buffers069 port_buffers070 port_buffers071 port_buffers072 port_buffers073 port_buffers074 port_buffers075 port_buffers076 port_buffers077 port_buffers078 port_buffers079 port_buffers080 port_buffers081 port_buffers082 port_buffers083 port_buffers084 port_buffers085 port_buffers086 port_buffers087 port_buffers088 port_buffers089 port_buffers090 port_buffers091 port_buffers092 port_buffers093 port_buffers094 port_buffers095 port_buffers096 port_buffers097 port_buffers098 port_buffers099 port_buffers100 port_buffers101 port_buffers102 port_buffers103 port_buffers104 port_buffers105 port_buffers106 port_buffers107 port_buffers108 port_buffers109 port_buffers110 port_buffers111 port_buffers112 port_buffers113 port_buffers114 port_buffers115 port_buffers116 port_buffers117 port_buffers118 port_buffers119 port_buffers120 port_buffers121 port_buffers122 port_buffers123 
port_buffers124 port_buffers125 port_buffers126 port_buffers127 port_buffers128 port_buffers129 port_buffers130 port_buffers131 port_buffers132 port_buffers133 port_buffers134 port_buffers135 port_buffers136 port_buffers137 port_buffers138 port_buffers139 port_buffers140 port_buffers141 port_buffers142 port_buffers143 port_buffers144 port_buffers145 port_buffers146 port_buffers147 port_buffers148 port_buffers149
+clk_domain=system.ruby.clk_domain
+eventq_index=0
+port_buffers=system.ruby.network.ext_links02.int_node.port_buffers000 system.ruby.network.ext_links02.int_node.port_buffers001 system.ruby.network.ext_links02.int_node.port_buffers002 system.ruby.network.ext_links02.int_node.port_buffers003 system.ruby.network.ext_links02.int_node.port_buffers004 system.ruby.network.ext_links02.int_node.port_buffers005 system.ruby.network.ext_links02.int_node.port_buffers006 system.ruby.network.ext_links02.int_node.port_buffers007 system.ruby.network.ext_links02.int_node.port_buffers008 system.ruby.network.ext_links02.int_node.port_buffers009 system.ruby.network.ext_links02.int_node.port_buffers010 system.ruby.network.ext_links02.int_node.port_buffers011 system.ruby.network.ext_links02.int_node.port_buffers012 system.ruby.network.ext_links02.int_node.port_buffers013 system.ruby.network.ext_links02.int_node.port_buffers014 system.ruby.network.ext_links02.int_node.port_buffers015 system.ruby.network.ext_links02.int_node.port_buffers016 system.ruby.network.ext_links02.int_node.port_buffers017 system.ruby.network.ext_links02.int_node.port_buffers018 system.ruby.network.ext_links02.int_node.port_buffers019 system.ruby.network.ext_links02.int_node.port_buffers020 system.ruby.network.ext_links02.int_node.port_buffers021 system.ruby.network.ext_links02.int_node.port_buffers022 system.ruby.network.ext_links02.int_node.port_buffers023 system.ruby.network.ext_links02.int_node.port_buffers024 system.ruby.network.ext_links02.int_node.port_buffers025 system.ruby.network.ext_links02.int_node.port_buffers026 system.ruby.network.ext_links02.int_node.port_buffers027 system.ruby.network.ext_links02.int_node.port_buffers028 system.ruby.network.ext_links02.int_node.port_buffers029 system.ruby.network.ext_links02.int_node.port_buffers030 system.ruby.network.ext_links02.int_node.port_buffers031 system.ruby.network.ext_links02.int_node.port_buffers032 system.ruby.network.ext_links02.int_node.port_buffers033 
system.ruby.network.ext_links02.int_node.port_buffers034 system.ruby.network.ext_links02.int_node.port_buffers035 system.ruby.network.ext_links02.int_node.port_buffers036 system.ruby.network.ext_links02.int_node.port_buffers037 system.ruby.network.ext_links02.int_node.port_buffers038 system.ruby.network.ext_links02.int_node.port_buffers039 system.ruby.network.ext_links02.int_node.port_buffers040 system.ruby.network.ext_links02.int_node.port_buffers041 system.ruby.network.ext_links02.int_node.port_buffers042 system.ruby.network.ext_links02.int_node.port_buffers043 system.ruby.network.ext_links02.int_node.port_buffers044 system.ruby.network.ext_links02.int_node.port_buffers045 system.ruby.network.ext_links02.int_node.port_buffers046 system.ruby.network.ext_links02.int_node.port_buffers047 system.ruby.network.ext_links02.int_node.port_buffers048 system.ruby.network.ext_links02.int_node.port_buffers049 system.ruby.network.ext_links02.int_node.port_buffers050 system.ruby.network.ext_links02.int_node.port_buffers051 system.ruby.network.ext_links02.int_node.port_buffers052 system.ruby.network.ext_links02.int_node.port_buffers053 system.ruby.network.ext_links02.int_node.port_buffers054 system.ruby.network.ext_links02.int_node.port_buffers055 system.ruby.network.ext_links02.int_node.port_buffers056 system.ruby.network.ext_links02.int_node.port_buffers057 system.ruby.network.ext_links02.int_node.port_buffers058 system.ruby.network.ext_links02.int_node.port_buffers059 system.ruby.network.ext_links02.int_node.port_buffers060 system.ruby.network.ext_links02.int_node.port_buffers061 system.ruby.network.ext_links02.int_node.port_buffers062 system.ruby.network.ext_links02.int_node.port_buffers063 system.ruby.network.ext_links02.int_node.port_buffers064 system.ruby.network.ext_links02.int_node.port_buffers065 system.ruby.network.ext_links02.int_node.port_buffers066 system.ruby.network.ext_links02.int_node.port_buffers067 system.ruby.network.ext_links02.int_node.port_buffers068 
system.ruby.network.ext_links02.int_node.port_buffers069 system.ruby.network.ext_links02.int_node.port_buffers070 system.ruby.network.ext_links02.int_node.port_buffers071 system.ruby.network.ext_links02.int_node.port_buffers072 system.ruby.network.ext_links02.int_node.port_buffers073 system.ruby.network.ext_links02.int_node.port_buffers074 system.ruby.network.ext_links02.int_node.port_buffers075 system.ruby.network.ext_links02.int_node.port_buffers076 system.ruby.network.ext_links02.int_node.port_buffers077 system.ruby.network.ext_links02.int_node.port_buffers078 system.ruby.network.ext_links02.int_node.port_buffers079 system.ruby.network.ext_links02.int_node.port_buffers080 system.ruby.network.ext_links02.int_node.port_buffers081 system.ruby.network.ext_links02.int_node.port_buffers082 system.ruby.network.ext_links02.int_node.port_buffers083 system.ruby.network.ext_links02.int_node.port_buffers084 system.ruby.network.ext_links02.int_node.port_buffers085 system.ruby.network.ext_links02.int_node.port_buffers086 system.ruby.network.ext_links02.int_node.port_buffers087 system.ruby.network.ext_links02.int_node.port_buffers088 system.ruby.network.ext_links02.int_node.port_buffers089 system.ruby.network.ext_links02.int_node.port_buffers090 system.ruby.network.ext_links02.int_node.port_buffers091 system.ruby.network.ext_links02.int_node.port_buffers092 system.ruby.network.ext_links02.int_node.port_buffers093 system.ruby.network.ext_links02.int_node.port_buffers094 system.ruby.network.ext_links02.int_node.port_buffers095 system.ruby.network.ext_links02.int_node.port_buffers096 system.ruby.network.ext_links02.int_node.port_buffers097 system.ruby.network.ext_links02.int_node.port_buffers098 system.ruby.network.ext_links02.int_node.port_buffers099 system.ruby.network.ext_links02.int_node.port_buffers100 system.ruby.network.ext_links02.int_node.port_buffers101 system.ruby.network.ext_links02.int_node.port_buffers102 system.ruby.network.ext_links02.int_node.port_buffers103 
system.ruby.network.ext_links02.int_node.port_buffers104 system.ruby.network.ext_links02.int_node.port_buffers105 system.ruby.network.ext_links02.int_node.port_buffers106 system.ruby.network.ext_links02.int_node.port_buffers107 system.ruby.network.ext_links02.int_node.port_buffers108 system.ruby.network.ext_links02.int_node.port_buffers109 system.ruby.network.ext_links02.int_node.port_buffers110 system.ruby.network.ext_links02.int_node.port_buffers111 system.ruby.network.ext_links02.int_node.port_buffers112 system.ruby.network.ext_links02.int_node.port_buffers113 system.ruby.network.ext_links02.int_node.port_buffers114 system.ruby.network.ext_links02.int_node.port_buffers115 system.ruby.network.ext_links02.int_node.port_buffers116 system.ruby.network.ext_links02.int_node.port_buffers117 system.ruby.network.ext_links02.int_node.port_buffers118 system.ruby.network.ext_links02.int_node.port_buffers119 system.ruby.network.ext_links02.int_node.port_buffers120 system.ruby.network.ext_links02.int_node.port_buffers121 system.ruby.network.ext_links02.int_node.port_buffers122 system.ruby.network.ext_links02.int_node.port_buffers123 system.ruby.network.ext_links02.int_node.port_buffers124 system.ruby.network.ext_links02.int_node.port_buffers125 system.ruby.network.ext_links02.int_node.port_buffers126 system.ruby.network.ext_links02.int_node.port_buffers127 system.ruby.network.ext_links02.int_node.port_buffers128 system.ruby.network.ext_links02.int_node.port_buffers129 system.ruby.network.ext_links02.int_node.port_buffers130 system.ruby.network.ext_links02.int_node.port_buffers131 system.ruby.network.ext_links02.int_node.port_buffers132 system.ruby.network.ext_links02.int_node.port_buffers133 system.ruby.network.ext_links02.int_node.port_buffers134 system.ruby.network.ext_links02.int_node.port_buffers135 system.ruby.network.ext_links02.int_node.port_buffers136 system.ruby.network.ext_links02.int_node.port_buffers137 system.ruby.network.ext_links02.int_node.port_buffers138 
system.ruby.network.ext_links02.int_node.port_buffers139 system.ruby.network.ext_links02.int_node.port_buffers140 system.ruby.network.ext_links02.int_node.port_buffers141 system.ruby.network.ext_links02.int_node.port_buffers142 system.ruby.network.ext_links02.int_node.port_buffers143 system.ruby.network.ext_links02.int_node.port_buffers144 system.ruby.network.ext_links02.int_node.port_buffers145 system.ruby.network.ext_links02.int_node.port_buffers146 system.ruby.network.ext_links02.int_node.port_buffers147 system.ruby.network.ext_links02.int_node.port_buffers148 system.ruby.network.ext_links02.int_node.port_buffers149
+router_id=2
+virt_nets=10
+
+[system.ruby.network.ext_links02.int_node.port_buffers000]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers001]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers002]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers003]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers004]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers005]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers006]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers007]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers008]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers009]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers010]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers011]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers012]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers013]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers014]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers015]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers016]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers017]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers018]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers019]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers020]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers021]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers022]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers023]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers024]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers025]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers026]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers027]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers028]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers029]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers030]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers031]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers032]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers033]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers034]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers035]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers036]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers037]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers038]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers039]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers040]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers041]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers042]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers043]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers044]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers045]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers046]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers047]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers048]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers049]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers050]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers051]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers052]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers053]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers054]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers055]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers056]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers057]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers058]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers059]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers060]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers061]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers062]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers063]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers064]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers065]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers066]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers067]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers068]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers069]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers070]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers071]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers072]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers073]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers074]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers075]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers076]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers077]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers078]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers079]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers080]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers081]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers082]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers083]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers084]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers085]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers086]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers087]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers088]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers089]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers090]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers091]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers092]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers093]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers094]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers095]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers096]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers097]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers098]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers099]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers100]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers101]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers102]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers103]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers104]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers105]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers106]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers107]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers108]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers109]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers110]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers111]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers112]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers113]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers114]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers115]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers116]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers117]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers118]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers119]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers120]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers121]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers122]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers123]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers124]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers125]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers126]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers127]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers128]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers129]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers130]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers131]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers132]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers133]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers134]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers135]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers136]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers137]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers138]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers139]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers140]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers141]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers142]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers143]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers144]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers145]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers146]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers147]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers148]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links02.int_node.port_buffers149]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.ext_links03]
+type=SimpleExtLink
+bandwidth_factor=512
+eventq_index=0
+ext_node=system.tcp_cntrl1
+int_node=system.ruby.network.ext_links02.int_node
+latency=1
+link_id=3
+weight=1
+
+[system.ruby.network.ext_links04]
+type=SimpleExtLink
+bandwidth_factor=512
+eventq_index=0
+ext_node=system.tcp_cntrl2
+int_node=system.ruby.network.ext_links02.int_node
+latency=1
+link_id=4
+weight=1
+
+[system.ruby.network.ext_links05]
+type=SimpleExtLink
+bandwidth_factor=512
+eventq_index=0
+ext_node=system.tcp_cntrl3
+int_node=system.ruby.network.ext_links02.int_node
+latency=1
+link_id=5
+weight=1
+
+[system.ruby.network.ext_links06]
+type=SimpleExtLink
+bandwidth_factor=512
+eventq_index=0
+ext_node=system.tcp_cntrl4
+int_node=system.ruby.network.ext_links02.int_node
+latency=1
+link_id=6
+weight=1
+
+[system.ruby.network.ext_links07]
+type=SimpleExtLink
+bandwidth_factor=512
+eventq_index=0
+ext_node=system.tcp_cntrl5
+int_node=system.ruby.network.ext_links02.int_node
+latency=1
+link_id=7
+weight=1
+
+[system.ruby.network.ext_links08]
+type=SimpleExtLink
+bandwidth_factor=512
+eventq_index=0
+ext_node=system.tcp_cntrl6
+int_node=system.ruby.network.ext_links02.int_node
+latency=1
+link_id=8
+weight=1
+
+[system.ruby.network.ext_links09]
+type=SimpleExtLink
+bandwidth_factor=512
+eventq_index=0
+ext_node=system.tcp_cntrl7
+int_node=system.ruby.network.ext_links02.int_node
+latency=1
+link_id=9
+weight=1
+
+[system.ruby.network.ext_links10]
+type=SimpleExtLink
+bandwidth_factor=512
+eventq_index=0
+ext_node=system.sqc_cntrl0
+int_node=system.ruby.network.ext_links02.int_node
+latency=1
+link_id=10
+weight=1
+
+[system.ruby.network.ext_links11]
+type=SimpleExtLink
+bandwidth_factor=512
+eventq_index=0
+ext_node=system.sqc_cntrl1
+int_node=system.ruby.network.ext_links02.int_node
+latency=1
+link_id=11
+weight=1
+
+[system.ruby.network.ext_links12]
+type=SimpleExtLink
+bandwidth_factor=512
+eventq_index=0
+ext_node=system.tcc_cntrl0
+int_node=system.ruby.network.ext_links02.int_node
+latency=1
+link_id=12
+weight=1
+
+[system.ruby.network.ext_links13]
+type=SimpleExtLink
+bandwidth_factor=512
+eventq_index=0
+ext_node=system.tccdir_cntrl0
+int_node=system.ruby.network.ext_links02.int_node
+latency=1
+link_id=13
+weight=1
+
+[system.ruby.network.int_link_buffers00]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers01]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers02]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers03]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers04]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers05]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers06]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers07]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers08]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers09]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers10]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers11]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers12]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers13]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers14]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers15]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers16]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers17]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers18]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers19]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers20]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers21]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers22]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers23]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers24]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers25]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers26]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers27]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers28]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers29]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers30]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers31]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers32]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers33]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers34]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers35]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers36]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers37]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers38]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_link_buffers39]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.ruby.network.int_links0]
+type=SimpleIntLink
+bandwidth_factor=512
+eventq_index=0
+latency=1
+link_id=0
+node_a=system.ruby.network.ext_links00.int_node
+node_b=system.ruby.network.ext_links01.int_node
+weight=1
+
+[system.ruby.network.int_links1]
+type=SimpleIntLink
+bandwidth_factor=512
+eventq_index=0
+latency=1
+link_id=1
+node_a=system.ruby.network.ext_links00.int_node
+node_b=system.ruby.network.ext_links02.int_node
+weight=1
+
+[system.sqc_cntrl0]
+type=SQC_Controller
+children=L1cache mandatoryQueue probeToSQC requestFromSQC responseFromSQC responseToSQC sequencer unblockFromCore
+L1cache=system.sqc_cntrl0.L1cache
+TCC_select_num_bits=0
+buffer_size=0
+clk_domain=system.clk_domain
+cluster_id=0
+eventq_index=0
+issue_latency=80
+l2_hit_latency=18
+mandatoryQueue=system.sqc_cntrl0.mandatoryQueue
+number_of_TBEs=256
+probeToSQC=system.sqc_cntrl0.probeToSQC
+recycle_latency=10
+requestFromSQC=system.sqc_cntrl0.requestFromSQC
+responseFromSQC=system.sqc_cntrl0.responseFromSQC
+responseToSQC=system.sqc_cntrl0.responseToSQC
+ruby_system=system.ruby
+sequencer=system.sqc_cntrl0.sequencer
+system=system
+transitions_per_cycle=32
+unblockFromCore=system.sqc_cntrl0.unblockFromCore
+version=0
+
+[system.sqc_cntrl0.L1cache]
+type=RubyCache
+children=replacement_policy
+assoc=8
+block_size=0
+dataAccessLatency=4
+dataArrayBanks=16
+eventq_index=0
+is_icache=false
+replacement_policy=system.sqc_cntrl0.L1cache.replacement_policy
+resourceStalls=true
+ruby_system=system.ruby
+size=32768
+start_index_bit=6
+tagAccessLatency=1
+tagArrayBanks=4
+
+[system.sqc_cntrl0.L1cache.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=8
+block_size=64
+eventq_index=0
+size=32768
+
+[system.sqc_cntrl0.mandatoryQueue]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+
+[system.sqc_cntrl0.probeToSQC]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[21]
+
+[system.sqc_cntrl0.requestFromSQC]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[29]
+
+[system.sqc_cntrl0.responseFromSQC]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[30]
+
+[system.sqc_cntrl0.responseToSQC]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[22]
+
+[system.sqc_cntrl0.sequencer]
+type=RubySequencer
+clk_domain=system.clk_domain
+coreid=99
+dcache=system.sqc_cntrl0.L1cache
+dcache_hit_latency=1
+deadlock_threshold=500000
+eventq_index=0
+icache=system.sqc_cntrl0.L1cache
+icache_hit_latency=1
+is_cpu_sequencer=false
+max_outstanding_requests=16
+no_retry_on_stall=true
+ruby_system=system.ruby
+support_data_reqs=false
+support_inst_reqs=true
+system=system
+using_network_tester=false
+using_ruby_tester=true
+version=18
+slave=system.cpu.cpuInstPort[0]
+
+[system.sqc_cntrl0.unblockFromCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[31]
+
+[system.sqc_cntrl1]
+type=SQC_Controller
+children=L1cache mandatoryQueue probeToSQC requestFromSQC responseFromSQC responseToSQC sequencer unblockFromCore
+L1cache=system.sqc_cntrl1.L1cache
+TCC_select_num_bits=0
+buffer_size=0
+clk_domain=system.clk_domain
+cluster_id=0
+eventq_index=0
+issue_latency=80
+l2_hit_latency=18
+mandatoryQueue=system.sqc_cntrl1.mandatoryQueue
+number_of_TBEs=256
+probeToSQC=system.sqc_cntrl1.probeToSQC
+recycle_latency=10
+requestFromSQC=system.sqc_cntrl1.requestFromSQC
+responseFromSQC=system.sqc_cntrl1.responseFromSQC
+responseToSQC=system.sqc_cntrl1.responseToSQC
+ruby_system=system.ruby
+sequencer=system.sqc_cntrl1.sequencer
+system=system
+transitions_per_cycle=32
+unblockFromCore=system.sqc_cntrl1.unblockFromCore
+version=1
+
+[system.sqc_cntrl1.L1cache]
+type=RubyCache
+children=replacement_policy
+assoc=8
+block_size=0
+dataAccessLatency=4
+dataArrayBanks=16
+eventq_index=0
+is_icache=false
+replacement_policy=system.sqc_cntrl1.L1cache.replacement_policy
+resourceStalls=true
+ruby_system=system.ruby
+size=32768
+start_index_bit=6
+tagAccessLatency=1
+tagArrayBanks=4
+
+[system.sqc_cntrl1.L1cache.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=8
+block_size=64
+eventq_index=0
+size=32768
+
+[system.sqc_cntrl1.mandatoryQueue]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+
+[system.sqc_cntrl1.probeToSQC]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[23]
+
+[system.sqc_cntrl1.requestFromSQC]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[32]
+
+[system.sqc_cntrl1.responseFromSQC]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[33]
+
+[system.sqc_cntrl1.responseToSQC]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[24]
+
+[system.sqc_cntrl1.sequencer]
+type=RubySequencer
+clk_domain=system.clk_domain
+coreid=99
+dcache=system.sqc_cntrl1.L1cache
+dcache_hit_latency=1
+deadlock_threshold=500000
+eventq_index=0
+icache=system.sqc_cntrl1.L1cache
+icache_hit_latency=1
+is_cpu_sequencer=false
+max_outstanding_requests=16
+no_retry_on_stall=true
+ruby_system=system.ruby
+support_data_reqs=false
+support_inst_reqs=true
+system=system
+using_network_tester=false
+using_ruby_tester=true
+version=19
+slave=system.cpu.cpuInstPort[1]
+
+[system.sqc_cntrl1.unblockFromCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[34]
+
+[system.sys_port_proxy]
+type=RubyPortProxy
+clk_domain=system.clk_domain
+eventq_index=0
+is_cpu_sequencer=true
+no_retry_on_stall=false
+ruby_system=system.ruby
+support_data_reqs=true
+support_inst_reqs=true
+system=system
+using_ruby_tester=false
+version=0
+slave=system.system_port
+
+[system.tcc_cntrl0]
+type=TCC_Controller
+children=L2cache responseFromTCC responseToTCC w_TCCUnblockToTCCDir w_probeToTCC w_reqToTCC w_reqToTCCDir w_respToTCC w_respToTCCDir
+L2cache=system.tcc_cntrl0.L2cache
+TCC_select_num_bits=0
+buffer_size=0
+clk_domain=system.clk_domain
+cluster_id=0
+eventq_index=0
+l2_request_latency=1
+l2_response_latency=16
+number_of_TBEs=2048
+recycle_latency=10
+responseFromTCC=system.tcc_cntrl0.responseFromTCC
+responseToTCC=system.tcc_cntrl0.responseToTCC
+ruby_system=system.ruby
+system=system
+transitions_per_cycle=32
+version=0
+w_TCCUnblockToTCCDir=system.tcc_cntrl0.w_TCCUnblockToTCCDir
+w_probeToTCC=system.tcc_cntrl0.w_probeToTCC
+w_reqToTCC=system.tcc_cntrl0.w_reqToTCC
+w_reqToTCCDir=system.tcc_cntrl0.w_reqToTCCDir
+w_respToTCC=system.tcc_cntrl0.w_respToTCC
+w_respToTCCDir=system.tcc_cntrl0.w_respToTCCDir
+
+[system.tcc_cntrl0.L2cache]
+type=RubyCache
+children=replacement_policy
+assoc=16
+block_size=0
+dataAccessLatency=8
+dataArrayBanks=256
+eventq_index=0
+is_icache=false
+replacement_policy=system.tcc_cntrl0.L2cache.replacement_policy
+resourceStalls=true
+ruby_system=system.ruby
+size=262144.0
+start_index_bit=6
+tagAccessLatency=2
+tagArrayBanks=256
+
+[system.tcc_cntrl0.L2cache.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=16
+block_size=64
+eventq_index=0
+size=262144.0
+
+[system.tcc_cntrl0.responseFromTCC]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[35]
+
+[system.tcc_cntrl0.responseToTCC]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[25]
+
+[system.tcc_cntrl0.w_TCCUnblockToTCCDir]
+type=RubyWireBuffer
+eventq_index=0
+ruby_system=system.ruby
+
+[system.tcc_cntrl0.w_probeToTCC]
+type=RubyWireBuffer
+eventq_index=0
+ruby_system=system.ruby
+
+[system.tcc_cntrl0.w_reqToTCC]
+type=RubyWireBuffer
+eventq_index=0
+ruby_system=system.ruby
+
+[system.tcc_cntrl0.w_reqToTCCDir]
+type=RubyWireBuffer
+eventq_index=0
+ruby_system=system.ruby
+
+[system.tcc_cntrl0.w_respToTCC]
+type=RubyWireBuffer
+eventq_index=0
+ruby_system=system.ruby
+
+[system.tcc_cntrl0.w_respToTCCDir]
+type=RubyWireBuffer
+eventq_index=0
+ruby_system=system.ruby
+
+[system.tccdir_cntrl0]
+type=TCCdir_Controller
+children=directory probeFromNB probeToCore requestFromTCP requestToNB responseFromNB responseFromTCP responseToCore responseToNB triggerQueue unblockFromTCP unblockToNB
+TCC_select_num_bits=0
+buffer_size=0
+clk_domain=system.clk_domain
+cluster_id=0
+directory=system.tccdir_cntrl0.directory
+directory_latency=6
+eventq_index=0
+issue_latency=120
+number_of_TBEs=1024
+probeFromNB=system.tccdir_cntrl0.probeFromNB
+probeToCore=system.tccdir_cntrl0.probeToCore
+recycle_latency=10
+requestFromTCP=system.tccdir_cntrl0.requestFromTCP
+requestToNB=system.tccdir_cntrl0.requestToNB
+responseFromNB=system.tccdir_cntrl0.responseFromNB
+responseFromTCP=system.tccdir_cntrl0.responseFromTCP
+responseToCore=system.tccdir_cntrl0.responseToCore
+responseToNB=system.tccdir_cntrl0.responseToNB
+response_latency=5
+ruby_system=system.ruby
+system=system
+transitions_per_cycle=32
+triggerQueue=system.tccdir_cntrl0.triggerQueue
+unblockFromTCP=system.tccdir_cntrl0.unblockFromTCP
+unblockToNB=system.tccdir_cntrl0.unblockToNB
+version=0
+w_TCCUnblockToTCCDir=system.tcc_cntrl0.w_TCCUnblockToTCCDir
+w_probeToTCC=system.tcc_cntrl0.w_probeToTCC
+w_reqToTCC=system.tcc_cntrl0.w_reqToTCC
+w_reqToTCCDir=system.tcc_cntrl0.w_reqToTCCDir
+w_respToTCC=system.tcc_cntrl0.w_respToTCC
+w_respToTCCDir=system.tcc_cntrl0.w_respToTCCDir
+
+[system.tccdir_cntrl0.directory]
+type=RubyCache
+children=replacement_policy
+assoc=16
+block_size=0
+dataAccessLatency=1
+dataArrayBanks=1
+eventq_index=0
+is_icache=false
+replacement_policy=system.tccdir_cntrl0.directory.replacement_policy
+resourceStalls=false
+ruby_system=system.ruby
+size=786432
+start_index_bit=6
+tagAccessLatency=1
+tagArrayBanks=1
+
+[system.tccdir_cntrl0.directory.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=16
+block_size=64
+eventq_index=0
+size=786432
+
+[system.tccdir_cntrl0.probeFromNB]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+slave=system.ruby.network.master[29]
+
+[system.tccdir_cntrl0.probeToCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[36]
+
+[system.tccdir_cntrl0.requestFromTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[26]
+
+[system.tccdir_cntrl0.requestToNB]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+master=system.ruby.network.slave[38]
+
+[system.tccdir_cntrl0.responseFromNB]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+slave=system.ruby.network.master[30]
+
+[system.tccdir_cntrl0.responseFromTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[27]
+
+[system.tccdir_cntrl0.responseToCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[37]
+
+[system.tccdir_cntrl0.responseToNB]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+master=system.ruby.network.slave[39]
+
+[system.tccdir_cntrl0.triggerQueue]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+
+[system.tccdir_cntrl0.unblockFromTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[28]
+
+[system.tccdir_cntrl0.unblockToNB]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+master=system.ruby.network.slave[40]
+
+[system.tcp_cntrl0]
+type=TCP_Controller
+children=L1cache coalescer mandatoryQueue probeToTCP requestFromTCP responseFromTCP responseToTCP sequencer unblockFromCore
+L1cache=system.tcp_cntrl0.L1cache
+TCC_select_num_bits=0
+buffer_size=0
+clk_domain=system.clk_domain
+cluster_id=0
+coalescer=system.tcp_cntrl0.coalescer
+eventq_index=0
+issue_latency=40
+l2_hit_latency=18
+mandatoryQueue=system.tcp_cntrl0.mandatoryQueue
+number_of_TBEs=2560
+probeToTCP=system.tcp_cntrl0.probeToTCP
+recycle_latency=10
+requestFromTCP=system.tcp_cntrl0.requestFromTCP
+responseFromTCP=system.tcp_cntrl0.responseFromTCP
+responseToTCP=system.tcp_cntrl0.responseToTCP
+ruby_system=system.ruby
+sequencer=system.tcp_cntrl0.sequencer
+system=system
+transitions_per_cycle=32
+unblockFromCore=system.tcp_cntrl0.unblockFromCore
+use_seq_not_coal=false
+version=0
+
+[system.tcp_cntrl0.L1cache]
+type=RubyCache
+children=replacement_policy
+assoc=8
+block_size=0
+dataAccessLatency=4
+dataArrayBanks=16
+eventq_index=0
+is_icache=false
+replacement_policy=system.tcp_cntrl0.L1cache.replacement_policy
+resourceStalls=true
+ruby_system=system.ruby
+size=16384
+start_index_bit=6
+tagAccessLatency=4
+tagArrayBanks=4
+
+[system.tcp_cntrl0.L1cache.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=8
+block_size=64
+eventq_index=0
+size=16384
+
+[system.tcp_cntrl0.coalescer]
+type=RubyGPUCoalescer
+assume_rfo=true
+clk_domain=system.clk_domain
+coreid=99
+dcache=system.tcp_cntrl0.L1cache
+dcache_hit_latency=1
+deadlock_threshold=500000
+eventq_index=0
+icache=system.tcp_cntrl0.L1cache
+icache_hit_latency=1
+is_cpu_sequencer=false
+max_outstanding_requests=2560
+no_retry_on_stall=true
+ruby_system=system.ruby
+support_data_reqs=true
+support_inst_reqs=false
+system=system
+using_network_tester=false
+using_ruby_tester=true
+version=2
+slave=system.cpu.cpuDataPort[0]
+
+[system.tcp_cntrl0.mandatoryQueue]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+
+[system.tcp_cntrl0.probeToTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[5]
+
+[system.tcp_cntrl0.requestFromTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[5]
+
+[system.tcp_cntrl0.responseFromTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[6]
+
+[system.tcp_cntrl0.responseToTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[6]
+
+[system.tcp_cntrl0.sequencer]
+type=RubySequencer
+clk_domain=system.clk_domain
+coreid=99
+dcache=system.tcp_cntrl0.L1cache
+dcache_hit_latency=1
+deadlock_threshold=500000
+eventq_index=0
+icache=system.tcp_cntrl0.L1cache
+icache_hit_latency=1
+is_cpu_sequencer=true
+max_outstanding_requests=16
+no_retry_on_stall=false
+ruby_system=system.ruby
+support_data_reqs=true
+support_inst_reqs=true
+system=system
+using_network_tester=false
+using_ruby_tester=false
+version=3
+
+[system.tcp_cntrl0.unblockFromCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[7]
+
+[system.tcp_cntrl1]
+type=TCP_Controller
+children=L1cache coalescer mandatoryQueue probeToTCP requestFromTCP responseFromTCP responseToTCP sequencer unblockFromCore
+L1cache=system.tcp_cntrl1.L1cache
+TCC_select_num_bits=0
+buffer_size=0
+clk_domain=system.clk_domain
+cluster_id=0
+coalescer=system.tcp_cntrl1.coalescer
+eventq_index=0
+issue_latency=40
+l2_hit_latency=18
+mandatoryQueue=system.tcp_cntrl1.mandatoryQueue
+number_of_TBEs=2560
+probeToTCP=system.tcp_cntrl1.probeToTCP
+recycle_latency=10
+requestFromTCP=system.tcp_cntrl1.requestFromTCP
+responseFromTCP=system.tcp_cntrl1.responseFromTCP
+responseToTCP=system.tcp_cntrl1.responseToTCP
+ruby_system=system.ruby
+sequencer=system.tcp_cntrl1.sequencer
+system=system
+transitions_per_cycle=32
+unblockFromCore=system.tcp_cntrl1.unblockFromCore
+use_seq_not_coal=false
+version=1
+
+[system.tcp_cntrl1.L1cache]
+type=RubyCache
+children=replacement_policy
+assoc=8
+block_size=0
+dataAccessLatency=4
+dataArrayBanks=16
+eventq_index=0
+is_icache=false
+replacement_policy=system.tcp_cntrl1.L1cache.replacement_policy
+resourceStalls=true
+ruby_system=system.ruby
+size=16384
+start_index_bit=6
+tagAccessLatency=4
+tagArrayBanks=4
+
+[system.tcp_cntrl1.L1cache.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=8
+block_size=64
+eventq_index=0
+size=16384
+
+[system.tcp_cntrl1.coalescer]
+type=RubyGPUCoalescer
+assume_rfo=true
+clk_domain=system.clk_domain
+coreid=99
+dcache=system.tcp_cntrl1.L1cache
+dcache_hit_latency=1
+deadlock_threshold=500000
+eventq_index=0
+icache=system.tcp_cntrl1.L1cache
+icache_hit_latency=1
+is_cpu_sequencer=false
+max_outstanding_requests=2560
+no_retry_on_stall=true
+ruby_system=system.ruby
+support_data_reqs=true
+support_inst_reqs=false
+system=system
+using_network_tester=false
+using_ruby_tester=true
+version=4
+slave=system.cpu.cpuDataPort[1]
+
+[system.tcp_cntrl1.mandatoryQueue]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+
+[system.tcp_cntrl1.probeToTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[7]
+
+[system.tcp_cntrl1.requestFromTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[8]
+
+[system.tcp_cntrl1.responseFromTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[9]
+
+[system.tcp_cntrl1.responseToTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[8]
+
+[system.tcp_cntrl1.sequencer]
+type=RubySequencer
+clk_domain=system.clk_domain
+coreid=99
+dcache=system.tcp_cntrl1.L1cache
+dcache_hit_latency=1
+deadlock_threshold=500000
+eventq_index=0
+icache=system.tcp_cntrl1.L1cache
+icache_hit_latency=1
+is_cpu_sequencer=true
+max_outstanding_requests=16
+no_retry_on_stall=false
+ruby_system=system.ruby
+support_data_reqs=true
+support_inst_reqs=true
+system=system
+using_network_tester=false
+using_ruby_tester=false
+version=5
+
+[system.tcp_cntrl1.unblockFromCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[10]
+
+[system.tcp_cntrl2]
+type=TCP_Controller
+children=L1cache coalescer mandatoryQueue probeToTCP requestFromTCP responseFromTCP responseToTCP sequencer unblockFromCore
+L1cache=system.tcp_cntrl2.L1cache
+TCC_select_num_bits=0
+buffer_size=0
+clk_domain=system.clk_domain
+cluster_id=0
+coalescer=system.tcp_cntrl2.coalescer
+eventq_index=0
+issue_latency=40
+l2_hit_latency=18
+mandatoryQueue=system.tcp_cntrl2.mandatoryQueue
+number_of_TBEs=2560
+probeToTCP=system.tcp_cntrl2.probeToTCP
+recycle_latency=10
+requestFromTCP=system.tcp_cntrl2.requestFromTCP
+responseFromTCP=system.tcp_cntrl2.responseFromTCP
+responseToTCP=system.tcp_cntrl2.responseToTCP
+ruby_system=system.ruby
+sequencer=system.tcp_cntrl2.sequencer
+system=system
+transitions_per_cycle=32
+unblockFromCore=system.tcp_cntrl2.unblockFromCore
+use_seq_not_coal=false
+version=2
+
+[system.tcp_cntrl2.L1cache]
+type=RubyCache
+children=replacement_policy
+assoc=8
+block_size=0
+dataAccessLatency=4
+dataArrayBanks=16
+eventq_index=0
+is_icache=false
+replacement_policy=system.tcp_cntrl2.L1cache.replacement_policy
+resourceStalls=true
+ruby_system=system.ruby
+size=16384
+start_index_bit=6
+tagAccessLatency=4
+tagArrayBanks=4
+
+[system.tcp_cntrl2.L1cache.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=8
+block_size=64
+eventq_index=0
+size=16384
+
+[system.tcp_cntrl2.coalescer]
+type=RubyGPUCoalescer
+assume_rfo=true
+clk_domain=system.clk_domain
+coreid=99
+dcache=system.tcp_cntrl2.L1cache
+dcache_hit_latency=1
+deadlock_threshold=500000
+eventq_index=0
+icache=system.tcp_cntrl2.L1cache
+icache_hit_latency=1
+is_cpu_sequencer=false
+max_outstanding_requests=2560
+no_retry_on_stall=true
+ruby_system=system.ruby
+support_data_reqs=true
+support_inst_reqs=false
+system=system
+using_network_tester=false
+using_ruby_tester=true
+version=6
+slave=system.cpu.cpuDataPort[2]
+
+[system.tcp_cntrl2.mandatoryQueue]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+
+[system.tcp_cntrl2.probeToTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[9]
+
+[system.tcp_cntrl2.requestFromTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[11]
+
+[system.tcp_cntrl2.responseFromTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[12]
+
+[system.tcp_cntrl2.responseToTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[10]
+
+[system.tcp_cntrl2.sequencer]
+type=RubySequencer
+clk_domain=system.clk_domain
+coreid=99
+dcache=system.tcp_cntrl2.L1cache
+dcache_hit_latency=1
+deadlock_threshold=500000
+eventq_index=0
+icache=system.tcp_cntrl2.L1cache
+icache_hit_latency=1
+is_cpu_sequencer=true
+max_outstanding_requests=16
+no_retry_on_stall=false
+ruby_system=system.ruby
+support_data_reqs=true
+support_inst_reqs=true
+system=system
+using_network_tester=false
+using_ruby_tester=false
+version=7
+
+[system.tcp_cntrl2.unblockFromCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[13]
+
+[system.tcp_cntrl3]
+type=TCP_Controller
+children=L1cache coalescer mandatoryQueue probeToTCP requestFromTCP responseFromTCP responseToTCP sequencer unblockFromCore
+L1cache=system.tcp_cntrl3.L1cache
+TCC_select_num_bits=0
+buffer_size=0
+clk_domain=system.clk_domain
+cluster_id=0
+coalescer=system.tcp_cntrl3.coalescer
+eventq_index=0
+issue_latency=40
+l2_hit_latency=18
+mandatoryQueue=system.tcp_cntrl3.mandatoryQueue
+number_of_TBEs=2560
+probeToTCP=system.tcp_cntrl3.probeToTCP
+recycle_latency=10
+requestFromTCP=system.tcp_cntrl3.requestFromTCP
+responseFromTCP=system.tcp_cntrl3.responseFromTCP
+responseToTCP=system.tcp_cntrl3.responseToTCP
+ruby_system=system.ruby
+sequencer=system.tcp_cntrl3.sequencer
+system=system
+transitions_per_cycle=32
+unblockFromCore=system.tcp_cntrl3.unblockFromCore
+use_seq_not_coal=false
+version=3
+
+[system.tcp_cntrl3.L1cache]
+type=RubyCache
+children=replacement_policy
+assoc=8
+block_size=0
+dataAccessLatency=4
+dataArrayBanks=16
+eventq_index=0
+is_icache=false
+replacement_policy=system.tcp_cntrl3.L1cache.replacement_policy
+resourceStalls=true
+ruby_system=system.ruby
+size=16384
+start_index_bit=6
+tagAccessLatency=4
+tagArrayBanks=4
+
+[system.tcp_cntrl3.L1cache.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=8
+block_size=64
+eventq_index=0
+size=16384
+
+[system.tcp_cntrl3.coalescer]
+type=RubyGPUCoalescer
+assume_rfo=true
+clk_domain=system.clk_domain
+coreid=99
+dcache=system.tcp_cntrl3.L1cache
+dcache_hit_latency=1
+deadlock_threshold=500000
+eventq_index=0
+icache=system.tcp_cntrl3.L1cache
+icache_hit_latency=1
+is_cpu_sequencer=false
+max_outstanding_requests=2560
+no_retry_on_stall=true
+ruby_system=system.ruby
+support_data_reqs=true
+support_inst_reqs=false
+system=system
+using_network_tester=false
+using_ruby_tester=true
+version=8
+slave=system.cpu.cpuDataPort[3]
+
+[system.tcp_cntrl3.mandatoryQueue]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+
+[system.tcp_cntrl3.probeToTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[11]
+
+[system.tcp_cntrl3.requestFromTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[14]
+
+[system.tcp_cntrl3.responseFromTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[15]
+
+[system.tcp_cntrl3.responseToTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[12]
+
+[system.tcp_cntrl3.sequencer]
+type=RubySequencer
+clk_domain=system.clk_domain
+coreid=99
+dcache=system.tcp_cntrl3.L1cache
+dcache_hit_latency=1
+deadlock_threshold=500000
+eventq_index=0
+icache=system.tcp_cntrl3.L1cache
+icache_hit_latency=1
+is_cpu_sequencer=true
+max_outstanding_requests=16
+no_retry_on_stall=false
+ruby_system=system.ruby
+support_data_reqs=true
+support_inst_reqs=true
+system=system
+using_network_tester=false
+using_ruby_tester=false
+version=9
+
+[system.tcp_cntrl3.unblockFromCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[16]
+
+[system.tcp_cntrl4]
+type=TCP_Controller
+children=L1cache coalescer mandatoryQueue probeToTCP requestFromTCP responseFromTCP responseToTCP sequencer unblockFromCore
+L1cache=system.tcp_cntrl4.L1cache
+TCC_select_num_bits=0
+buffer_size=0
+clk_domain=system.clk_domain
+cluster_id=0
+coalescer=system.tcp_cntrl4.coalescer
+eventq_index=0
+issue_latency=40
+l2_hit_latency=18
+mandatoryQueue=system.tcp_cntrl4.mandatoryQueue
+number_of_TBEs=2560
+probeToTCP=system.tcp_cntrl4.probeToTCP
+recycle_latency=10
+requestFromTCP=system.tcp_cntrl4.requestFromTCP
+responseFromTCP=system.tcp_cntrl4.responseFromTCP
+responseToTCP=system.tcp_cntrl4.responseToTCP
+ruby_system=system.ruby
+sequencer=system.tcp_cntrl4.sequencer
+system=system
+transitions_per_cycle=32
+unblockFromCore=system.tcp_cntrl4.unblockFromCore
+use_seq_not_coal=false
+version=4
+
+[system.tcp_cntrl4.L1cache]
+type=RubyCache
+children=replacement_policy
+assoc=8
+block_size=0
+dataAccessLatency=4
+dataArrayBanks=16
+eventq_index=0
+is_icache=false
+replacement_policy=system.tcp_cntrl4.L1cache.replacement_policy
+resourceStalls=true
+ruby_system=system.ruby
+size=16384
+start_index_bit=6
+tagAccessLatency=4
+tagArrayBanks=4
+
+[system.tcp_cntrl4.L1cache.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=8
+block_size=64
+eventq_index=0
+size=16384
+
+[system.tcp_cntrl4.coalescer]
+type=RubyGPUCoalescer
+assume_rfo=true
+clk_domain=system.clk_domain
+coreid=99
+dcache=system.tcp_cntrl4.L1cache
+dcache_hit_latency=1
+deadlock_threshold=500000
+eventq_index=0
+icache=system.tcp_cntrl4.L1cache
+icache_hit_latency=1
+is_cpu_sequencer=false
+max_outstanding_requests=2560
+no_retry_on_stall=true
+ruby_system=system.ruby
+support_data_reqs=true
+support_inst_reqs=false
+system=system
+using_network_tester=false
+using_ruby_tester=true
+version=10
+slave=system.cpu.cpuDataPort[4]
+
+[system.tcp_cntrl4.mandatoryQueue]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+
+[system.tcp_cntrl4.probeToTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[13]
+
+[system.tcp_cntrl4.requestFromTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[17]
+
+[system.tcp_cntrl4.responseFromTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[18]
+
+[system.tcp_cntrl4.responseToTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[14]
+
+[system.tcp_cntrl4.sequencer]
+type=RubySequencer
+clk_domain=system.clk_domain
+coreid=99
+dcache=system.tcp_cntrl4.L1cache
+dcache_hit_latency=1
+deadlock_threshold=500000
+eventq_index=0
+icache=system.tcp_cntrl4.L1cache
+icache_hit_latency=1
+is_cpu_sequencer=true
+max_outstanding_requests=16
+no_retry_on_stall=false
+ruby_system=system.ruby
+support_data_reqs=true
+support_inst_reqs=true
+system=system
+using_network_tester=false
+using_ruby_tester=false
+version=11
+
+[system.tcp_cntrl4.unblockFromCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[19]
+
+[system.tcp_cntrl5]
+type=TCP_Controller
+children=L1cache coalescer mandatoryQueue probeToTCP requestFromTCP responseFromTCP responseToTCP sequencer unblockFromCore
+L1cache=system.tcp_cntrl5.L1cache
+TCC_select_num_bits=0
+buffer_size=0
+clk_domain=system.clk_domain
+cluster_id=0
+coalescer=system.tcp_cntrl5.coalescer
+eventq_index=0
+issue_latency=40
+l2_hit_latency=18
+mandatoryQueue=system.tcp_cntrl5.mandatoryQueue
+number_of_TBEs=2560
+probeToTCP=system.tcp_cntrl5.probeToTCP
+recycle_latency=10
+requestFromTCP=system.tcp_cntrl5.requestFromTCP
+responseFromTCP=system.tcp_cntrl5.responseFromTCP
+responseToTCP=system.tcp_cntrl5.responseToTCP
+ruby_system=system.ruby
+sequencer=system.tcp_cntrl5.sequencer
+system=system
+transitions_per_cycle=32
+unblockFromCore=system.tcp_cntrl5.unblockFromCore
+use_seq_not_coal=false
+version=5
+
+[system.tcp_cntrl5.L1cache]
+type=RubyCache
+children=replacement_policy
+assoc=8
+block_size=0
+dataAccessLatency=4
+dataArrayBanks=16
+eventq_index=0
+is_icache=false
+replacement_policy=system.tcp_cntrl5.L1cache.replacement_policy
+resourceStalls=true
+ruby_system=system.ruby
+size=16384
+start_index_bit=6
+tagAccessLatency=4
+tagArrayBanks=4
+
+[system.tcp_cntrl5.L1cache.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=8
+block_size=64
+eventq_index=0
+size=16384
+
+[system.tcp_cntrl5.coalescer]
+type=RubyGPUCoalescer
+assume_rfo=true
+clk_domain=system.clk_domain
+coreid=99
+dcache=system.tcp_cntrl5.L1cache
+dcache_hit_latency=1
+deadlock_threshold=500000
+eventq_index=0
+icache=system.tcp_cntrl5.L1cache
+icache_hit_latency=1
+is_cpu_sequencer=false
+max_outstanding_requests=2560
+no_retry_on_stall=true
+ruby_system=system.ruby
+support_data_reqs=true
+support_inst_reqs=false
+system=system
+using_network_tester=false
+using_ruby_tester=true
+version=12
+slave=system.cpu.cpuDataPort[5]
+
+[system.tcp_cntrl5.mandatoryQueue]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+
+[system.tcp_cntrl5.probeToTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[15]
+
+[system.tcp_cntrl5.requestFromTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[20]
+
+[system.tcp_cntrl5.responseFromTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[21]
+
+[system.tcp_cntrl5.responseToTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[16]
+
+[system.tcp_cntrl5.sequencer]
+type=RubySequencer
+clk_domain=system.clk_domain
+coreid=99
+dcache=system.tcp_cntrl5.L1cache
+dcache_hit_latency=1
+deadlock_threshold=500000
+eventq_index=0
+icache=system.tcp_cntrl5.L1cache
+icache_hit_latency=1
+is_cpu_sequencer=true
+max_outstanding_requests=16
+no_retry_on_stall=false
+ruby_system=system.ruby
+support_data_reqs=true
+support_inst_reqs=true
+system=system
+using_network_tester=false
+using_ruby_tester=false
+version=13
+
+[system.tcp_cntrl5.unblockFromCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[22]
+
+[system.tcp_cntrl6]
+type=TCP_Controller
+children=L1cache coalescer mandatoryQueue probeToTCP requestFromTCP responseFromTCP responseToTCP sequencer unblockFromCore
+L1cache=system.tcp_cntrl6.L1cache
+TCC_select_num_bits=0
+buffer_size=0
+clk_domain=system.clk_domain
+cluster_id=0
+coalescer=system.tcp_cntrl6.coalescer
+eventq_index=0
+issue_latency=40
+l2_hit_latency=18
+mandatoryQueue=system.tcp_cntrl6.mandatoryQueue
+number_of_TBEs=2560
+probeToTCP=system.tcp_cntrl6.probeToTCP
+recycle_latency=10
+requestFromTCP=system.tcp_cntrl6.requestFromTCP
+responseFromTCP=system.tcp_cntrl6.responseFromTCP
+responseToTCP=system.tcp_cntrl6.responseToTCP
+ruby_system=system.ruby
+sequencer=system.tcp_cntrl6.sequencer
+system=system
+transitions_per_cycle=32
+unblockFromCore=system.tcp_cntrl6.unblockFromCore
+use_seq_not_coal=false
+version=6
+
+[system.tcp_cntrl6.L1cache]
+type=RubyCache
+children=replacement_policy
+assoc=8
+block_size=0
+dataAccessLatency=4
+dataArrayBanks=16
+eventq_index=0
+is_icache=false
+replacement_policy=system.tcp_cntrl6.L1cache.replacement_policy
+resourceStalls=true
+ruby_system=system.ruby
+size=16384
+start_index_bit=6
+tagAccessLatency=4
+tagArrayBanks=4
+
+[system.tcp_cntrl6.L1cache.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=8
+block_size=64
+eventq_index=0
+size=16384
+
+[system.tcp_cntrl6.coalescer]
+type=RubyGPUCoalescer
+assume_rfo=true
+clk_domain=system.clk_domain
+coreid=99
+dcache=system.tcp_cntrl6.L1cache
+dcache_hit_latency=1
+deadlock_threshold=500000
+eventq_index=0
+icache=system.tcp_cntrl6.L1cache
+icache_hit_latency=1
+is_cpu_sequencer=false
+max_outstanding_requests=2560
+no_retry_on_stall=true
+ruby_system=system.ruby
+support_data_reqs=true
+support_inst_reqs=false
+system=system
+using_network_tester=false
+using_ruby_tester=true
+version=14
+slave=system.cpu.cpuDataPort[6]
+
+[system.tcp_cntrl6.mandatoryQueue]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+
+[system.tcp_cntrl6.probeToTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[17]
+
+[system.tcp_cntrl6.requestFromTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[23]
+
+[system.tcp_cntrl6.responseFromTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[24]
+
+[system.tcp_cntrl6.responseToTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[18]
+
+[system.tcp_cntrl6.sequencer]
+type=RubySequencer
+clk_domain=system.clk_domain
+coreid=99
+dcache=system.tcp_cntrl6.L1cache
+dcache_hit_latency=1
+deadlock_threshold=500000
+eventq_index=0
+icache=system.tcp_cntrl6.L1cache
+icache_hit_latency=1
+is_cpu_sequencer=true
+max_outstanding_requests=16
+no_retry_on_stall=false
+ruby_system=system.ruby
+support_data_reqs=true
+support_inst_reqs=true
+system=system
+using_network_tester=false
+using_ruby_tester=false
+version=15
+
+[system.tcp_cntrl6.unblockFromCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[25]
+
+[system.tcp_cntrl7]
+type=TCP_Controller
+children=L1cache coalescer mandatoryQueue probeToTCP requestFromTCP responseFromTCP responseToTCP sequencer unblockFromCore
+L1cache=system.tcp_cntrl7.L1cache
+TCC_select_num_bits=0
+buffer_size=0
+clk_domain=system.clk_domain
+cluster_id=0
+coalescer=system.tcp_cntrl7.coalescer
+eventq_index=0
+issue_latency=40
+l2_hit_latency=18
+mandatoryQueue=system.tcp_cntrl7.mandatoryQueue
+number_of_TBEs=2560
+probeToTCP=system.tcp_cntrl7.probeToTCP
+recycle_latency=10
+requestFromTCP=system.tcp_cntrl7.requestFromTCP
+responseFromTCP=system.tcp_cntrl7.responseFromTCP
+responseToTCP=system.tcp_cntrl7.responseToTCP
+ruby_system=system.ruby
+sequencer=system.tcp_cntrl7.sequencer
+system=system
+transitions_per_cycle=32
+unblockFromCore=system.tcp_cntrl7.unblockFromCore
+use_seq_not_coal=false
+version=7
+
+[system.tcp_cntrl7.L1cache]
+type=RubyCache
+children=replacement_policy
+assoc=8
+block_size=0
+dataAccessLatency=4
+dataArrayBanks=16
+eventq_index=0
+is_icache=false
+replacement_policy=system.tcp_cntrl7.L1cache.replacement_policy
+resourceStalls=true
+ruby_system=system.ruby
+size=16384
+start_index_bit=6
+tagAccessLatency=4
+tagArrayBanks=4
+
+[system.tcp_cntrl7.L1cache.replacement_policy]
+type=PseudoLRUReplacementPolicy
+assoc=8
+block_size=64
+eventq_index=0
+size=16384
+
+[system.tcp_cntrl7.coalescer]
+type=RubyGPUCoalescer
+assume_rfo=true
+clk_domain=system.clk_domain
+coreid=99
+dcache=system.tcp_cntrl7.L1cache
+dcache_hit_latency=1
+deadlock_threshold=500000
+eventq_index=0
+icache=system.tcp_cntrl7.L1cache
+icache_hit_latency=1
+is_cpu_sequencer=false
+max_outstanding_requests=2560
+no_retry_on_stall=true
+ruby_system=system.ruby
+support_data_reqs=true
+support_inst_reqs=false
+system=system
+using_network_tester=false
+using_ruby_tester=true
+version=16
+slave=system.cpu.cpuDataPort[7]
+
+[system.tcp_cntrl7.mandatoryQueue]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=false
+randomization=false
+
+[system.tcp_cntrl7.probeToTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[19]
+
+[system.tcp_cntrl7.requestFromTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[26]
+
+[system.tcp_cntrl7.responseFromTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[27]
+
+[system.tcp_cntrl7.responseToTCP]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+slave=system.ruby.network.master[20]
+
+[system.tcp_cntrl7.sequencer]
+type=RubySequencer
+clk_domain=system.clk_domain
+coreid=99
+dcache=system.tcp_cntrl7.L1cache
+dcache_hit_latency=1
+deadlock_threshold=500000
+eventq_index=0
+icache=system.tcp_cntrl7.L1cache
+icache_hit_latency=1
+is_cpu_sequencer=true
+max_outstanding_requests=16
+no_retry_on_stall=false
+ruby_system=system.ruby
+support_data_reqs=true
+support_inst_reqs=true
+system=system
+using_network_tester=false
+using_ruby_tester=false
+version=17
+
+[system.tcp_cntrl7.unblockFromCore]
+type=MessageBuffer
+buffer_size=0
+eventq_index=0
+ordered=true
+randomization=false
+master=system.ruby.network.slave[28]
+
+[system.voltage_domain]
+type=VoltageDomain
+eventq_index=0
+voltage=1.000000
+
diff --git a/tests/quick/se/60.gpu-randomtest/ref/x86/linux/gpu-randomtest-ruby-GPU_RfO/simerr b/tests/quick/se/60.gpu-randomtest/ref/x86/linux/gpu-randomtest-ruby-GPU_RfO/simerr
new file mode 100755
index 000000000..13060c953
--- /dev/null
+++ b/tests/quick/se/60.gpu-randomtest/ref/x86/linux/gpu-randomtest-ruby-GPU_RfO/simerr
@@ -0,0 +1,10 @@
+warn: system.ruby.network adopting orphan SimObject param 'int_links'
+warn: system.ruby.network adopting orphan SimObject param 'ext_links'
+warn: rounding error > tolerance
+ 1.250000 rounded to 1
+warn: rounding error > tolerance
+ 1.250000 rounded to 1
+warn: rounding error > tolerance
+ 1.250000 rounded to 1
+warn: DRAM device capacity (8192 Mbytes) does not match the address range assigned (256 Mbytes)
+warn: Replacement policy updates recently became the responsibility of SLICC state machines. Make sure to setMRU() near callbacks in .sm files!
diff --git a/tests/quick/se/60.gpu-randomtest/ref/x86/linux/gpu-randomtest-ruby-GPU_RfO/simout b/tests/quick/se/60.gpu-randomtest/ref/x86/linux/gpu-randomtest-ruby-GPU_RfO/simout
new file mode 100755
index 000000000..62d7346d7
--- /dev/null
+++ b/tests/quick/se/60.gpu-randomtest/ref/x86/linux/gpu-randomtest-ruby-GPU_RfO/simout
@@ -0,0 +1,11 @@
+gem5 Simulator System. http://gem5.org
+gem5 is copyrighted software; use the --copyright option for details.
+
+gem5 compiled Jan 19 2016 13:28:55
+gem5 started Jan 19 2016 13:29:16
+gem5 executing on zizzer, pid 48851
+command line: build/HSAIL_X86/gem5.opt -d build/HSAIL_X86/tests/opt/quick/se/60.gpu-randomtest/x86/linux/gpu-randomtest-ruby-GPU_RfO -re /z/atgutier/gem5/gem5-commit/tests/run.py build/HSAIL_X86/tests/opt/quick/se/60.gpu-randomtest/x86/linux/gpu-randomtest-ruby-GPU_RfO
+
+Global frequency set at 1000000000 ticks per second
+info: Entering event queue @ 0. Starting simulation...
+Exiting @ tick 14181 because Ruby Tester completed
diff --git a/tests/quick/se/60.gpu-randomtest/ref/x86/linux/gpu-randomtest-ruby-GPU_RfO/stats.txt b/tests/quick/se/60.gpu-randomtest/ref/x86/linux/gpu-randomtest-ruby-GPU_RfO/stats.txt
new file mode 100644
index 000000000..75065fd02
--- /dev/null
+++ b/tests/quick/se/60.gpu-randomtest/ref/x86/linux/gpu-randomtest-ruby-GPU_RfO/stats.txt
@@ -0,0 +1,1072 @@
+
+---------- Begin Simulation Statistics ----------
+sim_seconds 0.000014 # Number of seconds simulated
+sim_ticks 14181 # Number of ticks simulated
+final_tick 14181 # Number of ticks from beginning of simulation (restored from checkpoints and never reset)
+sim_freq 1000000000 # Frequency of simulated ticks
+host_tick_rate 88786 # Simulator tick rate (ticks/s)
+host_mem_usage 463996 # Number of bytes of host memory used
+host_seconds 0.16 # Real time elapsed on the host
+system.voltage_domain.voltage 1 # Voltage in Volts
+system.clk_domain.clock 1 # Clock period in ticks
+system.mem_ctrls.bytes_read::dir_cntrl0 16576 # Number of bytes read from this memory
+system.mem_ctrls.bytes_read::total 16576 # Number of bytes read from this memory
+system.mem_ctrls.bytes_written::dir_cntrl0 576 # Number of bytes written to this memory
+system.mem_ctrls.bytes_written::total 576 # Number of bytes written to this memory
+system.mem_ctrls.num_reads::dir_cntrl0 259 # Number of read requests responded to by this memory
+system.mem_ctrls.num_reads::total 259 # Number of read requests responded to by this memory
+system.mem_ctrls.num_writes::dir_cntrl0 9 # Number of write requests responded to by this memory
+system.mem_ctrls.num_writes::total 9 # Number of write requests responded to by this memory
+system.mem_ctrls.bw_read::dir_cntrl0 1168887949 # Total read bandwidth from this memory (bytes/s)
+system.mem_ctrls.bw_read::total 1168887949 # Total read bandwidth from this memory (bytes/s)
+system.mem_ctrls.bw_write::dir_cntrl0 40617728 # Write bandwidth from this memory (bytes/s)
+system.mem_ctrls.bw_write::total 40617728 # Write bandwidth from this memory (bytes/s)
+system.mem_ctrls.bw_total::dir_cntrl0 1209505677 # Total bandwidth to/from this memory (bytes/s)
+system.mem_ctrls.bw_total::total 1209505677 # Total bandwidth to/from this memory (bytes/s)
+system.mem_ctrls.readReqs 259 # Number of read requests accepted
+system.mem_ctrls.writeReqs 9 # Number of write requests accepted
+system.mem_ctrls.readBursts 259 # Number of DRAM read bursts, including those serviced by the write queue
+system.mem_ctrls.writeBursts 9 # Number of DRAM write bursts, including those merged in the write queue
+system.mem_ctrls.bytesReadDRAM 15936 # Total number of bytes read from DRAM
+system.mem_ctrls.bytesReadWrQ 640 # Total number of bytes read from write queue
+system.mem_ctrls.bytesWritten 0 # Total number of bytes written to DRAM
+system.mem_ctrls.bytesReadSys 16576 # Total read bytes from the system interface side
+system.mem_ctrls.bytesWrittenSys 576 # Total written bytes from the system interface side
+system.mem_ctrls.servicedByWrQ 10 # Number of DRAM read bursts serviced by the write queue
+system.mem_ctrls.mergedWrBursts 0 # Number of DRAM write bursts merged with an existing one
+system.mem_ctrls.neitherReadNorWriteReqs 0 # Number of requests that are neither read nor write
+system.mem_ctrls.perBankRdBursts::0 100 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::1 71 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::2 66 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::3 12 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::4 0 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::5 0 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::6 0 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::7 0 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::8 0 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::9 0 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::10 0 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::11 0 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::12 0 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::13 0 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::14 0 # Per bank write bursts
+system.mem_ctrls.perBankRdBursts::15 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::0 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::1 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::2 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::3 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::4 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::5 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::6 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::7 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::8 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::9 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::10 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::11 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::12 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::13 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::14 0 # Per bank write bursts
+system.mem_ctrls.perBankWrBursts::15 0 # Per bank write bursts
+system.mem_ctrls.numRdRetry 0 # Number of times read queue was full causing retry
+system.mem_ctrls.numWrRetry 0 # Number of times write queue was full causing retry
+system.mem_ctrls.totGap 13941 # Total gap between requests
+system.mem_ctrls.readPktSize::0 0 # Read request sizes (log2)
+system.mem_ctrls.readPktSize::1 0 # Read request sizes (log2)
+system.mem_ctrls.readPktSize::2 0 # Read request sizes (log2)
+system.mem_ctrls.readPktSize::3 0 # Read request sizes (log2)
+system.mem_ctrls.readPktSize::4 0 # Read request sizes (log2)
+system.mem_ctrls.readPktSize::5 0 # Read request sizes (log2)
+system.mem_ctrls.readPktSize::6 259 # Read request sizes (log2)
+system.mem_ctrls.writePktSize::0 0 # Write request sizes (log2)
+system.mem_ctrls.writePktSize::1 0 # Write request sizes (log2)
+system.mem_ctrls.writePktSize::2 0 # Write request sizes (log2)
+system.mem_ctrls.writePktSize::3 0 # Write request sizes (log2)
+system.mem_ctrls.writePktSize::4 0 # Write request sizes (log2)
+system.mem_ctrls.writePktSize::5 0 # Write request sizes (log2)
+system.mem_ctrls.writePktSize::6 9 # Write request sizes (log2)
+system.mem_ctrls.rdQLenPdf::0 214 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::1 27 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::2 7 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::3 1 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::4 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::5 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::6 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::7 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::8 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::9 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::10 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::11 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::12 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::13 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::14 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::15 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::16 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::17 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::18 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::19 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::20 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::21 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::22 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::23 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::24 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::25 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::26 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::27 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::28 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::29 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::30 0 # What read queue length does an incoming req see
+system.mem_ctrls.rdQLenPdf::31 0 # What read queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::0 1 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::1 1 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::2 1 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::3 1 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::4 1 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::5 1 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::6 1 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::7 1 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::8 1 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::9 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::10 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::11 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::12 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::13 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::14 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::15 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::16 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::17 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::18 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::19 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::20 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::21 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::22 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::23 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::24 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::25 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::26 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::27 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::28 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::29 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::30 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::31 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::32 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::33 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::34 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::35 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::36 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::37 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::38 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::39 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::40 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::41 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::42 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::43 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::44 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::45 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::46 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::47 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::48 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::49 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::50 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::51 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::52 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::53 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::54 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::55 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::56 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::57 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::58 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::59 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::60 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::61 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::62 0 # What write queue length does an incoming req see
+system.mem_ctrls.wrQLenPdf::63 0 # What write queue length does an incoming req see
+system.mem_ctrls.bytesPerActivate::samples 15 # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::mean 913.066667 # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::gmean 883.543279 # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::stdev 210.139908 # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::512-639 3 20.00% 20.00% # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::896-1023 1 6.67% 26.67% # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::1024-1151 11 73.33% 100.00% # Bytes accessed per row activation
+system.mem_ctrls.bytesPerActivate::total 15 # Bytes accessed per row activation
+system.mem_ctrls.totQLat 973 # Total ticks spent queuing
+system.mem_ctrls.totMemAccLat 5704 # Total ticks spent from burst creation until serviced by the DRAM
+system.mem_ctrls.totBusLat 1245 # Total ticks spent in databus transfers
+system.mem_ctrls.avgQLat 3.91 # Average queueing delay per DRAM burst
+system.mem_ctrls.avgBusLat 5.00 # Average bus latency per DRAM burst
+system.mem_ctrls.avgMemAccLat 22.91 # Average memory access latency per DRAM burst
+system.mem_ctrls.avgRdBW 1123.76 # Average DRAM read bandwidth in MiByte/s
+system.mem_ctrls.avgWrBW 0.00 # Average achieved write bandwidth in MiByte/s
+system.mem_ctrls.avgRdBWSys 1168.89 # Average system read bandwidth in MiByte/s
+system.mem_ctrls.avgWrBWSys 40.62 # Average system write bandwidth in MiByte/s
+system.mem_ctrls.peakBW 12800.00 # Theoretical peak bandwidth in MiByte/s
+system.mem_ctrls.busUtil 8.78 # Data bus utilization in percentage
+system.mem_ctrls.busUtilRead 8.78 # Data bus utilization in percentage for reads
+system.mem_ctrls.busUtilWrite 0.00 # Data bus utilization in percentage for writes
+system.mem_ctrls.avgRdQLen 1.17 # Average read queue length when enqueuing
+system.mem_ctrls.avgWrQLen 2.63 # Average write queue length when enqueuing
+system.mem_ctrls.readRowHits 230 # Number of row buffer hits during reads
+system.mem_ctrls.writeRowHits 0 # Number of row buffer hits during writes
+system.mem_ctrls.readRowHitRate 92.37 # Row buffer hit rate for reads
+system.mem_ctrls.writeRowHitRate 0.00 # Row buffer hit rate for writes
+system.mem_ctrls.avgGap 52.02 # Average gap between requests
+system.mem_ctrls.pageHitRate 89.15 # Row buffer hit rate, read and write combined
+system.mem_ctrls_0.actEnergy 83160 # Energy for activate commands per rank (pJ)
+system.mem_ctrls_0.preEnergy 46200 # Energy for precharge commands per rank (pJ)
+system.mem_ctrls_0.readEnergy 1872000 # Energy for read commands per rank (pJ)
+system.mem_ctrls_0.writeEnergy 0 # Energy for write commands per rank (pJ)
+system.mem_ctrls_0.refreshEnergy 508560 # Energy for refresh commands per rank (pJ)
+system.mem_ctrls_0.actBackEnergy 5437116 # Energy for active background per rank (pJ)
+system.mem_ctrls_0.preBackEnergy 58200 # Energy for precharge background per rank (pJ)
+system.mem_ctrls_0.totalEnergy 8005236 # Total energy per rank (pJ)
+system.mem_ctrls_0.averagePower 994.933632 # Core power per rank (mW)
+system.mem_ctrls_0.memoryStateTime::IDLE 83 # Time in different power states
+system.mem_ctrls_0.memoryStateTime::REF 260 # Time in different power states
+system.mem_ctrls_0.memoryStateTime::PRE_PDN 0 # Time in different power states
+system.mem_ctrls_0.memoryStateTime::ACT 7717 # Time in different power states
+system.mem_ctrls_0.memoryStateTime::ACT_PDN 0 # Time in different power states
+system.mem_ctrls_1.actEnergy 0 # Energy for activate commands per rank (pJ)
+system.mem_ctrls_1.preEnergy 0 # Energy for precharge commands per rank (pJ)
+system.mem_ctrls_1.readEnergy 0 # Energy for read commands per rank (pJ)
+system.mem_ctrls_1.writeEnergy 0 # Energy for write commands per rank (pJ)
+system.mem_ctrls_1.refreshEnergy 508560 # Energy for refresh commands per rank (pJ)
+system.mem_ctrls_1.actBackEnergy 168264 # Energy for active background per rank (pJ)
+system.mem_ctrls_1.preBackEnergy 4671600 # Energy for precharge background per rank (pJ)
+system.mem_ctrls_1.totalEnergy 5348424 # Total energy per rank (pJ)
+system.mem_ctrls_1.averagePower 665.889442 # Core power per rank (mW)
+system.mem_ctrls_1.memoryStateTime::IDLE 7786 # Time in different power states
+system.mem_ctrls_1.memoryStateTime::REF 260 # Time in different power states
+system.mem_ctrls_1.memoryStateTime::PRE_PDN 0 # Time in different power states
+system.mem_ctrls_1.memoryStateTime::ACT 0 # Time in different power states
+system.mem_ctrls_1.memoryStateTime::ACT_PDN 0 # Time in different power states
+system.ruby.clk_domain.clock 1 # Clock period in ticks
+system.ruby.outstanding_req_hist::bucket_size 2
+system.ruby.outstanding_req_hist::max_bucket 19
+system.ruby.outstanding_req_hist::samples 63
+system.ruby.outstanding_req_hist::mean 12.920635
+system.ruby.outstanding_req_hist::gmean 11.694862
+system.ruby.outstanding_req_hist::stdev 4.228557
+system.ruby.outstanding_req_hist | 1 1.59% 1.59% | 2 3.17% 4.76% | 2 3.17% 7.94% | 5 7.94% 15.87% | 4 6.35% 22.22% | 3 4.76% 26.98% | 5 7.94% 34.92% | 14 22.22% 57.14% | 27 42.86% 100.00% | 0 0.00% 100.00%
+system.ruby.outstanding_req_hist::total 63
+system.ruby.latency_hist::bucket_size 1024
+system.ruby.latency_hist::max_bucket 10239
+system.ruby.latency_hist::samples 48
+system.ruby.latency_hist::mean 3351.354167
+system.ruby.latency_hist::gmean 1865.352879
+system.ruby.latency_hist::stdev 1934.275107
+system.ruby.latency_hist | 11 22.92% 22.92% | 3 6.25% 29.17% | 3 6.25% 35.42% | 7 14.58% 50.00% | 18 37.50% 87.50% | 6 12.50% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.latency_hist::total 48
+system.ruby.hit_latency_hist::bucket_size 1024
+system.ruby.hit_latency_hist::max_bucket 10239
+system.ruby.hit_latency_hist::samples 42
+system.ruby.hit_latency_hist::mean 3684.428571
+system.ruby.hit_latency_hist::gmean 2778.454716
+system.ruby.hit_latency_hist::stdev 1783.107224
+system.ruby.hit_latency_hist | 7 16.67% 16.67% | 3 7.14% 23.81% | 1 2.38% 26.19% | 7 16.67% 42.86% | 18 42.86% 85.71% | 6 14.29% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.hit_latency_hist::total 42
+system.ruby.miss_latency_hist::bucket_size 512
+system.ruby.miss_latency_hist::max_bucket 5119
+system.ruby.miss_latency_hist::samples 6
+system.ruby.miss_latency_hist::mean 1019.833333
+system.ruby.miss_latency_hist::gmean 114.673945
+system.ruby.miss_latency_hist::stdev 1281.644790
+system.ruby.miss_latency_hist | 3 50.00% 50.00% | 1 16.67% 66.67% | 0 0.00% 66.67% | 0 0.00% 66.67% | 0 0.00% 66.67% | 2 33.33% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.miss_latency_hist::total 6
+system.ruby.L1Cache.incomplete_times 6
+system.cp_cntrl0.L1D0cache.demand_hits 0 # Number of cache demand hits
+system.cp_cntrl0.L1D0cache.demand_misses 45 # Number of cache demand misses
+system.cp_cntrl0.L1D0cache.demand_accesses 45 # Number of cache demand accesses
+system.cp_cntrl0.L1D0cache.num_data_array_writes 43 # number of data array writes
+system.cp_cntrl0.L1D0cache.num_tag_array_reads 154 # number of tag array reads
+system.cp_cntrl0.L1D0cache.num_tag_array_writes 41 # number of tag array writes
+system.cp_cntrl0.L1D1cache.demand_hits 0 # Number of cache demand hits
+system.cp_cntrl0.L1D1cache.demand_misses 43 # Number of cache demand misses
+system.cp_cntrl0.L1D1cache.demand_accesses 43 # Number of cache demand accesses
+system.cp_cntrl0.L1D1cache.num_data_array_writes 41 # number of data array writes
+system.cp_cntrl0.L1D1cache.num_tag_array_reads 73 # number of tag array reads
+system.cp_cntrl0.L1D1cache.num_tag_array_writes 41 # number of tag array writes
+system.cp_cntrl0.L1Icache.demand_hits 0 # Number of cache demand hits
+system.cp_cntrl0.L1Icache.demand_misses 3 # Number of cache demand misses
+system.cp_cntrl0.L1Icache.demand_accesses 3 # Number of cache demand accesses
+system.cp_cntrl0.L1Icache.num_tag_array_reads 3 # number of tag array reads
+system.cp_cntrl0.L2cache.demand_hits 0 # Number of cache demand hits
+system.cp_cntrl0.L2cache.demand_misses 91 # Number of cache demand misses
+system.cp_cntrl0.L2cache.demand_accesses 91 # Number of cache demand accesses
+system.cp_cntrl0.L2cache.num_data_array_reads 81 # number of data array reads
+system.cp_cntrl0.L2cache.num_data_array_writes 84 # number of data array writes
+system.cp_cntrl0.L2cache.num_tag_array_reads 380 # number of tag array reads
+system.cp_cntrl0.L2cache.num_tag_array_writes 371 # number of tag array writes
+system.cp_cntrl0.sequencer.store_waiting_on_load 2 # Number of times a store aliased with a pending load
+system.cp_cntrl0.sequencer.store_waiting_on_store 3 # Number of times a store aliased with a pending store
+system.cp_cntrl0.sequencer1.store_waiting_on_load 1 # Number of times a store aliased with a pending load
+system.cp_cntrl0.sequencer1.store_waiting_on_store 4 # Number of times a store aliased with a pending store
+system.cp_cntrl0.fully_busy_cycles 2 # cycles for which number of transistions == max transitions
+system.dir_cntrl0.L3CacheMemory.demand_hits 0 # Number of cache demand hits
+system.dir_cntrl0.L3CacheMemory.demand_misses 0 # Number of cache demand misses
+system.dir_cntrl0.L3CacheMemory.demand_accesses 0 # Number of cache demand accesses
+system.dir_cntrl0.L3CacheMemory.num_data_array_writes 374 # number of data array writes
+system.dir_cntrl0.L3CacheMemory.num_tag_array_reads 378 # number of tag array reads
+system.dir_cntrl0.L3CacheMemory.num_tag_array_writes 378 # number of tag array writes
+system.dir_cntrl0.L3CacheMemory.num_tag_array_stalls 10169 # number of stalls caused by tag array
+system.dir_cntrl0.L3CacheMemory.num_data_array_stalls 5502 # number of stalls caused by data array
+system.ruby.network.ext_links00.int_node.percent_links_utilized 0.199210
+system.ruby.network.ext_links00.int_node.msg_count.Control::0 308
+system.ruby.network.ext_links00.int_node.msg_count.Request_Control::0 385
+system.ruby.network.ext_links00.int_node.msg_count.Response_Data::2 393
+system.ruby.network.ext_links00.int_node.msg_count.Response_Control::2 227
+system.ruby.network.ext_links00.int_node.msg_count.Writeback_Data::2 66
+system.ruby.network.ext_links00.int_node.msg_count.Writeback_Control::2 70
+system.ruby.network.ext_links00.int_node.msg_count.Unblock_Control::4 303
+system.ruby.network.ext_links00.int_node.msg_bytes.Control::0 2464
+system.ruby.network.ext_links00.int_node.msg_bytes.Request_Control::0 3080
+system.ruby.network.ext_links00.int_node.msg_bytes.Response_Data::2 28296
+system.ruby.network.ext_links00.int_node.msg_bytes.Response_Control::2 1816
+system.ruby.network.ext_links00.int_node.msg_bytes.Writeback_Data::2 4752
+system.ruby.network.ext_links00.int_node.msg_bytes.Writeback_Control::2 560
+system.ruby.network.ext_links00.int_node.msg_bytes.Unblock_Control::4 2424
+system.ruby.network.ext_links01.int_node.percent_links_utilized 0.120981
+system.ruby.network.ext_links01.int_node.msg_count.Control::0 227
+system.ruby.network.ext_links01.int_node.msg_count.Request_Control::0 153
+system.ruby.network.ext_links01.int_node.msg_count.Response_Data::2 95
+system.ruby.network.ext_links01.int_node.msg_count.Response_Control::2 217
+system.ruby.network.ext_links01.int_node.msg_count.Writeback_Data::2 66
+system.ruby.network.ext_links01.int_node.msg_count.Writeback_Control::2 70
+system.ruby.network.ext_links01.int_node.msg_count.Unblock_Control::4 80
+system.ruby.network.ext_links01.int_node.msg_bytes.Control::0 1816
+system.ruby.network.ext_links01.int_node.msg_bytes.Request_Control::0 1224
+system.ruby.network.ext_links01.int_node.msg_bytes.Response_Data::2 6840
+system.ruby.network.ext_links01.int_node.msg_bytes.Response_Control::2 1736
+system.ruby.network.ext_links01.int_node.msg_bytes.Writeback_Data::2 4752
+system.ruby.network.ext_links01.int_node.msg_bytes.Writeback_Control::2 560
+system.ruby.network.ext_links01.int_node.msg_bytes.Unblock_Control::4 640
+system.tcp_cntrl0.L1cache.demand_hits 0 # Number of cache demand hits
+system.tcp_cntrl0.L1cache.demand_misses 0 # Number of cache demand misses
+system.tcp_cntrl0.L1cache.demand_accesses 0 # Number of cache demand accesses
+system.tcp_cntrl0.L1cache.num_data_array_reads 14 # number of data array reads
+system.tcp_cntrl0.L1cache.num_data_array_writes 116 # number of data array writes
+system.tcp_cntrl0.L1cache.num_tag_array_reads 314 # number of tag array reads
+system.tcp_cntrl0.L1cache.num_tag_array_writes 305 # number of tag array writes
+system.tcp_cntrl0.L1cache.num_tag_array_stalls 38 # number of stalls caused by tag array
+system.tcp_cntrl0.coalescer.gpu_tcp_ld_hits 0 # loads that hit in the TCP
+system.tcp_cntrl0.coalescer.gpu_tcp_ld_transfers 5 # TCP to TCP load transfers
+system.tcp_cntrl0.coalescer.gpu_tcc_ld_hits 0 # loads that hit in the TCC
+system.tcp_cntrl0.coalescer.gpu_ld_misses 0 # loads that miss in the GPU
+system.tcp_cntrl0.coalescer.gpu_tcp_st_hits 9 # stores that hit in the TCP
+system.tcp_cntrl0.coalescer.gpu_tcp_st_transfers 79 # TCP to TCP store transfers
+system.tcp_cntrl0.coalescer.gpu_tcc_st_hits 0 # stores that hit in the TCC
+system.tcp_cntrl0.coalescer.gpu_st_misses 21 # stores that miss in the GPU
+system.tcp_cntrl0.coalescer.cp_tcp_ld_hits 0 # loads that hit in the TCP
+system.tcp_cntrl0.coalescer.cp_tcp_ld_transfers 0 # TCP to TCP load transfers
+system.tcp_cntrl0.coalescer.cp_tcc_ld_hits 0 # loads that hit in the TCC
+system.tcp_cntrl0.coalescer.cp_ld_misses 0 # loads that miss in the GPU
+system.tcp_cntrl0.coalescer.cp_tcp_st_hits 0 # stores that hit in the TCP
+system.tcp_cntrl0.coalescer.cp_tcp_st_transfers 0 # TCP to TCP store transfers
+system.tcp_cntrl0.coalescer.cp_tcc_st_hits 0 # stores that hit in the TCC
+system.tcp_cntrl0.coalescer.cp_st_misses 0 # stores that miss in the GPU
+system.ruby.network.ext_links02.int_node.percent_links_utilized 0.173894
+system.ruby.network.ext_links02.int_node.msg_count.Control::0 81
+system.ruby.network.ext_links02.int_node.msg_count.Control::1 814
+system.ruby.network.ext_links02.int_node.msg_count.Request_Control::0 232
+system.ruby.network.ext_links02.int_node.msg_count.Request_Control::1 846
+system.ruby.network.ext_links02.int_node.msg_count.Response_Data::2 298
+system.ruby.network.ext_links02.int_node.msg_count.Response_Data::3 1644
+system.ruby.network.ext_links02.int_node.msg_count.Response_Control::2 10
+system.ruby.network.ext_links02.int_node.msg_count.Response_Control::3 2
+system.ruby.network.ext_links02.int_node.msg_count.Unblock_Control::4 223
+system.ruby.network.ext_links02.int_node.msg_count.Unblock_Control::5 831
+system.ruby.network.ext_links02.int_node.msg_bytes.Control::0 648
+system.ruby.network.ext_links02.int_node.msg_bytes.Control::1 6512
+system.ruby.network.ext_links02.int_node.msg_bytes.Request_Control::0 1856
+system.ruby.network.ext_links02.int_node.msg_bytes.Request_Control::1 6768
+system.ruby.network.ext_links02.int_node.msg_bytes.Response_Data::2 21456
+system.ruby.network.ext_links02.int_node.msg_bytes.Response_Data::3 118368
+system.ruby.network.ext_links02.int_node.msg_bytes.Response_Control::2 80
+system.ruby.network.ext_links02.int_node.msg_bytes.Response_Control::3 16
+system.ruby.network.ext_links02.int_node.msg_bytes.Unblock_Control::4 1784
+system.ruby.network.ext_links02.int_node.msg_bytes.Unblock_Control::5 6648
+system.tcp_cntrl1.L1cache.demand_hits 0 # Number of cache demand hits
+system.tcp_cntrl1.L1cache.demand_misses 0 # Number of cache demand misses
+system.tcp_cntrl1.L1cache.demand_accesses 0 # Number of cache demand accesses
+system.tcp_cntrl1.L1cache.num_data_array_reads 10 # number of data array reads
+system.tcp_cntrl1.L1cache.num_data_array_writes 108 # number of data array writes
+system.tcp_cntrl1.L1cache.num_tag_array_reads 300 # number of tag array reads
+system.tcp_cntrl1.L1cache.num_tag_array_writes 289 # number of tag array writes
+system.tcp_cntrl1.L1cache.num_tag_array_stalls 44 # number of stalls caused by tag array
+system.tcp_cntrl1.coalescer.gpu_tcp_ld_hits 1 # loads that hit in the TCP
+system.tcp_cntrl1.coalescer.gpu_tcp_ld_transfers 4 # TCP to TCP load transfers
+system.tcp_cntrl1.coalescer.gpu_tcc_ld_hits 0 # loads that hit in the TCC
+system.tcp_cntrl1.coalescer.gpu_ld_misses 1 # loads that miss in the GPU
+system.tcp_cntrl1.coalescer.gpu_tcp_st_hits 9 # stores that hit in the TCP
+system.tcp_cntrl1.coalescer.gpu_tcp_st_transfers 74 # TCP to TCP store transfers
+system.tcp_cntrl1.coalescer.gpu_tcc_st_hits 0 # stores that hit in the TCC
+system.tcp_cntrl1.coalescer.gpu_st_misses 20 # stores that miss in the GPU
+system.tcp_cntrl1.coalescer.cp_tcp_ld_hits 0 # loads that hit in the TCP
+system.tcp_cntrl1.coalescer.cp_tcp_ld_transfers 0 # TCP to TCP load transfers
+system.tcp_cntrl1.coalescer.cp_tcc_ld_hits 0 # loads that hit in the TCC
+system.tcp_cntrl1.coalescer.cp_ld_misses 0 # loads that miss in the GPU
+system.tcp_cntrl1.coalescer.cp_tcp_st_hits 0 # stores that hit in the TCP
+system.tcp_cntrl1.coalescer.cp_tcp_st_transfers 0 # TCP to TCP store transfers
+system.tcp_cntrl1.coalescer.cp_tcc_st_hits 0 # stores that hit in the TCC
+system.tcp_cntrl1.coalescer.cp_st_misses 0 # stores that miss in the GPU
+system.tcp_cntrl2.L1cache.demand_hits 0 # Number of cache demand hits
+system.tcp_cntrl2.L1cache.demand_misses 0 # Number of cache demand misses
+system.tcp_cntrl2.L1cache.demand_accesses 0 # Number of cache demand accesses
+system.tcp_cntrl2.L1cache.num_data_array_reads 19 # number of data array reads
+system.tcp_cntrl2.L1cache.num_data_array_writes 108 # number of data array writes
+system.tcp_cntrl2.L1cache.num_tag_array_reads 302 # number of tag array reads
+system.tcp_cntrl2.L1cache.num_tag_array_writes 292 # number of tag array writes
+system.tcp_cntrl2.L1cache.num_tag_array_stalls 36 # number of stalls caused by tag array
+system.tcp_cntrl2.L1cache.num_data_array_stalls 3 # number of stalls caused by data array
+system.tcp_cntrl2.coalescer.gpu_tcp_ld_hits 1 # loads that hit in the TCP
+system.tcp_cntrl2.coalescer.gpu_tcp_ld_transfers 9 # TCP to TCP load transfers
+system.tcp_cntrl2.coalescer.gpu_tcc_ld_hits 0 # loads that hit in the TCC
+system.tcp_cntrl2.coalescer.gpu_ld_misses 0 # loads that miss in the GPU
+system.tcp_cntrl2.coalescer.gpu_tcp_st_hits 7 # stores that hit in the TCP
+system.tcp_cntrl2.coalescer.gpu_tcp_st_transfers 72 # TCP to TCP store transfers
+system.tcp_cntrl2.coalescer.gpu_tcc_st_hits 0 # stores that hit in the TCC
+system.tcp_cntrl2.coalescer.gpu_st_misses 18 # stores that miss in the GPU
+system.tcp_cntrl2.coalescer.cp_tcp_ld_hits 0 # loads that hit in the TCP
+system.tcp_cntrl2.coalescer.cp_tcp_ld_transfers 0 # TCP to TCP load transfers
+system.tcp_cntrl2.coalescer.cp_tcc_ld_hits 0 # loads that hit in the TCC
+system.tcp_cntrl2.coalescer.cp_ld_misses 0 # loads that miss in the GPU
+system.tcp_cntrl2.coalescer.cp_tcp_st_hits 0 # stores that hit in the TCP
+system.tcp_cntrl2.coalescer.cp_tcp_st_transfers 0 # TCP to TCP store transfers
+system.tcp_cntrl2.coalescer.cp_tcc_st_hits 0 # stores that hit in the TCC
+system.tcp_cntrl2.coalescer.cp_st_misses 0 # stores that miss in the GPU
+system.tcp_cntrl3.L1cache.demand_hits 0 # Number of cache demand hits
+system.tcp_cntrl3.L1cache.demand_misses 0 # Number of cache demand misses
+system.tcp_cntrl3.L1cache.demand_accesses 0 # Number of cache demand accesses
+system.tcp_cntrl3.L1cache.num_data_array_reads 7 # number of data array reads
+system.tcp_cntrl3.L1cache.num_data_array_writes 104 # number of data array writes
+system.tcp_cntrl3.L1cache.num_tag_array_reads 272 # number of tag array reads
+system.tcp_cntrl3.L1cache.num_tag_array_writes 262 # number of tag array writes
+system.tcp_cntrl3.L1cache.num_tag_array_stalls 16 # number of stalls caused by tag array
+system.tcp_cntrl3.L1cache.num_data_array_stalls 3 # number of stalls caused by data array
+system.tcp_cntrl3.coalescer.gpu_tcp_ld_hits 0 # loads that hit in the TCP
+system.tcp_cntrl3.coalescer.gpu_tcp_ld_transfers 13 # TCP to TCP load transfers
+system.tcp_cntrl3.coalescer.gpu_tcc_ld_hits 0 # loads that hit in the TCC
+system.tcp_cntrl3.coalescer.gpu_ld_misses 0 # loads that miss in the GPU
+system.tcp_cntrl3.coalescer.gpu_tcp_st_hits 10 # stores that hit in the TCP
+system.tcp_cntrl3.coalescer.gpu_tcp_st_transfers 63 # TCP to TCP store transfers
+system.tcp_cntrl3.coalescer.gpu_tcc_st_hits 0 # stores that hit in the TCC
+system.tcp_cntrl3.coalescer.gpu_st_misses 18 # stores that miss in the GPU
+system.tcp_cntrl3.coalescer.cp_tcp_ld_hits 0 # loads that hit in the TCP
+system.tcp_cntrl3.coalescer.cp_tcp_ld_transfers 0 # TCP to TCP load transfers
+system.tcp_cntrl3.coalescer.cp_tcc_ld_hits 0 # loads that hit in the TCC
+system.tcp_cntrl3.coalescer.cp_ld_misses 0 # loads that miss in the GPU
+system.tcp_cntrl3.coalescer.cp_tcp_st_hits 0 # stores that hit in the TCP
+system.tcp_cntrl3.coalescer.cp_tcp_st_transfers 0 # TCP to TCP store transfers
+system.tcp_cntrl3.coalescer.cp_tcc_st_hits 0 # stores that hit in the TCC
+system.tcp_cntrl3.coalescer.cp_st_misses 0 # stores that miss in the GPU
+system.tcp_cntrl4.L1cache.demand_hits 0 # Number of cache demand hits
+system.tcp_cntrl4.L1cache.demand_misses 0 # Number of cache demand misses
+system.tcp_cntrl4.L1cache.demand_accesses 0 # Number of cache demand accesses
+system.tcp_cntrl4.L1cache.num_data_array_reads 14 # number of data array reads
+system.tcp_cntrl4.L1cache.num_data_array_writes 115 # number of data array writes
+system.tcp_cntrl4.L1cache.num_tag_array_reads 317 # number of tag array reads
+system.tcp_cntrl4.L1cache.num_tag_array_writes 309 # number of tag array writes
+system.tcp_cntrl4.L1cache.num_tag_array_stalls 29 # number of stalls caused by tag array
+system.tcp_cntrl4.coalescer.gpu_tcp_ld_hits 0 # loads that hit in the TCP
+system.tcp_cntrl4.coalescer.gpu_tcp_ld_transfers 4 # TCP to TCP load transfers
+system.tcp_cntrl4.coalescer.gpu_tcc_ld_hits 0 # loads that hit in the TCC
+system.tcp_cntrl4.coalescer.gpu_ld_misses 0 # loads that miss in the GPU
+system.tcp_cntrl4.coalescer.gpu_tcp_st_hits 6 # stores that hit in the TCP
+system.tcp_cntrl4.coalescer.gpu_tcp_st_transfers 76 # TCP to TCP store transfers
+system.tcp_cntrl4.coalescer.gpu_tcc_st_hits 0 # stores that hit in the TCC
+system.tcp_cntrl4.coalescer.gpu_st_misses 26 # stores that miss in the GPU
+system.tcp_cntrl4.coalescer.cp_tcp_ld_hits 0 # loads that hit in the TCP
+system.tcp_cntrl4.coalescer.cp_tcp_ld_transfers 0 # TCP to TCP load transfers
+system.tcp_cntrl4.coalescer.cp_tcc_ld_hits 0 # loads that hit in the TCC
+system.tcp_cntrl4.coalescer.cp_ld_misses 0 # loads that miss in the GPU
+system.tcp_cntrl4.coalescer.cp_tcp_st_hits 0 # stores that hit in the TCP
+system.tcp_cntrl4.coalescer.cp_tcp_st_transfers 0 # TCP to TCP store transfers
+system.tcp_cntrl4.coalescer.cp_tcc_st_hits 0 # stores that hit in the TCC
+system.tcp_cntrl4.coalescer.cp_st_misses 0 # stores that miss in the GPU
+system.tcp_cntrl5.L1cache.demand_hits 0 # Number of cache demand hits
+system.tcp_cntrl5.L1cache.demand_misses 0 # Number of cache demand misses
+system.tcp_cntrl5.L1cache.demand_accesses 0 # Number of cache demand accesses
+system.tcp_cntrl5.L1cache.num_data_array_reads 10 # number of data array reads
+system.tcp_cntrl5.L1cache.num_data_array_writes 107 # number of data array writes
+system.tcp_cntrl5.L1cache.num_tag_array_reads 295 # number of tag array reads
+system.tcp_cntrl5.L1cache.num_tag_array_writes 287 # number of tag array writes
+system.tcp_cntrl5.L1cache.num_tag_array_stalls 31 # number of stalls caused by tag array
+system.tcp_cntrl5.coalescer.gpu_tcp_ld_hits 0 # loads that hit in the TCP
+system.tcp_cntrl5.coalescer.gpu_tcp_ld_transfers 6 # TCP to TCP load transfers
+system.tcp_cntrl5.coalescer.gpu_tcc_ld_hits 0 # loads that hit in the TCC
+system.tcp_cntrl5.coalescer.gpu_ld_misses 0 # loads that miss in the GPU
+system.tcp_cntrl5.coalescer.gpu_tcp_st_hits 8 # stores that hit in the TCP
+system.tcp_cntrl5.coalescer.gpu_tcp_st_transfers 69 # TCP to TCP store transfers
+system.tcp_cntrl5.coalescer.gpu_tcc_st_hits 0 # stores that hit in the TCC
+system.tcp_cntrl5.coalescer.gpu_st_misses 23 # stores that miss in the GPU
+system.tcp_cntrl5.coalescer.cp_tcp_ld_hits 0 # loads that hit in the TCP
+system.tcp_cntrl5.coalescer.cp_tcp_ld_transfers 0 # TCP to TCP load transfers
+system.tcp_cntrl5.coalescer.cp_tcc_ld_hits 0 # loads that hit in the TCC
+system.tcp_cntrl5.coalescer.cp_ld_misses 0 # loads that miss in the GPU
+system.tcp_cntrl5.coalescer.cp_tcp_st_hits 0 # stores that hit in the TCP
+system.tcp_cntrl5.coalescer.cp_tcp_st_transfers 0 # TCP to TCP store transfers
+system.tcp_cntrl5.coalescer.cp_tcc_st_hits 0 # stores that hit in the TCC
+system.tcp_cntrl5.coalescer.cp_st_misses 0 # stores that miss in the GPU
+system.tcp_cntrl6.L1cache.demand_hits 0 # Number of cache demand hits
+system.tcp_cntrl6.L1cache.demand_misses 0 # Number of cache demand misses
+system.tcp_cntrl6.L1cache.demand_accesses 0 # Number of cache demand accesses
+system.tcp_cntrl6.L1cache.num_data_array_reads 13 # number of data array reads
+system.tcp_cntrl6.L1cache.num_data_array_writes 123 # number of data array writes
+system.tcp_cntrl6.L1cache.num_tag_array_reads 342 # number of tag array reads
+system.tcp_cntrl6.L1cache.num_tag_array_writes 335 # number of tag array writes
+system.tcp_cntrl6.L1cache.num_tag_array_stalls 49 # number of stalls caused by tag array
+system.tcp_cntrl6.coalescer.gpu_tcp_ld_hits 1 # loads that hit in the TCP
+system.tcp_cntrl6.coalescer.gpu_tcp_ld_transfers 11 # TCP to TCP load transfers
+system.tcp_cntrl6.coalescer.gpu_tcc_ld_hits 0 # loads that hit in the TCC
+system.tcp_cntrl6.coalescer.gpu_ld_misses 1 # loads that miss in the GPU
+system.tcp_cntrl6.coalescer.gpu_tcp_st_hits 5 # stores that hit in the TCP
+system.tcp_cntrl6.coalescer.gpu_tcp_st_transfers 86 # TCP to TCP store transfers
+system.tcp_cntrl6.coalescer.gpu_tcc_st_hits 0 # stores that hit in the TCC
+system.tcp_cntrl6.coalescer.gpu_st_misses 19 # stores that miss in the GPU
+system.tcp_cntrl6.coalescer.cp_tcp_ld_hits 0 # loads that hit in the TCP
+system.tcp_cntrl6.coalescer.cp_tcp_ld_transfers 0 # TCP to TCP load transfers
+system.tcp_cntrl6.coalescer.cp_tcc_ld_hits 0 # loads that hit in the TCC
+system.tcp_cntrl6.coalescer.cp_ld_misses 0 # loads that miss in the GPU
+system.tcp_cntrl6.coalescer.cp_tcp_st_hits 0 # stores that hit in the TCP
+system.tcp_cntrl6.coalescer.cp_tcp_st_transfers 0 # TCP to TCP store transfers
+system.tcp_cntrl6.coalescer.cp_tcc_st_hits 0 # stores that hit in the TCC
+system.tcp_cntrl6.coalescer.cp_st_misses 0 # stores that miss in the GPU
+system.tcp_cntrl7.L1cache.demand_hits 0 # Number of cache demand hits
+system.tcp_cntrl7.L1cache.demand_misses 0 # Number of cache demand misses
+system.tcp_cntrl7.L1cache.demand_accesses 0 # Number of cache demand accesses
+system.tcp_cntrl7.L1cache.num_data_array_reads 10 # number of data array reads
+system.tcp_cntrl7.L1cache.num_data_array_writes 97 # number of data array writes
+system.tcp_cntrl7.L1cache.num_tag_array_reads 263 # number of tag array reads
+system.tcp_cntrl7.L1cache.num_tag_array_writes 256 # number of tag array writes
+system.tcp_cntrl7.L1cache.num_tag_array_stalls 11 # number of stalls caused by tag array
+system.tcp_cntrl7.coalescer.gpu_tcp_ld_hits 1 # loads that hit in the TCP
+system.tcp_cntrl7.coalescer.gpu_tcp_ld_transfers 10 # TCP to TCP load transfers
+system.tcp_cntrl7.coalescer.gpu_tcc_ld_hits 0 # loads that hit in the TCC
+system.tcp_cntrl7.coalescer.gpu_ld_misses 1 # loads that miss in the GPU
+system.tcp_cntrl7.coalescer.gpu_tcp_st_hits 6 # stores that hit in the TCP
+system.tcp_cntrl7.coalescer.gpu_tcp_st_transfers 63 # TCP to TCP store transfers
+system.tcp_cntrl7.coalescer.gpu_tcc_st_hits 0 # stores that hit in the TCC
+system.tcp_cntrl7.coalescer.gpu_st_misses 16 # stores that miss in the GPU
+system.tcp_cntrl7.coalescer.cp_tcp_ld_hits 0 # loads that hit in the TCP
+system.tcp_cntrl7.coalescer.cp_tcp_ld_transfers 0 # TCP to TCP load transfers
+system.tcp_cntrl7.coalescer.cp_tcc_ld_hits 0 # loads that hit in the TCC
+system.tcp_cntrl7.coalescer.cp_ld_misses 0 # loads that miss in the GPU
+system.tcp_cntrl7.coalescer.cp_tcp_st_hits 0 # stores that hit in the TCP
+system.tcp_cntrl7.coalescer.cp_tcp_st_transfers 0 # TCP to TCP store transfers
+system.tcp_cntrl7.coalescer.cp_tcc_st_hits 0 # stores that hit in the TCC
+system.tcp_cntrl7.coalescer.cp_st_misses 0 # stores that miss in the GPU
+system.sqc_cntrl0.L1cache.demand_hits 0 # Number of cache demand hits
+system.sqc_cntrl0.L1cache.demand_misses 0 # Number of cache demand misses
+system.sqc_cntrl0.L1cache.demand_accesses 0 # Number of cache demand accesses
+system.sqc_cntrl0.L1cache.num_data_array_reads 12 # number of data array reads
+system.sqc_cntrl0.L1cache.num_data_array_writes 12 # number of data array writes
+system.sqc_cntrl0.L1cache.num_tag_array_reads 22 # number of tag array reads
+system.sqc_cntrl0.L1cache.num_tag_array_writes 22 # number of tag array writes
+system.sqc_cntrl1.L1cache.demand_hits 0 # Number of cache demand hits
+system.sqc_cntrl1.L1cache.demand_misses 0 # Number of cache demand misses
+system.sqc_cntrl1.L1cache.demand_accesses 0 # Number of cache demand accesses
+system.sqc_cntrl1.L1cache.num_data_array_reads 15 # number of data array reads
+system.sqc_cntrl1.L1cache.num_data_array_writes 15 # number of data array writes
+system.sqc_cntrl1.L1cache.num_tag_array_reads 29 # number of tag array reads
+system.sqc_cntrl1.L1cache.num_tag_array_writes 29 # number of tag array writes
+system.tcc_cntrl0.L2cache.demand_hits 0 # Number of cache demand hits
+system.tcc_cntrl0.L2cache.demand_misses 0 # Number of cache demand misses
+system.tcc_cntrl0.L2cache.demand_accesses 0 # Number of cache demand accesses
+system.tccdir_cntrl0.directory.demand_hits 0 # Number of cache demand hits
+system.tccdir_cntrl0.directory.demand_misses 0 # Number of cache demand misses
+system.tccdir_cntrl0.directory.demand_accesses 0 # Number of cache demand accesses
+system.tccdir_cntrl0.directory.num_tag_array_reads 917 # number of tag array reads
+system.tccdir_cntrl0.directory.num_tag_array_writes 902 # number of tag array writes
+system.ruby.network.msg_count.Control 1430
+system.ruby.network.msg_count.Request_Control 1616
+system.ruby.network.msg_count.Response_Data 2430
+system.ruby.network.msg_count.Response_Control 456
+system.ruby.network.msg_count.Writeback_Data 132
+system.ruby.network.msg_count.Writeback_Control 140
+system.ruby.network.msg_count.Unblock_Control 1437
+system.ruby.network.msg_byte.Control 11440
+system.ruby.network.msg_byte.Request_Control 12928
+system.ruby.network.msg_byte.Response_Data 174960
+system.ruby.network.msg_byte.Response_Control 3648
+system.ruby.network.msg_byte.Writeback_Data 9504
+system.ruby.network.msg_byte.Writeback_Control 1120
+system.ruby.network.msg_byte.Unblock_Control 11496
+system.ruby.network.ext_links00.int_node.throttle0.link_utilization 0.250555
+system.ruby.network.ext_links00.int_node.throttle0.msg_count.Request_Control::0 385
+system.ruby.network.ext_links00.int_node.throttle0.msg_count.Response_Data::2 85
+system.ruby.network.ext_links00.int_node.throttle0.msg_count.Response_Control::2 227
+system.ruby.network.ext_links00.int_node.throttle0.msg_count.Writeback_Data::2 66
+system.ruby.network.ext_links00.int_node.throttle0.msg_count.Unblock_Control::4 303
+system.ruby.network.ext_links00.int_node.throttle0.msg_bytes.Request_Control::0 3080
+system.ruby.network.ext_links00.int_node.throttle0.msg_bytes.Response_Data::2 6120
+system.ruby.network.ext_links00.int_node.throttle0.msg_bytes.Response_Control::2 1816
+system.ruby.network.ext_links00.int_node.throttle0.msg_bytes.Writeback_Data::2 4752
+system.ruby.network.ext_links00.int_node.throttle0.msg_bytes.Unblock_Control::4 2424
+system.ruby.network.ext_links00.int_node.throttle1.link_utilization 0.113047
+system.ruby.network.ext_links00.int_node.throttle1.msg_count.Control::0 227
+system.ruby.network.ext_links00.int_node.throttle1.msg_count.Response_Data::2 81
+system.ruby.network.ext_links00.int_node.throttle1.msg_count.Writeback_Control::2 70
+system.ruby.network.ext_links00.int_node.throttle1.msg_bytes.Control::0 1816
+system.ruby.network.ext_links00.int_node.throttle1.msg_bytes.Response_Data::2 5832
+system.ruby.network.ext_links00.int_node.throttle1.msg_bytes.Writeback_Control::2 560
+system.ruby.network.ext_links00.int_node.throttle2.link_utilization 0.234028
+system.ruby.network.ext_links00.int_node.throttle2.msg_count.Control::0 81
+system.ruby.network.ext_links00.int_node.throttle2.msg_count.Response_Data::2 227
+system.ruby.network.ext_links00.int_node.throttle2.msg_bytes.Control::0 648
+system.ruby.network.ext_links00.int_node.throttle2.msg_bytes.Response_Data::2 16344
+system.ruby.network.ext_links01.int_node.throttle0.link_utilization 0.113047
+system.ruby.network.ext_links01.int_node.throttle0.msg_count.Control::0 227
+system.ruby.network.ext_links01.int_node.throttle0.msg_count.Response_Data::2 81
+system.ruby.network.ext_links01.int_node.throttle0.msg_count.Writeback_Control::2 70
+system.ruby.network.ext_links01.int_node.throttle0.msg_bytes.Control::0 1816
+system.ruby.network.ext_links01.int_node.throttle0.msg_bytes.Response_Data::2 5832
+system.ruby.network.ext_links01.int_node.throttle0.msg_bytes.Writeback_Control::2 560
+system.ruby.network.ext_links01.int_node.throttle1.link_utilization 0.128914
+system.ruby.network.ext_links01.int_node.throttle1.msg_count.Request_Control::0 153
+system.ruby.network.ext_links01.int_node.throttle1.msg_count.Response_Data::2 14
+system.ruby.network.ext_links01.int_node.throttle1.msg_count.Response_Control::2 217
+system.ruby.network.ext_links01.int_node.throttle1.msg_count.Writeback_Data::2 66
+system.ruby.network.ext_links01.int_node.throttle1.msg_count.Unblock_Control::4 80
+system.ruby.network.ext_links01.int_node.throttle1.msg_bytes.Request_Control::0 1224
+system.ruby.network.ext_links01.int_node.throttle1.msg_bytes.Response_Data::2 1008
+system.ruby.network.ext_links01.int_node.throttle1.msg_bytes.Response_Control::2 1736
+system.ruby.network.ext_links01.int_node.throttle1.msg_bytes.Writeback_Data::2 4752
+system.ruby.network.ext_links01.int_node.throttle1.msg_bytes.Unblock_Control::4 640
+system.ruby.network.ext_links02.int_node.throttle0.link_utilization 0.115361
+system.ruby.network.ext_links02.int_node.throttle0.msg_count.Control::1 102
+system.ruby.network.ext_links02.int_node.throttle0.msg_count.Response_Data::3 105
+system.ruby.network.ext_links02.int_node.throttle0.msg_bytes.Control::1 816
+system.ruby.network.ext_links02.int_node.throttle0.msg_bytes.Response_Data::3 7560
+system.ruby.network.ext_links02.int_node.throttle1.link_utilization 0.108750
+system.ruby.network.ext_links02.int_node.throttle1.msg_count.Control::1 96
+system.ruby.network.ext_links02.int_node.throttle1.msg_count.Response_Data::3 99
+system.ruby.network.ext_links02.int_node.throttle1.msg_bytes.Control::1 768
+system.ruby.network.ext_links02.int_node.throttle1.msg_bytes.Response_Data::3 7128
+system.ruby.network.ext_links02.int_node.throttle2.link_utilization 0.109742
+system.ruby.network.ext_links02.int_node.throttle2.msg_count.Control::1 105
+system.ruby.network.ext_links02.int_node.throttle2.msg_count.Response_Data::3 99
+system.ruby.network.ext_links02.int_node.throttle2.msg_bytes.Control::1 840
+system.ruby.network.ext_links02.int_node.throttle2.msg_bytes.Response_Data::3 7128
+system.ruby.network.ext_links02.int_node.throttle3.link_utilization 0.102690
+system.ruby.network.ext_links02.int_node.throttle3.msg_count.Control::1 86
+system.ruby.network.ext_links02.int_node.throttle3.msg_count.Response_Data::3 94
+system.ruby.network.ext_links02.int_node.throttle3.msg_bytes.Control::1 688
+system.ruby.network.ext_links02.int_node.throttle3.msg_bytes.Response_Data::3 6768
+system.ruby.network.ext_links02.int_node.throttle4.link_utilization 0.116573
+system.ruby.network.ext_links02.int_node.throttle4.msg_count.Control::1 104
+system.ruby.network.ext_links02.int_node.throttle4.msg_count.Response_Data::3 106
+system.ruby.network.ext_links02.int_node.throttle4.msg_bytes.Control::1 832
+system.ruby.network.ext_links02.int_node.throttle4.msg_bytes.Response_Data::3 7632
+system.ruby.network.ext_links02.int_node.throttle5.link_utilization 0.107759
+system.ruby.network.ext_links02.int_node.throttle5.msg_count.Control::1 96
+system.ruby.network.ext_links02.int_node.throttle5.msg_count.Response_Data::3 98
+system.ruby.network.ext_links02.int_node.throttle5.msg_bytes.Control::1 768
+system.ruby.network.ext_links02.int_node.throttle5.msg_bytes.Response_Data::3 7056
+system.ruby.network.ext_links02.int_node.throttle6.link_utilization 0.128473
+system.ruby.network.ext_links02.int_node.throttle6.msg_count.Control::1 113
+system.ruby.network.ext_links02.int_node.throttle6.msg_count.Response_Data::3 117
+system.ruby.network.ext_links02.int_node.throttle6.msg_bytes.Control::1 904
+system.ruby.network.ext_links02.int_node.throttle6.msg_bytes.Response_Data::3 8424
+system.ruby.network.ext_links02.int_node.throttle7.link_utilization 0.098944
+system.ruby.network.ext_links02.int_node.throttle7.msg_count.Control::1 88
+system.ruby.network.ext_links02.int_node.throttle7.msg_count.Response_Data::3 90
+system.ruby.network.ext_links02.int_node.throttle7.msg_bytes.Control::1 704
+system.ruby.network.ext_links02.int_node.throttle7.msg_bytes.Response_Data::3 6480
+system.ruby.network.ext_links02.int_node.throttle8.link_utilization 0
+system.ruby.network.ext_links02.int_node.throttle9.link_utilization 1.221264
+system.ruby.network.ext_links02.int_node.throttle9.msg_count.Control::0 81
+system.ruby.network.ext_links02.int_node.throttle9.msg_count.Request_Control::1 846
+system.ruby.network.ext_links02.int_node.throttle9.msg_count.Response_Data::2 227
+system.ruby.network.ext_links02.int_node.throttle9.msg_count.Response_Data::3 809
+system.ruby.network.ext_links02.int_node.throttle9.msg_count.Response_Control::3 2
+system.ruby.network.ext_links02.int_node.throttle9.msg_count.Unblock_Control::5 831
+system.ruby.network.ext_links02.int_node.throttle9.msg_bytes.Control::0 648
+system.ruby.network.ext_links02.int_node.throttle9.msg_bytes.Request_Control::1 6768
+system.ruby.network.ext_links02.int_node.throttle9.msg_bytes.Response_Data::2 16344
+system.ruby.network.ext_links02.int_node.throttle9.msg_bytes.Response_Data::3 58248
+system.ruby.network.ext_links02.int_node.throttle9.msg_bytes.Response_Control::3 16
+system.ruby.network.ext_links02.int_node.throttle9.msg_bytes.Unblock_Control::5 6648
+system.ruby.network.ext_links02.int_node.throttle10.link_utilization 0.013002
+system.ruby.network.ext_links02.int_node.throttle10.msg_count.Control::1 10
+system.ruby.network.ext_links02.int_node.throttle10.msg_count.Response_Data::3 12
+system.ruby.network.ext_links02.int_node.throttle10.msg_bytes.Control::1 80
+system.ruby.network.ext_links02.int_node.throttle10.msg_bytes.Response_Data::3 864
+system.ruby.network.ext_links02.int_node.throttle11.link_utilization 0.016417
+system.ruby.network.ext_links02.int_node.throttle11.msg_count.Control::1 14
+system.ruby.network.ext_links02.int_node.throttle11.msg_count.Response_Data::3 15
+system.ruby.network.ext_links02.int_node.throttle11.msg_bytes.Control::1 112
+system.ruby.network.ext_links02.int_node.throttle11.msg_bytes.Response_Data::3 1080
+system.ruby.network.ext_links02.int_node.throttle12.link_utilization 0.121642
+system.ruby.network.ext_links02.int_node.throttle12.msg_count.Request_Control::0 232
+system.ruby.network.ext_links02.int_node.throttle12.msg_count.Response_Data::2 71
+system.ruby.network.ext_links02.int_node.throttle12.msg_count.Response_Control::2 10
+system.ruby.network.ext_links02.int_node.throttle12.msg_count.Unblock_Control::4 223
+system.ruby.network.ext_links02.int_node.throttle12.msg_bytes.Request_Control::0 1856
+system.ruby.network.ext_links02.int_node.throttle12.msg_bytes.Response_Data::2 5112
+system.ruby.network.ext_links02.int_node.throttle12.msg_bytes.Response_Control::2 80
+system.ruby.network.ext_links02.int_node.throttle12.msg_bytes.Unblock_Control::4 1784
+system.ruby.CorePair_Controller.C0_Load_L1miss 1 0.00% 0.00%
+system.ruby.CorePair_Controller.C1_Load_L1miss 1 0.00% 0.00%
+system.ruby.CorePair_Controller.Ifetch0_L1miss 2 0.00% 0.00%
+system.ruby.CorePair_Controller.Ifetch1_L1miss 1 0.00% 0.00%
+system.ruby.CorePair_Controller.C0_Store_L1miss 45 0.00% 0.00%
+system.ruby.CorePair_Controller.C0_Store_L1hit 2 0.00% 0.00%
+system.ruby.CorePair_Controller.C1_Store_L1miss 73 0.00% 0.00%
+system.ruby.CorePair_Controller.NB_AckS 4 0.00% 0.00%
+system.ruby.CorePair_Controller.NB_AckM 77 0.00% 0.00%
+system.ruby.CorePair_Controller.NB_AckWB 70 0.00% 0.00%
+system.ruby.CorePair_Controller.L1D0_Repl 19 0.00% 0.00%
+system.ruby.CorePair_Controller.L2_Repl 36624 0.00% 0.00%
+system.ruby.CorePair_Controller.PrbInvData 223 0.00% 0.00%
+system.ruby.CorePair_Controller.PrbShrData 4 0.00% 0.00%
+system.ruby.CorePair_Controller.I.C0_Load_L1miss 1 0.00% 0.00%
+system.ruby.CorePair_Controller.I.C1_Load_L1miss 1 0.00% 0.00%
+system.ruby.CorePair_Controller.I.Ifetch0_L1miss 2 0.00% 0.00%
+system.ruby.CorePair_Controller.I.Ifetch1_L1miss 1 0.00% 0.00%
+system.ruby.CorePair_Controller.I.C0_Store_L1miss 41 0.00% 0.00%
+system.ruby.CorePair_Controller.I.C1_Store_L1miss 37 0.00% 0.00%
+system.ruby.CorePair_Controller.I.PrbInvData 209 0.00% 0.00%
+system.ruby.CorePair_Controller.I.PrbShrData 3 0.00% 0.00%
+system.ruby.CorePair_Controller.S.L2_Repl 3 0.00% 0.00%
+system.ruby.CorePair_Controller.S.PrbInvData 1 0.00% 0.00%
+system.ruby.CorePair_Controller.O.PrbInvData 1 0.00% 0.00%
+system.ruby.CorePair_Controller.M0.C0_Store_L1hit 2 0.00% 0.00%
+system.ruby.CorePair_Controller.M0.L2_Repl 33 0.00% 0.00%
+system.ruby.CorePair_Controller.M0.PrbInvData 5 0.00% 0.00%
+system.ruby.CorePair_Controller.M0.PrbShrData 1 0.00% 0.00%
+system.ruby.CorePair_Controller.M1.C0_Store_L1miss 1 0.00% 0.00%
+system.ruby.CorePair_Controller.M1.L2_Repl 36 0.00% 0.00%
+system.ruby.CorePair_Controller.M1.PrbInvData 2 0.00% 0.00%
+system.ruby.CorePair_Controller.I_M0.C1_Store_L1miss 5 0.00% 0.00%
+system.ruby.CorePair_Controller.I_M0.NB_AckM 35 0.00% 0.00%
+system.ruby.CorePair_Controller.I_M0.L1D0_Repl 11 0.00% 0.00%
+system.ruby.CorePair_Controller.I_M0.L2_Repl 16208 0.00% 0.00%
+system.ruby.CorePair_Controller.I_M1.C0_Store_L1miss 3 0.00% 0.00%
+system.ruby.CorePair_Controller.I_M1.NB_AckM 34 0.00% 0.00%
+system.ruby.CorePair_Controller.I_M1.L2_Repl 14782 0.00% 0.00%
+system.ruby.CorePair_Controller.I_M0M1.NB_AckM 5 0.00% 0.00%
+system.ruby.CorePair_Controller.I_M0M1.L2_Repl 3020 0.00% 0.00%
+system.ruby.CorePair_Controller.I_M1M0.NB_AckM 3 0.00% 0.00%
+system.ruby.CorePair_Controller.I_M1M0.L2_Repl 1059 0.00% 0.00%
+system.ruby.CorePair_Controller.I_E0S.NB_AckS 1 0.00% 0.00%
+system.ruby.CorePair_Controller.I_E0S.L1D0_Repl 8 0.00% 0.00%
+system.ruby.CorePair_Controller.I_E0S.L2_Repl 493 0.00% 0.00%
+system.ruby.CorePair_Controller.I_E1S.NB_AckS 1 0.00% 0.00%
+system.ruby.CorePair_Controller.I_E1S.L2_Repl 638 0.00% 0.00%
+system.ruby.CorePair_Controller.ES_I.NB_AckWB 2 0.00% 0.00%
+system.ruby.CorePair_Controller.MO_I.NB_AckWB 64 0.00% 0.00%
+system.ruby.CorePair_Controller.MO_I.PrbInvData 5 0.00% 0.00%
+system.ruby.CorePair_Controller.S0.C1_Store_L1miss 31 0.00% 0.00%
+system.ruby.CorePair_Controller.S0.NB_AckS 1 0.00% 0.00%
+system.ruby.CorePair_Controller.S0.L2_Repl 352 0.00% 0.00%
+system.ruby.CorePair_Controller.S1.NB_AckS 1 0.00% 0.00%
+system.ruby.CorePair_Controller.I_C.NB_AckWB 4 0.00% 0.00%
+system.ruby.Directory_Controller.RdBlkS 3 0.00% 0.00%
+system.ruby.Directory_Controller.RdBlkM 309 0.00% 0.00%
+system.ruby.Directory_Controller.RdBlk 6 0.00% 0.00%
+system.ruby.Directory_Controller.VicDirty 68 0.00% 0.00%
+system.ruby.Directory_Controller.VicClean 2 0.00% 0.00%
+system.ruby.Directory_Controller.CPUData 66 0.00% 0.00%
+system.ruby.Directory_Controller.StaleWB 4 0.00% 0.00%
+system.ruby.Directory_Controller.CPUPrbResp 308 0.00% 0.00%
+system.ruby.Directory_Controller.ProbeAcksComplete 308 0.00% 0.00%
+system.ruby.Directory_Controller.L3Hit 49 0.00% 0.00%
+system.ruby.Directory_Controller.MemData 259 0.00% 0.00%
+system.ruby.Directory_Controller.WBAck 9 0.00% 0.00%
+system.ruby.Directory_Controller.CoreUnblock 303 0.00% 0.00%
+system.ruby.Directory_Controller.U.RdBlkS 3 0.00% 0.00%
+system.ruby.Directory_Controller.U.RdBlkM 300 0.00% 0.00%
+system.ruby.Directory_Controller.U.RdBlk 5 0.00% 0.00%
+system.ruby.Directory_Controller.U.VicDirty 68 0.00% 0.00%
+system.ruby.Directory_Controller.U.VicClean 2 0.00% 0.00%
+system.ruby.Directory_Controller.U.WBAck 9 0.00% 0.00%
+system.ruby.Directory_Controller.BL.RdBlkM 1 0.00% 0.00%
+system.ruby.Directory_Controller.BL.CPUData 66 0.00% 0.00%
+system.ruby.Directory_Controller.BL.StaleWB 4 0.00% 0.00%
+system.ruby.Directory_Controller.BM_M.MemData 8 0.00% 0.00%
+system.ruby.Directory_Controller.BS_PM.L3Hit 1 0.00% 0.00%
+system.ruby.Directory_Controller.BS_PM.MemData 2 0.00% 0.00%
+system.ruby.Directory_Controller.BM_PM.RdBlkM 1 0.00% 0.00%
+system.ruby.Directory_Controller.BM_PM.CPUPrbResp 12 0.00% 0.00%
+system.ruby.Directory_Controller.BM_PM.ProbeAcksComplete 8 0.00% 0.00%
+system.ruby.Directory_Controller.BM_PM.L3Hit 46 0.00% 0.00%
+system.ruby.Directory_Controller.BM_PM.MemData 246 0.00% 0.00%
+system.ruby.Directory_Controller.B_PM.L3Hit 2 0.00% 0.00%
+system.ruby.Directory_Controller.B_PM.MemData 3 0.00% 0.00%
+system.ruby.Directory_Controller.BS_Pm.CPUPrbResp 3 0.00% 0.00%
+system.ruby.Directory_Controller.BS_Pm.ProbeAcksComplete 3 0.00% 0.00%
+system.ruby.Directory_Controller.BM_Pm.RdBlkM 3 0.00% 0.00%
+system.ruby.Directory_Controller.BM_Pm.CPUPrbResp 288 0.00% 0.00%
+system.ruby.Directory_Controller.BM_Pm.ProbeAcksComplete 292 0.00% 0.00%
+system.ruby.Directory_Controller.B_Pm.CPUPrbResp 5 0.00% 0.00%
+system.ruby.Directory_Controller.B_Pm.ProbeAcksComplete 5 0.00% 0.00%
+system.ruby.Directory_Controller.B.RdBlkM 4 0.00% 0.00%
+system.ruby.Directory_Controller.B.RdBlk 1 0.00% 0.00%
+system.ruby.Directory_Controller.B.CoreUnblock 303 0.00% 0.00%
+system.ruby.LD.latency_hist::bucket_size 1024
+system.ruby.LD.latency_hist::max_bucket 10239
+system.ruby.LD.latency_hist::samples 1
+system.ruby.LD.latency_hist::mean 5324
+system.ruby.LD.latency_hist::gmean 5324.000000
+system.ruby.LD.latency_hist::stdev nan
+system.ruby.LD.latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 1 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.LD.latency_hist::total 1
+system.ruby.LD.hit_latency_hist::bucket_size 1024
+system.ruby.LD.hit_latency_hist::max_bucket 10239
+system.ruby.LD.hit_latency_hist::samples 1
+system.ruby.LD.hit_latency_hist::mean 5324
+system.ruby.LD.hit_latency_hist::gmean 5324.000000
+system.ruby.LD.hit_latency_hist::stdev nan
+system.ruby.LD.hit_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 1 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.LD.hit_latency_hist::total 1
+system.ruby.ST.latency_hist::bucket_size 1024
+system.ruby.ST.latency_hist::max_bucket 10239
+system.ruby.ST.latency_hist::samples 46
+system.ruby.ST.latency_hist::mean 3269.239130
+system.ruby.ST.latency_hist::gmean 1783.447677
+system.ruby.ST.latency_hist::stdev 1934.416354
+system.ruby.ST.latency_hist | 11 23.91% 23.91% | 3 6.52% 30.43% | 3 6.52% 36.96% | 7 15.22% 52.17% | 18 39.13% 91.30% | 4 8.70% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.ST.latency_hist::total 46
+system.ruby.ST.hit_latency_hist::bucket_size 1024
+system.ruby.ST.hit_latency_hist::max_bucket 10239
+system.ruby.ST.hit_latency_hist::samples 40
+system.ruby.ST.hit_latency_hist::mean 3606.650000
+system.ruby.ST.hit_latency_hist::gmean 2691.718970
+system.ruby.ST.hit_latency_hist::stdev 1792.166924
+system.ruby.ST.hit_latency_hist | 7 17.50% 17.50% | 3 7.50% 25.00% | 1 2.50% 27.50% | 7 17.50% 45.00% | 18 45.00% 90.00% | 4 10.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.ST.hit_latency_hist::total 40
+system.ruby.ST.miss_latency_hist::bucket_size 512
+system.ruby.ST.miss_latency_hist::max_bucket 5119
+system.ruby.ST.miss_latency_hist::samples 6
+system.ruby.ST.miss_latency_hist::mean 1019.833333
+system.ruby.ST.miss_latency_hist::gmean 114.673945
+system.ruby.ST.miss_latency_hist::stdev 1281.644790
+system.ruby.ST.miss_latency_hist | 3 50.00% 50.00% | 1 16.67% 66.67% | 0 0.00% 66.67% | 0 0.00% 66.67% | 0 0.00% 66.67% | 2 33.33% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.ST.miss_latency_hist::total 6
+system.ruby.IFETCH.latency_hist::bucket_size 1024
+system.ruby.IFETCH.latency_hist::max_bucket 10239
+system.ruby.IFETCH.latency_hist::samples 1
+system.ruby.IFETCH.latency_hist::mean 5156
+system.ruby.IFETCH.latency_hist::gmean 5156.000000
+system.ruby.IFETCH.latency_hist::stdev nan
+system.ruby.IFETCH.latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 1 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.IFETCH.latency_hist::total 1
+system.ruby.IFETCH.hit_latency_hist::bucket_size 1024
+system.ruby.IFETCH.hit_latency_hist::max_bucket 10239
+system.ruby.IFETCH.hit_latency_hist::samples 1
+system.ruby.IFETCH.hit_latency_hist::mean 5156
+system.ruby.IFETCH.hit_latency_hist::gmean 5156.000000
+system.ruby.IFETCH.hit_latency_hist::stdev nan
+system.ruby.IFETCH.hit_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 1 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.IFETCH.hit_latency_hist::total 1
+system.ruby.L1Cache.miss_mach_latency_hist::bucket_size 512
+system.ruby.L1Cache.miss_mach_latency_hist::max_bucket 5119
+system.ruby.L1Cache.miss_mach_latency_hist::samples 6
+system.ruby.L1Cache.miss_mach_latency_hist::mean 1019.833333
+system.ruby.L1Cache.miss_mach_latency_hist::gmean 114.673945
+system.ruby.L1Cache.miss_mach_latency_hist::stdev 1281.644790
+system.ruby.L1Cache.miss_mach_latency_hist | 3 50.00% 50.00% | 1 16.67% 66.67% | 0 0.00% 66.67% | 0 0.00% 66.67% | 0 0.00% 66.67% | 2 33.33% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.L1Cache.miss_mach_latency_hist::total 6
+system.ruby.Directory.hit_mach_latency_hist::bucket_size 1024
+system.ruby.Directory.hit_mach_latency_hist::max_bucket 10239
+system.ruby.Directory.hit_mach_latency_hist::samples 42
+system.ruby.Directory.hit_mach_latency_hist::mean 3684.428571
+system.ruby.Directory.hit_mach_latency_hist::gmean 2778.454716
+system.ruby.Directory.hit_mach_latency_hist::stdev 1783.107224
+system.ruby.Directory.hit_mach_latency_hist | 7 16.67% 16.67% | 3 7.14% 23.81% | 1 2.38% 26.19% | 7 16.67% 42.86% | 18 42.86% 85.71% | 6 14.29% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.Directory.hit_mach_latency_hist::total 42
+system.ruby.LD.Directory.hit_type_mach_latency_hist::bucket_size 1024
+system.ruby.LD.Directory.hit_type_mach_latency_hist::max_bucket 10239
+system.ruby.LD.Directory.hit_type_mach_latency_hist::samples 1
+system.ruby.LD.Directory.hit_type_mach_latency_hist::mean 5324
+system.ruby.LD.Directory.hit_type_mach_latency_hist::gmean 5324.000000
+system.ruby.LD.Directory.hit_type_mach_latency_hist::stdev nan
+system.ruby.LD.Directory.hit_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 1 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.LD.Directory.hit_type_mach_latency_hist::total 1
+system.ruby.ST.L1Cache.miss_type_mach_latency_hist::bucket_size 512
+system.ruby.ST.L1Cache.miss_type_mach_latency_hist::max_bucket 5119
+system.ruby.ST.L1Cache.miss_type_mach_latency_hist::samples 6
+system.ruby.ST.L1Cache.miss_type_mach_latency_hist::mean 1019.833333
+system.ruby.ST.L1Cache.miss_type_mach_latency_hist::gmean 114.673945
+system.ruby.ST.L1Cache.miss_type_mach_latency_hist::stdev 1281.644790
+system.ruby.ST.L1Cache.miss_type_mach_latency_hist | 3 50.00% 50.00% | 1 16.67% 66.67% | 0 0.00% 66.67% | 0 0.00% 66.67% | 0 0.00% 66.67% | 2 33.33% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.ST.L1Cache.miss_type_mach_latency_hist::total 6
+system.ruby.ST.Directory.hit_type_mach_latency_hist::bucket_size 1024
+system.ruby.ST.Directory.hit_type_mach_latency_hist::max_bucket 10239
+system.ruby.ST.Directory.hit_type_mach_latency_hist::samples 40
+system.ruby.ST.Directory.hit_type_mach_latency_hist::mean 3606.650000
+system.ruby.ST.Directory.hit_type_mach_latency_hist::gmean 2691.718970
+system.ruby.ST.Directory.hit_type_mach_latency_hist::stdev 1792.166924
+system.ruby.ST.Directory.hit_type_mach_latency_hist | 7 17.50% 17.50% | 3 7.50% 25.00% | 1 2.50% 27.50% | 7 17.50% 45.00% | 18 45.00% 90.00% | 4 10.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.ST.Directory.hit_type_mach_latency_hist::total 40
+system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::bucket_size 1024
+system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::max_bucket 10239
+system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::samples 1
+system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::mean 5156
+system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::gmean 5156.000000
+system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::stdev nan
+system.ruby.IFETCH.Directory.hit_type_mach_latency_hist | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 1 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.IFETCH.Directory.hit_type_mach_latency_hist::total 1
+system.ruby.SQC_Controller.Fetch | 12 44.44% 44.44% | 15 55.56% 100.00%
+system.ruby.SQC_Controller.Fetch::total 27
+system.ruby.SQC_Controller.TCC_AckS | 12 44.44% 44.44% | 15 55.56% 100.00%
+system.ruby.SQC_Controller.TCC_AckS::total 27
+system.ruby.SQC_Controller.PrbInvData | 10 41.67% 41.67% | 14 58.33% 100.00%
+system.ruby.SQC_Controller.PrbInvData::total 24
+system.ruby.SQC_Controller.I.Fetch | 12 44.44% 44.44% | 15 55.56% 100.00%
+system.ruby.SQC_Controller.I.Fetch::total 27
+system.ruby.SQC_Controller.S.PrbInvData | 10 41.67% 41.67% | 14 58.33% 100.00%
+system.ruby.SQC_Controller.S.PrbInvData::total 24
+system.ruby.SQC_Controller.I_S.TCC_AckS | 12 44.44% 44.44% | 15 55.56% 100.00%
+system.ruby.SQC_Controller.I_S.TCC_AckS::total 27
+system.ruby.TCCdir_Controller.RdBlk 174 0.00% 0.00%
+system.ruby.TCCdir_Controller.RdBlkM 2638 0.00% 0.00%
+system.ruby.TCCdir_Controller.RdBlkS 195 0.00% 0.00%
+system.ruby.TCCdir_Controller.CPUPrbResp 811 0.00% 0.00%
+system.ruby.TCCdir_Controller.ProbeAcksComplete 751 0.00% 0.00%
+system.ruby.TCCdir_Controller.CoreUnblock 829 0.00% 0.00%
+system.ruby.TCCdir_Controller.LastCoreUnblock 2 0.00% 0.00%
+system.ruby.TCCdir_Controller.NB_AckS 2 0.00% 0.00%
+system.ruby.TCCdir_Controller.NB_AckE 2 0.00% 0.00%
+system.ruby.TCCdir_Controller.NB_AckM 223 0.00% 0.00%
+system.ruby.TCCdir_Controller.PrbInvData 112 0.00% 0.00%
+system.ruby.TCCdir_Controller.PrbShrData 4 0.00% 0.00%
+system.ruby.TCCdir_Controller.I.RdBlk 3 0.00% 0.00%
+system.ruby.TCCdir_Controller.I.RdBlkM 156 0.00% 0.00%
+system.ruby.TCCdir_Controller.I.RdBlkS 1 0.00% 0.00%
+system.ruby.TCCdir_Controller.I.PrbInvData 9 0.00% 0.00%
+system.ruby.TCCdir_Controller.S.RdBlkM 2 0.00% 0.00%
+system.ruby.TCCdir_Controller.S.RdBlkS 1 0.00% 0.00%
+system.ruby.TCCdir_Controller.E.RdBlkM 1 0.00% 0.00%
+system.ruby.TCCdir_Controller.O.RdBlk 1 0.00% 0.00%
+system.ruby.TCCdir_Controller.O.RdBlkM 70 0.00% 0.00%
+system.ruby.TCCdir_Controller.O.PrbInvData 6 0.00% 0.00%
+system.ruby.TCCdir_Controller.M.RdBlk 61 0.00% 0.00%
+system.ruby.TCCdir_Controller.M.RdBlkM 521 0.00% 0.00%
+system.ruby.TCCdir_Controller.M.RdBlkS 25 0.00% 0.00%
+system.ruby.TCCdir_Controller.M.PrbInvData 59 0.00% 0.00%
+system.ruby.TCCdir_Controller.M.PrbShrData 4 0.00% 0.00%
+system.ruby.TCCdir_Controller.CP_I.RdBlk 9 0.00% 0.00%
+system.ruby.TCCdir_Controller.CP_I.RdBlkM 15 0.00% 0.00%
+system.ruby.TCCdir_Controller.CP_I.RdBlkS 7 0.00% 0.00%
+system.ruby.TCCdir_Controller.CP_I.CPUPrbResp 71 0.00% 0.00%
+system.ruby.TCCdir_Controller.CP_I.ProbeAcksComplete 65 0.00% 0.00%
+system.ruby.TCCdir_Controller.CP_O.RdBlkM 4 0.00% 0.00%
+system.ruby.TCCdir_Controller.CP_O.CPUPrbResp 4 0.00% 0.00%
+system.ruby.TCCdir_Controller.CP_O.ProbeAcksComplete 4 0.00% 0.00%
+system.ruby.TCCdir_Controller.CP_IOM.RdBlkM 5 0.00% 0.00%
+system.ruby.TCCdir_Controller.CP_IOM.CPUPrbResp 2 0.00% 0.00%
+system.ruby.TCCdir_Controller.CP_IOM.ProbeAcksComplete 2 0.00% 0.00%
+system.ruby.TCCdir_Controller.I_M.RdBlkM 897 0.00% 0.00%
+system.ruby.TCCdir_Controller.I_M.RdBlkS 30 0.00% 0.00%
+system.ruby.TCCdir_Controller.I_M.NB_AckM 156 0.00% 0.00%
+system.ruby.TCCdir_Controller.I_M.PrbInvData 1 0.00% 0.00%
+system.ruby.TCCdir_Controller.I_ES.RdBlkM 24 0.00% 0.00%
+system.ruby.TCCdir_Controller.I_ES.RdBlkS 34 0.00% 0.00%
+system.ruby.TCCdir_Controller.I_ES.NB_AckS 1 0.00% 0.00%
+system.ruby.TCCdir_Controller.I_ES.NB_AckE 2 0.00% 0.00%
+system.ruby.TCCdir_Controller.I_S.RdBlkM 11 0.00% 0.00%
+system.ruby.TCCdir_Controller.I_S.NB_AckS 1 0.00% 0.00%
+system.ruby.TCCdir_Controller.BBS_S.RdBlkM 5 0.00% 0.00%
+system.ruby.TCCdir_Controller.BBS_S.CPUPrbResp 1 0.00% 0.00%
+system.ruby.TCCdir_Controller.BBS_S.ProbeAcksComplete 1 0.00% 0.00%
+system.ruby.TCCdir_Controller.BBO_O.CPUPrbResp 1 0.00% 0.00%
+system.ruby.TCCdir_Controller.BBO_O.ProbeAcksComplete 1 0.00% 0.00%
+system.ruby.TCCdir_Controller.BBM_M.RdBlk 11 0.00% 0.00%
+system.ruby.TCCdir_Controller.BBM_M.RdBlkM 104 0.00% 0.00%
+system.ruby.TCCdir_Controller.BBM_M.RdBlkS 12 0.00% 0.00%
+system.ruby.TCCdir_Controller.BBM_M.CPUPrbResp 520 0.00% 0.00%
+system.ruby.TCCdir_Controller.BBM_M.ProbeAcksComplete 520 0.00% 0.00%
+system.ruby.TCCdir_Controller.BBM_M.PrbInvData 14 0.00% 0.00%
+system.ruby.TCCdir_Controller.BBM_O.RdBlkM 13 0.00% 0.00%
+system.ruby.TCCdir_Controller.BBM_O.CPUPrbResp 86 0.00% 0.00%
+system.ruby.TCCdir_Controller.BBM_O.ProbeAcksComplete 86 0.00% 0.00%
+system.ruby.TCCdir_Controller.BB_M.RdBlk 20 0.00% 0.00%
+system.ruby.TCCdir_Controller.BB_M.RdBlkM 181 0.00% 0.00%
+system.ruby.TCCdir_Controller.BB_M.RdBlkS 15 0.00% 0.00%
+system.ruby.TCCdir_Controller.BB_M.CoreUnblock 518 0.00% 0.00%
+system.ruby.TCCdir_Controller.BB_M.PrbInvData 19 0.00% 0.00%
+system.ruby.TCCdir_Controller.BB_O.RdBlkM 35 0.00% 0.00%
+system.ruby.TCCdir_Controller.BB_O.CoreUnblock 84 0.00% 0.00%
+system.ruby.TCCdir_Controller.BB_O.PrbInvData 2 0.00% 0.00%
+system.ruby.TCCdir_Controller.BB_OO.LastCoreUnblock 1 0.00% 0.00%
+system.ruby.TCCdir_Controller.BB_S.RdBlkM 9 0.00% 0.00%
+system.ruby.TCCdir_Controller.BB_S.LastCoreUnblock 1 0.00% 0.00%
+system.ruby.TCCdir_Controller.BBS_M.RdBlk 9 0.00% 0.00%
+system.ruby.TCCdir_Controller.BBS_M.RdBlkM 18 0.00% 0.00%
+system.ruby.TCCdir_Controller.BBS_M.CPUPrbResp 4 0.00% 0.00%
+system.ruby.TCCdir_Controller.BBS_M.ProbeAcksComplete 3 0.00% 0.00%
+system.ruby.TCCdir_Controller.BBO_M.RdBlkM 20 0.00% 0.00%
+system.ruby.TCCdir_Controller.BBO_M.CPUPrbResp 122 0.00% 0.00%
+system.ruby.TCCdir_Controller.BBO_M.ProbeAcksComplete 69 0.00% 0.00%
+system.ruby.TCCdir_Controller.S_M.RdBlk 28 0.00% 0.00%
+system.ruby.TCCdir_Controller.S_M.RdBlkM 69 0.00% 0.00%
+system.ruby.TCCdir_Controller.S_M.NB_AckM 3 0.00% 0.00%
+system.ruby.TCCdir_Controller.O_M.RdBlk 20 0.00% 0.00%
+system.ruby.TCCdir_Controller.O_M.RdBlkM 249 0.00% 0.00%
+system.ruby.TCCdir_Controller.O_M.RdBlkS 51 0.00% 0.00%
+system.ruby.TCCdir_Controller.O_M.NB_AckM 64 0.00% 0.00%
+system.ruby.TCCdir_Controller.O_M.PrbInvData 2 0.00% 0.00%
+system.ruby.TCCdir_Controller.BBB_S.RdBlk 3 0.00% 0.00%
+system.ruby.TCCdir_Controller.BBB_S.RdBlkM 23 0.00% 0.00%
+system.ruby.TCCdir_Controller.BBB_S.RdBlkS 5 0.00% 0.00%
+system.ruby.TCCdir_Controller.BBB_S.CoreUnblock 2 0.00% 0.00%
+system.ruby.TCCdir_Controller.BBB_M.RdBlk 9 0.00% 0.00%
+system.ruby.TCCdir_Controller.BBB_M.RdBlkM 206 0.00% 0.00%
+system.ruby.TCCdir_Controller.BBB_M.RdBlkS 14 0.00% 0.00%
+system.ruby.TCCdir_Controller.BBB_M.CoreUnblock 223 0.00% 0.00%
+system.ruby.TCCdir_Controller.BBB_E.CoreUnblock 2 0.00% 0.00%
+system.ruby.TCP_Controller.Load | 5 7.04% 7.04% | 6 8.45% 15.49% | 10 14.08% 29.58% | 13 18.31% 47.89% | 6 8.45% 56.34% | 6 8.45% 64.79% | 13 18.31% 83.10% | 12 16.90% 100.00%
+system.ruby.TCP_Controller.Load::total 71
+system.ruby.TCP_Controller.Store | 109 13.39% 13.39% | 104 12.78% 26.17% | 98 12.04% 38.21% | 93 11.43% 49.63% | 109 13.39% 63.02% | 102 12.53% 75.55% | 113 13.88% 89.43% | 86 10.57% 100.00%
+system.ruby.TCP_Controller.Store::total 814
+system.ruby.TCP_Controller.TCC_AckS | 5 7.94% 7.94% | 5 7.94% 15.87% | 9 14.29% 30.16% | 13 20.63% 50.79% | 4 6.35% 57.14% | 6 9.52% 66.67% | 11 17.46% 84.13% | 10 15.87% 100.00%
+system.ruby.TCP_Controller.TCC_AckS::total 63
+system.ruby.TCP_Controller.TCC_AckE | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 1 50.00% 50.00% | 1 50.00% 100.00%
+system.ruby.TCP_Controller.TCC_AckE::total 2
+system.ruby.TCP_Controller.TCC_AckM | 100 13.46% 13.46% | 94 12.65% 26.11% | 90 12.11% 38.22% | 81 10.90% 49.13% | 102 13.73% 62.85% | 92 12.38% 75.24% | 105 14.13% 89.37% | 79 10.63% 100.00%
+system.ruby.TCP_Controller.TCC_AckM::total 743
+system.ruby.TCP_Controller.PrbInvData | 88 12.61% 12.61% | 87 12.46% 25.07% | 88 12.61% 37.68% | 79 11.32% 49.00% | 90 12.89% 61.89% | 86 12.32% 74.21% | 101 14.47% 88.68% | 79 11.32% 100.00%
+system.ruby.TCP_Controller.PrbInvData::total 698
+system.ruby.TCP_Controller.PrbShrData | 14 15.22% 15.22% | 9 9.78% 25.00% | 17 18.48% 43.48% | 7 7.61% 51.09% | 14 15.22% 66.30% | 10 10.87% 77.17% | 12 13.04% 90.22% | 9 9.78% 100.00%
+system.ruby.TCP_Controller.PrbShrData::total 92
+system.ruby.TCP_Controller.I.Load | 5 7.46% 7.46% | 5 7.46% 14.93% | 9 13.43% 28.36% | 13 19.40% 47.76% | 6 8.96% 56.72% | 6 8.96% 65.67% | 12 17.91% 83.58% | 11 16.42% 100.00%
+system.ruby.TCP_Controller.I.Load::total 67
+system.ruby.TCP_Controller.I.Store | 98 13.26% 13.26% | 95 12.86% 26.12% | 89 12.04% 38.16% | 82 11.10% 49.26% | 99 13.40% 62.65% | 93 12.58% 75.24% | 105 14.21% 89.45% | 78 10.55% 100.00%
+system.ruby.TCP_Controller.I.Store::total 739
+system.ruby.TCP_Controller.I.PrbInvData | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 1 50.00% 50.00% | 1 50.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.TCP_Controller.I.PrbInvData::total 2
+system.ruby.TCP_Controller.S.Store | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 1 20.00% 20.00% | 1 20.00% 40.00% | 0 0.00% 40.00% | 2 40.00% 80.00% | 1 20.00% 100.00%
+system.ruby.TCP_Controller.S.Store::total 5
+system.ruby.TCP_Controller.S.PrbInvData | 4 8.33% 8.33% | 4 8.33% 16.67% | 8 16.67% 33.33% | 9 18.75% 52.08% | 3 6.25% 58.33% | 4 8.33% 66.67% | 8 16.67% 83.33% | 8 16.67% 100.00%
+system.ruby.TCP_Controller.S.PrbInvData::total 48
+system.ruby.TCP_Controller.S.PrbShrData | 0 0.00% 0.00% | 1 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.TCP_Controller.S.PrbShrData::total 1
+system.ruby.TCP_Controller.E.PrbInvData | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 1 100.00% 100.00% | 0 0.00% 100.00%
+system.ruby.TCP_Controller.E.PrbInvData::total 1
+system.ruby.TCP_Controller.O.Store | 2 20.00% 20.00% | 0 0.00% 20.00% | 2 20.00% 40.00% | 0 0.00% 40.00% | 3 30.00% 70.00% | 1 10.00% 80.00% | 1 10.00% 90.00% | 1 10.00% 100.00%
+system.ruby.TCP_Controller.O.Store::total 10
+system.ruby.TCP_Controller.O.PrbInvData | 9 13.64% 13.64% | 7 10.61% 24.24% | 12 18.18% 42.42% | 7 10.61% 53.03% | 10 15.15% 68.18% | 5 7.58% 75.76% | 10 15.15% 90.91% | 6 9.09% 100.00%
+system.ruby.TCP_Controller.O.PrbInvData::total 66
+system.ruby.TCP_Controller.O.PrbShrData | 0 0.00% 0.00% | 0 0.00% 0.00% | 1 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.TCP_Controller.O.PrbShrData::total 1
+system.ruby.TCP_Controller.M.Load | 0 0.00% 0.00% | 1 25.00% 25.00% | 1 25.00% 50.00% | 0 0.00% 50.00% | 0 0.00% 50.00% | 0 0.00% 50.00% | 1 25.00% 75.00% | 1 25.00% 100.00%
+system.ruby.TCP_Controller.M.Load::total 4
+system.ruby.TCP_Controller.M.Store | 9 15.00% 15.00% | 9 15.00% 30.00% | 7 11.67% 41.67% | 10 16.67% 58.33% | 6 10.00% 68.33% | 8 13.33% 81.67% | 5 8.33% 90.00% | 6 10.00% 100.00%
+system.ruby.TCP_Controller.M.Store::total 60
+system.ruby.TCP_Controller.M.PrbInvData | 75 12.93% 12.93% | 76 13.10% 26.03% | 67 11.55% 37.59% | 62 10.69% 48.28% | 76 13.10% 61.38% | 77 13.28% 74.66% | 82 14.14% 88.79% | 65 11.21% 100.00%
+system.ruby.TCP_Controller.M.PrbInvData::total 580
+system.ruby.TCP_Controller.M.PrbShrData | 14 15.56% 15.56% | 8 8.89% 24.44% | 16 17.78% 42.22% | 7 7.78% 50.00% | 14 15.56% 65.56% | 10 11.11% 76.67% | 12 13.33% 90.00% | 9 10.00% 100.00%
+system.ruby.TCP_Controller.M.PrbShrData::total 90
+system.ruby.TCP_Controller.I_M.TCC_AckM | 98 13.42% 13.42% | 94 12.88% 26.30% | 89 12.19% 38.49% | 80 10.96% 49.45% | 98 13.42% 62.88% | 91 12.47% 75.34% | 103 14.11% 89.45% | 77 10.55% 100.00%
+system.ruby.TCP_Controller.I_M.TCC_AckM::total 730
+system.ruby.TCP_Controller.I_ES.TCC_AckS | 5 7.94% 7.94% | 5 7.94% 15.87% | 9 14.29% 30.16% | 13 20.63% 50.79% | 4 6.35% 57.14% | 6 9.52% 66.67% | 11 17.46% 84.13% | 10 15.87% 100.00%
+system.ruby.TCP_Controller.I_ES.TCC_AckS::total 63
+system.ruby.TCP_Controller.I_ES.TCC_AckE | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 1 50.00% 50.00% | 1 50.00% 100.00%
+system.ruby.TCP_Controller.I_ES.TCC_AckE::total 2
+system.ruby.TCP_Controller.S_M.TCC_AckM | 0 0.00% 0.00% | 0 0.00% 0.00% | 0 0.00% 0.00% | 1 25.00% 25.00% | 1 25.00% 50.00% | 0 0.00% 50.00% | 1 25.00% 75.00% | 1 25.00% 100.00%
+system.ruby.TCP_Controller.S_M.TCC_AckM::total 4
+system.ruby.TCP_Controller.O_M.TCC_AckM | 2 22.22% 22.22% | 0 0.00% 22.22% | 1 11.11% 33.33% | 0 0.00% 33.33% | 3 33.33% 66.67% | 1 11.11% 77.78% | 1 11.11% 88.89% | 1 11.11% 100.00%
+system.ruby.TCP_Controller.O_M.TCC_AckM::total 9
+system.ruby.TCP_Controller.O_M.PrbInvData | 0 0.00% 0.00% | 0 0.00% 0.00% | 1 100.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00% | 0 0.00% 100.00%
+system.ruby.TCP_Controller.O_M.PrbInvData::total 1
+
+---------- End Simulation Statistics ----------
diff --git a/tests/quick/se/60.gpu-randomtest/test.py b/tests/quick/se/60.gpu-randomtest/test.py
new file mode 100644
index 000000000..d47bac621
--- /dev/null
+++ b/tests/quick/se/60.gpu-randomtest/test.py
@@ -0,0 +1,35 @@
+#
+# Copyright (c) 2010-2015 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Author: Brad Beckmann
+#
+
diff --git a/tests/test-progs/gpu-hello/bin/x86/linux/gpu-hello b/tests/test-progs/gpu-hello/bin/x86/linux/gpu-hello
new file mode 100755
index 000000000..de248ee4a
--- /dev/null
+++ b/tests/test-progs/gpu-hello/bin/x86/linux/gpu-hello
Binary files differ
diff --git a/tests/test-progs/gpu-hello/bin/x86/linux/gpu-hello-kernel.asm b/tests/test-progs/gpu-hello/bin/x86/linux/gpu-hello-kernel.asm
new file mode 100644
index 000000000..a4ad14488
--- /dev/null
+++ b/tests/test-progs/gpu-hello/bin/x86/linux/gpu-hello-kernel.asm
Binary files differ
diff --git a/tests/test-progs/gpu-hello/src/gpu-hello-kernel.cl b/tests/test-progs/gpu-hello/src/gpu-hello-kernel.cl
new file mode 100755
index 000000000..1f61a6fab
--- /dev/null
+++ b/tests/test-progs/gpu-hello/src/gpu-hello-kernel.cl
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Marc Orr
+ */
+
+
+__kernel void read_kernel(size_t code_size,
+ __global char *code_in,
+ __global int *key_arr,
+ __global char *msg_out,
+ __global int *chars_decoded)
+{
+ size_t gid = get_global_id(0);
+ size_t my_idx = gid % code_size;
+ bool decode = 0;
+ __local atomic_int lcount;
+
+ if (get_local_id(0) == 0) {
+ lcount=0;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // read code
+ char mycode = code_in[my_idx];
+
+ // decode
+ int my_key = key_arr[my_idx];
+ if (my_key) {
+ decode = 1;
+ for (int n = 0; n < my_key; n++) {
+ mycode++;
+ }
+ }
+
+ // write out msg
+ msg_out[gid] = mycode;
+
+ if (decode) {
+ atomic_fetch_add((atomic_int *)(&lcount), 1);
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+
+ if(get_local_id(0) == 0) {
+ int _lcount = atomic_load(&lcount);
+ atomic_fetch_add((atomic_int *)chars_decoded, _lcount);
+ }
+}
diff --git a/tests/test-progs/gpu-hello/src/gpu-hello.cpp b/tests/test-progs/gpu-hello/src/gpu-hello.cpp
new file mode 100755
index 000000000..b6fff6e32
--- /dev/null
+++ b/tests/test-progs/gpu-hello/src/gpu-hello.cpp
@@ -0,0 +1,332 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Marc Orr, Brad Beckmann
+ */
+
+#include <CL/cl.h>
+#include <malloc.h>
+
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <string>
+
+#define SUCCESS 0
+#define FAILURE 1
+
+// OpenCL datastructures
+cl_context context;
+cl_device_id *devices;
+cl_command_queue commandQueue;
+cl_program program;
+cl_kernel readKernel;
+
+// Application datastructures
+const int CACHE_LINE_SIZE = 64;
+size_t grid_size = 512;
+size_t work_group_size = 256;
+
+// arguments
+const int code_size = 5;
+const char *code = "hello";
+int *keys;
+char *msg;
+int chars_decoded = 0;
+
+/*
+ Setup data structures for application/algorithm
+*/
+int
+setupDataStructs()
+{
+ msg = (char *)memalign(CACHE_LINE_SIZE, (grid_size + 1) * sizeof(char));
+ if(msg == NULL) {
+ printf("%s:%d: error: %s\n", __FILE__, __LINE__,
+ "could not allocate host buffers\n");
+ exit(-1);
+ }
+ msg[grid_size] = '\0';
+
+ keys = (int *)memalign(CACHE_LINE_SIZE, code_size * sizeof(int));
+ keys[0] = 23;
+ keys[1] = 0;
+ keys[2] = 0;
+ keys[3] = 0;
+ keys[4] = 0;
+
+ return SUCCESS;
+}
+
+/* Setup OpenCL data structures */
+int
+setupOpenCL()
+{
+ cl_int status = 0;
+ size_t deviceListSize;
+
+ // 1. Get platform
+ cl_uint numPlatforms;
+ cl_platform_id platform = NULL;
+ status = clGetPlatformIDs(0, NULL, &numPlatforms);
+ if (status != CL_SUCCESS) {
+ printf("Error: Getting Platforms. (clGetPlatformsIDs)\n");
+ return FAILURE;
+ }
+
+ if (numPlatforms > 0) {
+ cl_platform_id *platforms = new cl_platform_id[numPlatforms];
+ status = clGetPlatformIDs(numPlatforms, platforms, NULL);
+ if (status != CL_SUCCESS) {
+ printf("Error: Getting Platform Ids. (clGetPlatformsIDs)\n");
+ return FAILURE;
+ }
+ for (int i = 0; i < numPlatforms; ++i) {
+ char pbuff[100];
+ status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR,
+ sizeof(pbuff), pbuff, NULL);
+ if (status != CL_SUCCESS) {
+ printf("Error: Getting Platform Info.(clGetPlatformInfo)\n");
+ return FAILURE;
+ }
+ platform = platforms[i];
+ if (!strcmp(pbuff, "Advanced Micro Devices, Inc.")) {
+ break;
+ }
+ }
+ delete platforms;
+ }
+
+ if(NULL == platform) {
+ printf("NULL platform found so Exiting Application.\n");
+ return FAILURE;
+ }
+
+ // 2. create context from platform
+ cl_context_properties cps[3] =
+ {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0};
+ context = clCreateContextFromType(cps, CL_DEVICE_TYPE_GPU, NULL, NULL,
+ &status);
+ if (status != CL_SUCCESS) {
+ printf("Error: Creating Context. (clCreateContextFromType)\n");
+ return FAILURE;
+ }
+
+ // 3. Get device info
+ // 3a. Get # of devices
+ status = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL,
+ &deviceListSize);
+ if (status != CL_SUCCESS) {
+ printf("Error: Getting Context Info (1st clGetContextInfo)\n");
+ return FAILURE;
+ }
+
+ // 3b. Get the device list data
+ devices = (cl_device_id *)malloc(deviceListSize);
+ if (devices == 0) {
+ printf("Error: No devices found.\n");
+ return FAILURE;
+ }
+ status = clGetContextInfo(context, CL_CONTEXT_DEVICES, deviceListSize,
+ devices, NULL);
+ if (status != CL_SUCCESS) {
+ printf("Error: Getting Context Info (2nd clGetContextInfo)\n");
+ return FAILURE;
+ }
+
+ // 4. Create command queue for device
+ commandQueue = clCreateCommandQueue(context, devices[0], 0, &status);
+ if (status != CL_SUCCESS) {
+ printf("Creating Command Queue. (clCreateCommandQueue)\n");
+ return FAILURE;
+ }
+
+ const char *source = "dummy text";
+
+ size_t sourceSize[] = {strlen(source)};
+
+ // 5b. Register the kernel with the runtime
+ program = clCreateProgramWithSource(context, 1, &source, sourceSize,
+ &status);
+ if (status != CL_SUCCESS) {
+ printf("Error: Loading kernel (clCreateProgramWithSource)\n");
+ return FAILURE;
+ }
+
+ status = clBuildProgram(program, 1, devices, NULL, NULL, NULL);
+ if (status != CL_SUCCESS) {
+ printf("Error: Building kernel (clBuildProgram)\n");
+ return FAILURE;
+ }
+
+ readKernel = clCreateKernel(program, "read_kernel", &status);
+ if (status != CL_SUCCESS) {
+ printf("Error: Creating readKernel from program. (clCreateKernel)\n");
+ return FAILURE;
+ }
+
+ return SUCCESS;
+}
+
+
+/* Run kernels */
+int
+runCLKernel(cl_kernel kernel)
+{
+ cl_int status;
+ cl_event event;
+ size_t globalThreads[1] = {grid_size};
+ size_t localThreads[1] = {work_group_size};
+
+ // 1. Set arguments
+ // 1a. code size
+ size_t code_size = strlen(code);
+ status = clSetKernelArg(kernel, 0, sizeof(size_t), &code_size);
+ if (status != CL_SUCCESS) {
+ printf("Error: Setting kernel argument. (code_size)\n");
+ return FAILURE;
+ }
+
+ // 1b. code
+ status = clSetKernelArg(kernel, 1, sizeof(char *), (void *)&code);
+ if (status != CL_SUCCESS) {
+ printf("Error: Setting kernel argument. (code_in)\n");
+ return FAILURE;
+ }
+
+ // 1c. keys
+ printf("keys = %p, &keys = %p, keys[0] = %d\n", keys, &keys, keys[0]);
+ status = clSetKernelArg(kernel, 2, sizeof(int *), (void *)&keys);
+ if (status != CL_SUCCESS) {
+ printf("Error: Setting kernel argument. (key_arr)\n");
+ return FAILURE;
+ }
+
+ // 1d. msg
+ status = clSetKernelArg(kernel, 3, sizeof(char *), (void *)&msg);
+ if (status != CL_SUCCESS) {
+ printf("Error: Setting kernel argument. (memOut)\n");
+ return FAILURE;
+ }
+
+ // 1e. chars_decoded
+ int *chars_decoded_ptr = &chars_decoded;
+ status = clSetKernelArg(kernel, 4, sizeof(int *),
+ (void *)&chars_decoded_ptr);
+ if (status != CL_SUCCESS) {
+ printf("Error: Setting kernel argument. (memOut)\n");
+ return FAILURE;
+ }
+
+ // 2. Launch kernel
+ status = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL,
+ globalThreads, localThreads, 0, NULL,
+ &event);
+ if (status != CL_SUCCESS) {
+ printf("Error: Enqueue failed. (clEnqueueNDRangeKernel)\n");
+ return FAILURE;
+ }
+
+ // 3. Wait for the kernel
+ status = clWaitForEvents(1, &event);
+ if (status != CL_SUCCESS) {
+ printf("Error: Waiting for kernel run to finish. (clWaitForEvents)\n");
+ return FAILURE;
+ }
+
+ // 4. Cleanup
+ status = clReleaseEvent(event);
+ if (status != CL_SUCCESS) {
+ printf("Error: Release event object. (clReleaseEvent)\n");
+ return FAILURE;
+ }
+
+ return SUCCESS;
+}
+
+
+/* Release OpenCL resources (Context, Memory etc.) */
+int
+cleanupCL()
+{
+ cl_int status;
+ status = clReleaseKernel(readKernel);
+ if (status != CL_SUCCESS) {
+ printf("Error: In clReleaseKernel \n");
+ return FAILURE;
+ }
+ status = clReleaseProgram(program);
+ if (status != CL_SUCCESS) {
+ printf("Error: In clReleaseProgram\n");
+ return FAILURE;
+ }
+ status = clReleaseCommandQueue(commandQueue);
+ if (status != CL_SUCCESS) {
+ printf("Error: In clReleaseCommandQueue\n");
+ return FAILURE;
+ }
+ status = clReleaseContext(context);
+ if (status != CL_SUCCESS) {
+ printf("Error: In clReleaseContext\n");
+ return FAILURE;
+ }
+
+ return SUCCESS;
+}
+
+int
+main(int argc, char * argv[])
+{
+ // Initialize Host application
+ if (setupDataStructs() != SUCCESS) {
+ return FAILURE;
+ }
+
+ // Initialize OpenCL resources
+ if (setupOpenCL() != SUCCESS) {
+ return FAILURE;
+ }
+
+ // Run the CL program
+ if (runCLKernel(readKernel) != SUCCESS) {
+ return FAILURE;
+ }
+ printf("the gpu says:\n");
+ printf("%s\n", msg);
+
+ // Releases OpenCL resources
+ if (cleanupCL()!= SUCCESS) {
+ return FAILURE;
+ }
+
+ return SUCCESS;
+}
diff --git a/util/regress b/util/regress
index 3cb078349..ceaaf739d 100755
--- a/util/regress
+++ b/util/regress
@@ -49,7 +49,8 @@ add_option('--builds',
'POWER,' \
'SPARC,' \
'X86,X86_MESI_Two_Level,' \
- 'ARM',
+ 'ARM,' \
+ 'HSAIL_X86',
help="comma-separated build targets to test (default: '%default')")
add_option('--modes',
default='se,fs',