Diffstat (limited to 'src/gpu-compute')
72 files changed, 17312 insertions, 0 deletions
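For orientation, the SimObject parameters introduced below in GPU.py can be wired together from a gem5 configuration script. The following is a minimal, illustrative sketch and is not part of the patch: the class and parameter names come from the diff, while the counts, sizes, and variable names (num_simds, wfs_per_simd, cu, shader) are assumptions.

    # Illustrative sketch only: build the per-SIMD wavefront slots and vector
    # register files expected by ComputeUnit, then group the CU under a Shader.
    # Values are examples, not defaults mandated by this patch.
    num_simds = 4        # matches ComputeUnit.num_SIMDs below
    wfs_per_simd = 8     # matches ComputeUnit.n_wf / Shader.n_wf below

    wavefronts = []
    vrfs = []
    for simd in range(num_simds):
        for slot in range(wfs_per_simd):
            wavefronts.append(Wavefront(simdId=simd, wf_slot_id=slot))
        vrfs.append(VectorRegisterFile(simd_id=simd, num_regs_per_simd=2048))

    cu = ComputeUnit(cu_id=0, num_SIMDs=num_simds, n_wf=wfs_per_simd,
                     wfSize=64, wavefronts=wavefronts,
                     vector_register_file=vrfs)

    shader = Shader(CUs=[cu], n_wf=wfs_per_simd, timing=True)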
diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py new file mode 100644 index 000000000..bd95f6335 --- /dev/null +++ b/src/gpu-compute/GPU.py @@ -0,0 +1,310 @@ +# +# Copyright (c) 2015 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
+# +# Author: Steve Reinhardt +# + +from ClockedObject import ClockedObject +from Device import DmaDevice +from m5.defines import buildEnv +from m5.params import * +from m5.proxy import * +from m5.SimObject import SimObject +from MemObject import MemObject +from Process import EmulatedDriver +from Bridge import Bridge +from LdsState import LdsState + +class PrefetchType(Enum): vals = [ + 'PF_CU', + 'PF_PHASE', + 'PF_WF', + 'PF_STRIDE', + 'PF_END', + ] + +class VectorRegisterFile(SimObject): + type = 'VectorRegisterFile' + cxx_class = 'VectorRegisterFile' + cxx_header = 'gpu-compute/vector_register_file.hh' + + simd_id = Param.Int(0, 'SIMD ID associated with this VRF') + num_regs_per_simd = Param.Int(2048, 'number of vector registers per SIMD') + min_alloc = Param.Int(4, 'min number of VGPRs allocated per WF') + +class Wavefront(SimObject): + type = 'Wavefront' + cxx_class = 'Wavefront' + cxx_header = 'gpu-compute/wavefront.hh' + + simdId = Param.Int('SIMD id (0-ComputeUnit.num_SIMDs)') + wf_slot_id = Param.Int('wavefront id (0-ComputeUnit.max_wfs)') + +class ComputeUnit(MemObject): + type = 'ComputeUnit' + cxx_class = 'ComputeUnit' + cxx_header = 'gpu-compute/compute_unit.hh' + + wavefronts = VectorParam.Wavefront('Number of wavefronts') + wfSize = Param.Int(64, 'Wavefront size (in work items)') + num_SIMDs = Param.Int(4, 'number of SIMD units per CU') + + spbypass_pipe_length = Param.Int(4, 'vector ALU Single Precision bypass '\ + 'latency') + + dpbypass_pipe_length = Param.Int(8, 'vector ALU Double Precision bypass '\ + 'latency') + + issue_period = Param.Int(4, 'number of cycles per issue period') + num_global_mem_pipes = Param.Int(1,'number of global memory pipes per CU') + num_shared_mem_pipes = Param.Int(1,'number of shared memory pipes per CU') + n_wf = Param.Int(1, 'Number of wavefront slots per SIMD') + mem_req_latency = Param.Int(9, "Latency for request from the cu to ruby. "\ + "Represents the pipeline to reach the TCP and "\ + "specified in GPU clock cycles") + mem_resp_latency = Param.Int(9, "Latency for responses from ruby to the "\ + "cu. Represents the pipeline between the TCP "\ + "and cu as well as TCP data array access. 
"\ + "Specified in GPU clock cycles") + system = Param.System(Parent.any, "system object") + cu_id = Param.Int('CU id') + vrf_to_coalescer_bus_width = Param.Int(32, "VRF->Coalescer data bus width "\ + "in bytes") + coalescer_to_vrf_bus_width = Param.Int(32, "Coalescer->VRF data bus width "\ + "in bytes") + + memory_port = VectorMasterPort("Port to the memory system") + translation_port = VectorMasterPort('Port to the TLB hierarchy') + sqc_port = MasterPort("Port to the SQC (I-cache") + sqc_tlb_port = MasterPort("Port to the TLB for the SQC (I-cache)") + perLaneTLB = Param.Bool(False, "enable per-lane TLB") + prefetch_depth = Param.Int(0, "Number of prefetches triggered at a time"\ + "(0 turns off prefetching)") + prefetch_stride = Param.Int(1, "Fixed Prefetch Stride (1 means next-page)") + prefetch_prev_type = Param.PrefetchType('PF_PHASE', "Prefetch the stride "\ + "from last mem req in lane of "\ + "CU|Phase|Wavefront") + execPolicy = Param.String("OLDEST-FIRST", "WF execution selection policy"); + xactCasMode = Param.Bool(False, "Behavior of xact_cas_load magic instr."); + debugSegFault = Param.Bool(False, "enable debugging GPU seg faults") + functionalTLB = Param.Bool(False, "Assume TLB causes no delay") + + localMemBarrier = Param.Bool(False, "Assume Barriers do not wait on "\ + "kernel end") + + countPages = Param.Bool(False, "Generate per-CU file of all pages touched "\ + "and how many times") + global_mem_queue_size = Param.Int(256, "Number of entries in the global " + "memory pipeline's queues") + local_mem_queue_size = Param.Int(256, "Number of entries in the local " + "memory pipeline's queues") + ldsBus = Bridge() # the bridge between the CU and its LDS + ldsPort = MasterPort("The port that goes to the LDS") + localDataStore = Param.LdsState("the LDS for this CU") + + vector_register_file = VectorParam.VectorRegisterFile("Vector register "\ + "file") + +class Shader(ClockedObject): + type = 'Shader' + cxx_class = 'Shader' + cxx_header = 'gpu-compute/shader.hh' + + CUs = VectorParam.ComputeUnit('Number of compute units') + n_wf = Param.Int(1, 'Number of wavefront slots per SIMD') + impl_kern_boundary_sync = Param.Bool(True, """Insert acq/rel packets into + ruby at kernel boundaries""") + separate_acquire_release = Param.Bool(False, + """Do ld_acquire/st_release generate separate requests for the + acquire and release?""") + globalmem = Param.MemorySize('64kB', 'Memory size') + timing = Param.Bool(False, 'timing memory accesses') + + cpu_pointer = Param.BaseCPU(NULL, "pointer to base CPU") + translation = Param.Bool(False, "address translation"); + +class ClDriver(EmulatedDriver): + type = 'ClDriver' + cxx_header = 'gpu-compute/cl_driver.hh' + codefile = VectorParam.String('code file name(s)') + +class GpuDispatcher(DmaDevice): + type = 'GpuDispatcher' + cxx_header = 'gpu-compute/dispatcher.hh' + # put at 8GB line for now + pio_addr = Param.Addr(0x200000000, "Device Address") + pio_latency = Param.Latency('1ns', "Programmed IO latency") + shader_pointer = Param.Shader('pointer to shader') + translation_port = MasterPort('Port to the dispatcher TLB') + cpu = Param.BaseCPU("CPU to wake up on kernel completion") + + cl_driver = Param.ClDriver('pointer to driver') + +class OpType(Enum): vals = [ + 'OT_NULL', + 'OT_ALU', + 'OT_SPECIAL', + 'OT_GLOBAL_READ', + 'OT_GLOBAL_WRITE', + 'OT_GLOBAL_ATOMIC', + 'OT_GLOBAL_HIST', + 'OT_GLOBAL_LDAS', + 'OT_SHARED_READ', + 'OT_SHARED_WRITE', + 'OT_SHARED_ATOMIC', + 'OT_SHARED_HIST', + 'OT_SHARED_LDAS', + 'OT_PRIVATE_READ', + 'OT_PRIVATE_WRITE', + 
'OT_PRIVATE_ATOMIC', + 'OT_PRIVATE_HIST', + 'OT_PRIVATE_LDAS', + 'OT_SPILL_READ', + 'OT_SPILL_WRITE', + 'OT_SPILL_ATOMIC', + 'OT_SPILL_HIST', + 'OT_SPILL_LDAS', + 'OT_READONLY_READ', + 'OT_READONLY_WRITE', + 'OT_READONLY_ATOMIC', + 'OT_READONLY_HIST', + 'OT_READONLY_LDAS', + 'OT_FLAT_READ', + 'OT_FLAT_WRITE', + 'OT_FLAT_ATOMIC', + 'OT_FLAT_HIST', + 'OT_FLAT_LDAS', + 'OT_KERN_READ', + 'OT_BRANCH', + + # note: Only the OT_BOTH_MEMFENCE seems to be supported in the 1.0F version + # of the compiler. + 'OT_SHARED_MEMFENCE', + 'OT_GLOBAL_MEMFENCE', + 'OT_BOTH_MEMFENCE', + + 'OT_BARRIER', + 'OT_PRINT', + 'OT_RET', + 'OT_NOP', + 'OT_ARG' + ] + +class MemType(Enum): vals = [ + 'M_U8', + 'M_U16', + 'M_U32', + 'M_U64', + 'M_S8', + 'M_S16', + 'M_S32', + 'M_S64', + 'M_F16', + 'M_F32', + 'M_F64', + ] + +class MemOpType(Enum): vals = [ + 'MO_LD', + 'MO_ST', + 'MO_LDAS', + 'MO_LDA', + 'MO_AAND', + 'MO_AOR', + 'MO_AXOR', + 'MO_ACAS', + 'MO_AEXCH', + 'MO_AADD', + 'MO_ASUB', + 'MO_AINC', + 'MO_ADEC', + 'MO_AMAX', + 'MO_AMIN', + 'MO_ANRAND', + 'MO_ANROR', + 'MO_ANRXOR', + 'MO_ANRCAS', + 'MO_ANREXCH', + 'MO_ANRADD', + 'MO_ANRSUB', + 'MO_ANRINC', + 'MO_ANRDEC', + 'MO_ANRMAX', + 'MO_ANRMIN', + 'MO_HAND', + 'MO_HOR', + 'MO_HXOR', + 'MO_HCAS', + 'MO_HEXCH', + 'MO_HADD', + 'MO_HSUB', + 'MO_HINC', + 'MO_HDEC', + 'MO_HMAX', + 'MO_HMIN', + 'MO_UNDEF' + ] + +class StorageClassType(Enum): vals = [ + 'SC_SPILL', + 'SC_GLOBAL', + 'SC_SHARED', + 'SC_PRIVATE', + 'SC_READONLY', + 'SC_KERNARG', + 'SC_NONE', + ] + +class RegisterType(Enum): vals = [ + 'RT_VECTOR', + 'RT_SCALAR', + 'RT_CONDITION', + 'RT_HARDWARE', + 'RT_NONE', + ] + +class GenericMemoryOrder(Enum): vals = [ + 'MEMORY_ORDER_NONE', + 'MEMORY_ORDER_RELAXED', + 'MEMORY_ORDER_SC_ACQUIRE', + 'MEMORY_ORDER_SC_RELEASE', + 'MEMORY_ORDER_SC_ACQUIRE_RELEASE', + ] + +class GenericMemoryScope(Enum): vals = [ + 'MEMORY_SCOPE_NONE', + 'MEMORY_SCOPE_WORKITEM', + 'MEMORY_SCOPE_WAVEFRONT', + 'MEMORY_SCOPE_WORKGROUP', + 'MEMORY_SCOPE_DEVICE', + 'MEMORY_SCOPE_SYSTEM', + ] diff --git a/src/gpu-compute/LdsState.py b/src/gpu-compute/LdsState.py new file mode 100644 index 000000000..6ea9f6427 --- /dev/null +++ b/src/gpu-compute/LdsState.py @@ -0,0 +1,51 @@ +# +# Copyright (c) 2015 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +# Author: Joe Gross +# + +from m5.defines import buildEnv +from m5.params import * +from m5.proxy import * + +from MemObject import MemObject + +class LdsState(MemObject): + type = 'LdsState' + cxx_class = 'LdsState' + cxx_header = 'gpu-compute/lds_state.hh' + size = Param.Int(65536, 'the size of the LDS') + range = Param.AddrRange('64kB', "address space of the LDS") + bankConflictPenalty = Param.Int(1, 'penalty per LDS bank conflict when '\ + 'accessing data') + banks = Param.Int(32, 'Number of LDS banks') + cuPort = SlavePort("port that goes to the compute unit") diff --git a/src/gpu-compute/SConscript b/src/gpu-compute/SConscript new file mode 100644 index 000000000..2de96df24 --- /dev/null +++ b/src/gpu-compute/SConscript @@ -0,0 +1,99 @@ +# -*- mode:python -*- + +# +# Copyright (c) 2015 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
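The LdsState object defined above in LdsState.py is attached to a compute unit through the ComputeUnit.localDataStore parameter from GPU.py. A hedged configuration sketch follows; the sizes are examples rather than values mandated by the patch, and the port hookup through the CU's ldsBus bridge is an assumed wiring, not something this file spells out.

    # Illustrative sketch: one LDS per CU, reached through the CU's ldsBus
    # bridge. Only the parameter names come from LdsState.py and GPU.py;
    # the values and the exact port wiring are assumptions.
    lds = LdsState(size=65536, banks=32, bankConflictPenalty=1)
    cu.localDataStore = lds
    cu.ldsPort = cu.ldsBus.slave      # CU side of the bridge (assumed wiring)
    cu.ldsBus.master = lds.cuPort     # LDS side of the bridge (assumed wiring)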
+# +# Author: Anthony Gutierrez +# + +Import('*') + +if not env['BUILD_GPU']: + Return() + +SimObject('GPU.py') +SimObject('LdsState.py') +SimObject('X86GPUTLB.py') + +if env['TARGET_GPU_ISA'] == 'hsail': + Source('brig_object.cc') + Source('hsail_code.cc') + +Source('cl_driver.cc') +Source('compute_unit.cc') +Source('condition_register_state.cc') +Source('dispatcher.cc') +Source('exec_stage.cc') +Source('fetch_stage.cc') +Source('fetch_unit.cc') +Source('global_memory_pipeline.cc') +Source('gpu_dyn_inst.cc') +Source('gpu_exec_context.cc') +Source('gpu_static_inst.cc') +Source('gpu_tlb.cc') +Source('hsa_object.cc') +Source('kernel_cfg.cc') +Source('lds_state.cc') +Source('local_memory_pipeline.cc') +Source('of_scheduling_policy.cc') +Source('pool_manager.cc') +Source('rr_scheduling_policy.cc') +Source('schedule_stage.cc') +Source('scheduler.cc') +Source('scoreboard_check_stage.cc') +Source('shader.cc') +Source('simple_pool_manager.cc') +Source('tlb_coalescer.cc') +Source('vector_register_file.cc') +Source('vector_register_state.cc') +Source('wavefront.cc') + +DebugFlag('BRIG') +DebugFlag('GPUCoalescer') +DebugFlag('GPUDisp') +DebugFlag('GPUExec') +DebugFlag('GPUFetch') +DebugFlag('GPUHsailCFInfo') +DebugFlag('GPUMem') +DebugFlag('GPUPort') +DebugFlag('GPUPrefetch') +DebugFlag('GPUReg') +DebugFlag('GPUSync') +DebugFlag('GPUTLB') +DebugFlag('HSALoader') +DebugFlag('HSAIL') +DebugFlag('HSAILObject') +DebugFlag('Predictor') +DebugFlag('WavefrontStack') + +CompoundFlag('GPUALL', ['GPUCoalescer', 'GPUDisp', 'GPUExec', 'GPUFetch', + 'GPUMem', 'GPUPort', 'GPUSync', 'GPUTLB', 'HSAIL']) diff --git a/src/gpu-compute/X86GPUTLB.py b/src/gpu-compute/X86GPUTLB.py new file mode 100644 index 000000000..51f8e514e --- /dev/null +++ b/src/gpu-compute/X86GPUTLB.py @@ -0,0 +1,77 @@ +# +# Copyright (c) 2011-2015 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
+# +# Author: Lisa Hsu +# + +from m5.defines import buildEnv +from m5.params import * +from m5.proxy import * + +from m5.objects.MemObject import MemObject + +if buildEnv['FULL_SYSTEM']: + class X86PagetableWalker(MemObject): + type = 'X86PagetableWalker' + cxx_class = 'X86ISA::Walker' + port = SlavePort("Port for the hardware table walker") + system = Param.System(Parent.any, "system object") + +class X86GPUTLB(MemObject): + type = 'X86GPUTLB' + cxx_class = 'X86ISA::GpuTLB' + cxx_header = 'gpu-compute/gpu_tlb.hh' + size = Param.Int(64, "TLB size (number of entries)") + assoc = Param.Int(64, "TLB associativity") + + if buildEnv['FULL_SYSTEM']: + walker = Param.X86PagetableWalker(X86PagetableWalker(), + "page table walker") + + hitLatency = Param.Int(2, "Latency of a TLB hit") + missLatency1 = Param.Int(5, "Latency #1 of a TLB miss") + missLatency2 = Param.Int(100, "Latency #2 of a TLB miss") + maxOutstandingReqs = Param.Int(64, "# of maximum outstanding requests") + slave = VectorSlavePort("Port on side closer to CPU/CU") + master = VectorMasterPort("Port on side closer to memory") + allocationPolicy = Param.Bool(True, "Allocate on an access") + accessDistance = Param.Bool(False, "print accessDistance stats") + +class TLBCoalescer(MemObject): + type = 'TLBCoalescer' + cxx_class = 'TLBCoalescer' + cxx_header = 'gpu-compute/tlb_coalescer.hh' + probesPerCycle = Param.Int(2, "Number of TLB probes per cycle") + coalescingWindow = Param.Int(1, "Permit coalescing across that many ticks") + slave = VectorSlavePort("Port on side closer to CPU/CU") + master = VectorMasterPort("Port on side closer to memory") + disableCoalescing = Param.Bool(False, "Disable Coalescing") diff --git a/src/gpu-compute/brig_object.cc b/src/gpu-compute/brig_object.cc new file mode 100644 index 000000000..7cc9b7cc4 --- /dev/null +++ b/src/gpu-compute/brig_object.cc @@ -0,0 +1,474 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
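The X86GPUTLB and TLBCoalescer objects above expose vector slave/master ports so that coalescers and TLB levels can be chained between the compute unit's translation ports and memory; the topology itself is not defined in this file. One possible two-level arrangement, with illustrative sizes, latencies, and wiring that are assumptions rather than part of the patch:

    # Illustrative sketch: a coalescer in front of each TLB level. Parameter
    # names come from X86GPUTLB.py above; the values and port wiring are
    # assumptions.
    l1_coalescer = TLBCoalescer(probesPerCycle=2, coalescingWindow=1)
    l1_tlb = X86GPUTLB(size=64, assoc=64, hitLatency=2)
    l2_coalescer = TLBCoalescer(probesPerCycle=2)
    l2_tlb = X86GPUTLB(size=4096, assoc=64, hitLatency=8)

    cu.translation_port = l1_coalescer.slave   # CU lane(s) -> L1 coalescer
    l1_coalescer.master = l1_tlb.slave
    l1_tlb.master = l2_coalescer.slave
    l2_coalescer.master = l2_tlb.slave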
+ * + * Author: Steve Reinhardt, Anthony Gutierrez + */ + +#include "gpu-compute/brig_object.hh" + +#include <fcntl.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <unistd.h> + +#include <cassert> +#include <cstddef> +#include <cstdlib> + +#include "arch/hsail/Brig.h" +#include "base/misc.hh" +#include "base/trace.hh" +#include "debug/BRIG.hh" +#include "debug/HSAILObject.hh" +#include "debug/HSALoader.hh" + +using namespace Brig; + +std::vector<std::function<HsaObject*(const std::string&, int, uint8_t*)>> + HsaObject::tryFileFuncs = { BrigObject::tryFile }; + +extern int getBrigDataTypeBytes(BrigType16_t t); + +const char *BrigObject::sectionNames[] = +{ + "hsa_data", + "hsa_code", + "hsa_operand", + ".shstrtab" +}; + +const char *segmentNames[] = +{ + "none", + "flat", + "global", + "readonly", + "kernarg", + "group", + "private", + "spill", + "args" +}; + +const uint8_t* +BrigObject::getSectionOffset(enum SectionIndex sec, int offs) const +{ + // allow offs == size for dummy end pointers + assert(offs <= sectionInfo[sec].size); + + return sectionInfo[sec].ptr + offs; +} + +const char* +BrigObject::getString(int offs) const +{ + return (const char*)(getSectionOffset(DataSectionIndex, offs) + 4); +} + +const BrigBase* +BrigObject::getCodeSectionEntry(int offs) const +{ + return (const BrigBase*)getSectionOffset(CodeSectionIndex, offs); +} + +const BrigData* +BrigObject::getBrigBaseData(int offs) const +{ + return (Brig::BrigData*)(getSectionOffset(DataSectionIndex, offs)); +} + +const uint8_t* +BrigObject::getData(int offs) const +{ + return getSectionOffset(DataSectionIndex, offs); +} + +const BrigOperand* +BrigObject::getOperand(int offs) const +{ + return (const BrigOperand*)getSectionOffset(OperandsSectionIndex, offs); +} + +unsigned +BrigObject::getOperandPtr(int offs, int index) const +{ + unsigned *op_offs = (unsigned*)(getData(offs + 4 * (index + 1))); + + return *op_offs; +} + +const BrigInstBase* +BrigObject::getInst(int offs) const +{ + return (const BrigInstBase*)getSectionOffset(CodeSectionIndex, offs); +} + +HsaCode* +BrigObject::getKernel(const std::string &name) const +{ + return nullptr; +} + +HsaCode* +BrigObject::getFunction(const std::string &name) const +{ + for (int i = 0; i < functions.size(); ++i) { + if (functions[i]->name() == name) { + return functions[i]; + } + } + + return nullptr; +} + +void +BrigObject::processDirectives(const BrigBase *dirPtr, const BrigBase *endPtr, + StorageMap *storageMap) +{ + while (dirPtr < endPtr) { + if (!dirPtr->byteCount) { + fatal("Bad directive size 0\n"); + } + + // calculate next pointer now so we can override it if needed + const BrigBase *nextDirPtr = brigNext(dirPtr); + + DPRINTF(HSAILObject, "Code section entry kind: #%x, byte count: %d\n", + dirPtr->kind, dirPtr->byteCount); + + switch (dirPtr->kind) { + case BRIG_KIND_DIRECTIVE_FUNCTION: + { + const BrigDirectiveExecutable *p M5_VAR_USED = + reinterpret_cast<const BrigDirectiveExecutable*>(dirPtr); + + DPRINTF(HSAILObject,"DIRECTIVE_FUNCTION: %s offset: " + "%d next: %d\n", getString(p->name), + p->firstCodeBlockEntry, p->nextModuleEntry); + + if (p->firstCodeBlockEntry != p->nextModuleEntry) { + panic("Function calls are not fully supported yet!!: %s\n", + getString(p->name)); + + const char *name = getString(p->name); + + HsailCode *code_obj = nullptr; + + for (int i = 0; i < functions.size(); ++i) { + if (functions[i]->name() == name) { + code_obj = functions[i]; + break; + } + } + + if (!code_obj) { + // create new local storage map for kernel-local 
symbols + code_obj = new HsailCode(name, p, this, + new StorageMap(storageMap)); + functions.push_back(code_obj); + } else { + panic("Multiple definition of Function!!: %s\n", + getString(p->name)); + } + + } + nextDirPtr = getCodeSectionEntry(p->nextModuleEntry); + } + break; + + case BRIG_KIND_DIRECTIVE_KERNEL: + { + const BrigDirectiveExecutable *p = + reinterpret_cast<const BrigDirectiveExecutable*>(dirPtr); + + DPRINTF(HSAILObject,"DIRECTIVE_KERNEL: %s offset: %d count: " + "next: %d\n", getString(p->name), + p->firstCodeBlockEntry, p->nextModuleEntry); + + const char *name = getString(p->name); + + if (name[0] == '&') + name++; + + std::string str = name; + char *temp; + int len = str.length(); + + if (str[len - 1] >= 'a' && str[len - 1] <= 'z') { + temp = new char[str.size() + 1]; + std::copy(str.begin(), str.end() , temp); + temp[str.size()] = '\0'; + } else { + temp = new char[str.size()]; + std::copy(str.begin(), str.end() - 1 , temp); + temp[str.size() - 1 ] = '\0'; + } + + std::string kernel_name = temp; + delete[] temp; + + HsailCode *code_obj = nullptr; + + for (const auto &kernel : kernels) { + if (kernel->name() == kernel_name) { + code_obj = kernel; + break; + } + } + + if (!code_obj) { + // create new local storage map for kernel-local symbols + code_obj = new HsailCode(kernel_name, p, this, + new StorageMap(storageMap)); + + kernels.push_back(code_obj); + } + + nextDirPtr = getCodeSectionEntry(p->nextModuleEntry); + } + break; + + case BRIG_KIND_DIRECTIVE_VARIABLE: + { + const BrigDirectiveVariable *p = + reinterpret_cast<const BrigDirectiveVariable*>(dirPtr); + + uint64_t readonlySize_old = + storageMap->getSize(BRIG_SEGMENT_READONLY); + + StorageElement* se = storageMap->addSymbol(p, this); + + DPRINTF(HSAILObject, "DIRECTIVE_VARIABLE, symbol %s\n", + getString(p->name)); + + if (p->segment == BRIG_SEGMENT_READONLY) { + // readonly memory has initialization data + uint8_t* readonlyData_old = readonlyData; + + readonlyData = + new uint8_t[storageMap->getSize(BRIG_SEGMENT_READONLY)]; + + if (p->init) { + if ((p->type == BRIG_TYPE_ROIMG) || + (p->type == BRIG_TYPE_WOIMG) || + (p->type == BRIG_TYPE_SAMP) || + (p->type == BRIG_TYPE_SIG32) || + (p->type == BRIG_TYPE_SIG64)) { + panic("Read only data type not supported: %s\n", + getString(p->name)); + } + + const BrigOperand *brigOp = getOperand(p->init); + assert(brigOp->kind == + BRIG_KIND_OPERAND_CONSTANT_BYTES); + + const Brig::BrigData *operand_data M5_VAR_USED = + getBrigBaseData(((BrigOperandConstantBytes*) + brigOp)->bytes); + + assert((operand_data->byteCount / 4) > 0); + + uint8_t *symbol_data = + (uint8_t*)getData(((BrigOperandConstantBytes*) + brigOp)->bytes + 4); + + // copy the old data and add the new data + if (readonlySize_old > 0) { + memcpy(readonlyData, readonlyData_old, + readonlySize_old); + } + + memcpy(readonlyData + se->offset, symbol_data, + se->size); + + delete[] readonlyData_old; + } + } + } + break; + + case BRIG_KIND_DIRECTIVE_LABEL: + { + const BrigDirectiveLabel M5_VAR_USED *p = + reinterpret_cast<const BrigDirectiveLabel*>(dirPtr); + + panic("Label directives cannot be at the module level: %s\n", + getString(p->name)); + + } + break; + + case BRIG_KIND_DIRECTIVE_COMMENT: + { + const BrigDirectiveComment M5_VAR_USED *p = + reinterpret_cast<const BrigDirectiveComment*>(dirPtr); + + DPRINTF(HSAILObject, "DIRECTIVE_COMMENT: %s\n", + getString(p->name)); + } + break; + + case BRIG_KIND_DIRECTIVE_LOC: + { + DPRINTF(HSAILObject, "BRIG_DIRECTIVE_LOC\n"); + } + break; + + case 
BRIG_KIND_DIRECTIVE_MODULE: + { + const BrigDirectiveModule M5_VAR_USED *p = + reinterpret_cast<const BrigDirectiveModule*>(dirPtr); + + DPRINTF(HSAILObject, "BRIG_DIRECTIVE_MODULE: %s\n", + getString(p->name)); + } + break; + + case BRIG_KIND_DIRECTIVE_CONTROL: + { + DPRINTF(HSAILObject, "DIRECTIVE_CONTROL\n"); + } + break; + + case BRIG_KIND_DIRECTIVE_PRAGMA: + { + DPRINTF(HSAILObject, "DIRECTIVE_PRAGMA\n"); + } + break; + + case BRIG_KIND_DIRECTIVE_EXTENSION: + { + DPRINTF(HSAILObject, "DIRECTIVE_EXTENSION\n"); + } + break; + + case BRIG_KIND_DIRECTIVE_ARG_BLOCK_START: + { + DPRINTF(HSAILObject, "DIRECTIVE_ARG_BLOCK_START\n"); + } + break; + + case BRIG_KIND_DIRECTIVE_ARG_BLOCK_END: + { + DPRINTF(HSAILObject, "DIRECTIVE_ARG_BLOCK_END\n"); + } + break; + default: + if (dirPtr->kind >= BRIG_KIND_INST_BEGIN && + dirPtr->kind <= BRIG_KIND_INST_END) + break; + + if (dirPtr->kind >= BRIG_KIND_OPERAND_BEGIN && + dirPtr->kind <= BRIG_KIND_OPERAND_END) + break; + + warn("Unknown Brig directive kind: %d\n", dirPtr->kind); + break; + } + + dirPtr = nextDirPtr; + } +} + +HsaObject* +BrigObject::tryFile(const std::string &fname, int len, uint8_t *fileData) +{ + const char *brig_ident = "HSA BRIG"; + + if (memcmp(brig_ident, fileData, MODULE_IDENTIFICATION_LENGTH)) + return nullptr; + + return new BrigObject(fname, len, fileData); +} + +BrigObject::BrigObject(const std::string &fname, int len, uint8_t *fileData) + : HsaObject(fname), storageMap(new StorageMap()) +{ + const char *brig_ident = "HSA BRIG"; + BrigModuleHeader *mod_hdr = (BrigModuleHeader*)fileData; + + fatal_if(memcmp(brig_ident, mod_hdr, MODULE_IDENTIFICATION_LENGTH), + "%s is not a BRIG file\n", fname); + + if (mod_hdr->brigMajor != BRIG_VERSION_BRIG_MAJOR || + mod_hdr->brigMinor != BRIG_VERSION_BRIG_MINOR) { + fatal("%s: BRIG version mismatch, %d.%d != %d.%d\n", + fname, mod_hdr->brigMajor, mod_hdr->brigMinor, + BRIG_VERSION_BRIG_MAJOR, BRIG_VERSION_BRIG_MINOR); + } + + fatal_if(mod_hdr->sectionCount != NumSectionIndices, "%s: BRIG section " + "count (%d) != expected value (%d)\n", fname, + mod_hdr->sectionCount, NumSectionIndices); + + for (int i = 0; i < NumSectionIndices; ++i) { + sectionInfo[i].ptr = nullptr; + } + + uint64_t *sec_idx_table = (uint64_t*)(fileData + mod_hdr->sectionIndex); + for (int sec_idx = 0; sec_idx < mod_hdr->sectionCount; ++sec_idx) { + uint8_t *sec_hdr_byte_ptr = fileData + sec_idx_table[sec_idx]; + BrigSectionHeader *sec_hdr = (BrigSectionHeader*)sec_hdr_byte_ptr; + + // It doesn't look like cprintf supports string precision values, + // but if this breaks, the right answer is to fix that + DPRINTF(HSAILObject, "found section %.*s\n", sec_hdr->nameLength, + sec_hdr->name); + + sectionInfo[sec_idx].ptr = new uint8_t[sec_hdr->byteCount]; + memcpy(sectionInfo[sec_idx].ptr, sec_hdr_byte_ptr, sec_hdr->byteCount); + sectionInfo[sec_idx].size = sec_hdr->byteCount; + } + + BrigSectionHeader *code_hdr = + (BrigSectionHeader*)sectionInfo[CodeSectionIndex].ptr; + + DPRINTF(HSAILObject, "Code section hdr, count: %d, hdr count: %d, " + "name len: %d\n", code_hdr->byteCount, code_hdr->headerByteCount, + code_hdr->nameLength); + + // start at offset 4 to skip initial null entry (see Brig spec) + processDirectives(getCodeSectionEntry(code_hdr->headerByteCount), + getCodeSectionEntry(sectionInfo[CodeSectionIndex].size), + storageMap); + + delete[] fileData; + + DPRINTF(HSALoader, "BRIG object %s loaded.\n", fname); +} + +BrigObject::~BrigObject() +{ + for (int i = 0; i < NumSectionIndices; ++i) + if (sectionInfo[i].ptr) + 
delete[] sectionInfo[i].ptr; +} diff --git a/src/gpu-compute/brig_object.hh b/src/gpu-compute/brig_object.hh new file mode 100644 index 000000000..59a585914 --- /dev/null +++ b/src/gpu-compute/brig_object.hh @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt, Anthony Gutierrez + */ + +#ifndef __BRIG_OBJECT_HH__ +#define __BRIG_OBJECT_HH__ + +#include <cassert> +#include <cstdint> +#include <string> +#include <vector> + +#include "arch/hsail/Brig.h" +#include "gpu-compute/hsa_object.hh" +#include "gpu-compute/hsail_code.hh" + +class LabelMap; +class StorageMap; + +/* @class BrigObject + * this class implements the BRIG loader object, and + * is used when the simulator directly executes HSAIL. + * this class is responsible for extracting all + * information about kernels contained in BRIG format + * and converts them to HsailCode objects that are + * usable by the simulator and emulated runtime. 
+ */ + +class BrigObject final : public HsaObject +{ + public: + enum SectionIndex + { + DataSectionIndex, + CodeSectionIndex, + OperandsSectionIndex, + NumSectionIndices + }; + + static const char *sectionNames[]; + + struct SectionInfo + { + uint8_t *ptr; + int size; + }; + + static HsaObject* tryFile(const std::string &fname, int len, + uint8_t *fileData); + + SectionInfo sectionInfo[NumSectionIndices]; + const uint8_t *getSectionOffset(enum SectionIndex sec, int offs) const; + + std::vector<HsailCode*> kernels; + std::vector<HsailCode*> functions; + std::string kern_block_name; + + void processDirectives(const Brig::BrigBase *dirPtr, + const Brig::BrigBase *endPtr, + StorageMap *storageMap); + + BrigObject(const std::string &fname, int len, uint8_t *fileData); + ~BrigObject(); + + // eventually these will need to be per-kernel not per-object-file + StorageMap *storageMap; + LabelMap *labelMap; + + const char* getString(int offs) const; + const Brig::BrigData* getBrigBaseData(int offs) const; + const uint8_t* getData(int offs) const; + const Brig::BrigBase* getCodeSectionEntry(int offs) const; + const Brig::BrigOperand* getOperand(int offs) const; + unsigned getOperandPtr(int offs, int index) const; + const Brig::BrigInstBase* getInst(int offs) const; + + HsaCode* getKernel(const std::string &name) const override; + HsaCode* getFunction(const std::string &name) const override; + + int numKernels() const override { return kernels.size(); } + + HsaCode* getKernel(int i) const override { return kernels[i]; } + + // pointer to the current kernel/function we're processing, so elements + // under construction can reference it. kinda ugly, but easier + // than passing it all over for the few places it's needed. + mutable HsailCode *currentCode; +}; + +// Utility function to bump Brig item pointer to next element given +// item size in bytes. Really just an add but with lots of casting. +template<typename T> +T* +brigNext(T *ptr) +{ + Brig::BrigBase *base_ptr = (Brig::BrigBase*)ptr; + int size = base_ptr->byteCount; + assert(size); + + return (T*)((uint8_t*)ptr + size); +} + +#endif // __BRIG_OBJECT_HH__ diff --git a/src/gpu-compute/cl_driver.cc b/src/gpu-compute/cl_driver.cc new file mode 100644 index 000000000..3b3291c03 --- /dev/null +++ b/src/gpu-compute/cl_driver.cc @@ -0,0 +1,272 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#include "gpu-compute/cl_driver.hh" + +#include "base/intmath.hh" +#include "cpu/thread_context.hh" +#include "gpu-compute/dispatcher.hh" +#include "gpu-compute/hsa_code.hh" +#include "gpu-compute/hsa_kernel_info.hh" +#include "gpu-compute/hsa_object.hh" +#include "params/ClDriver.hh" +#include "sim/process.hh" +#include "sim/syscall_emul_buf.hh" + +ClDriver::ClDriver(ClDriverParams *p) + : EmulatedDriver(p), hsaCode(0) +{ + for (const auto &codeFile : p->codefile) + codeFiles.push_back(&codeFile); + + maxFuncArgsSize = 0; + + for (int i = 0; i < codeFiles.size(); ++i) { + HsaObject *obj = HsaObject::createHsaObject(*codeFiles[i]); + + for (int k = 0; k < obj->numKernels(); ++k) { + assert(obj->getKernel(k)); + kernels.push_back(obj->getKernel(k)); + kernels.back()->setReadonlyData((uint8_t*)obj->readonlyData); + int kern_funcargs_size = kernels.back()->funcarg_size; + maxFuncArgsSize = maxFuncArgsSize < kern_funcargs_size ? + kern_funcargs_size : maxFuncArgsSize; + } + } + + int name_offs = 0; + int code_offs = 0; + + for (int i = 0; i < kernels.size(); ++i) { + kernelInfo.push_back(HsaKernelInfo()); + HsaCode *k = kernels[i]; + + k->generateHsaKernelInfo(&kernelInfo[i]); + + kernelInfo[i].name_offs = name_offs; + kernelInfo[i].code_offs = code_offs; + + name_offs += k->name().size() + 1; + code_offs += k->numInsts() * sizeof(GPUStaticInst*); + } +} + +void +ClDriver::handshake(GpuDispatcher *_dispatcher) +{ + dispatcher = _dispatcher; + dispatcher->setFuncargsSize(maxFuncArgsSize); +} + +int +ClDriver::open(LiveProcess *p, ThreadContext *tc, int mode, int flags) +{ + int fd = p->allocFD(-1, filename, 0, 0, false); + FDEntry *fde = p->getFDEntry(fd); + fde->driver = this; + + return fd; +} + +int +ClDriver::ioctl(LiveProcess *process, ThreadContext *tc, unsigned req) +{ + int index = 2; + Addr buf_addr = process->getSyscallArg(tc, index); + + switch (req) { + case HSA_GET_SIZES: + { + TypedBufferArg<HsaDriverSizes> sizes(buf_addr); + sizes->num_kernels = kernels.size(); + sizes->string_table_size = 0; + sizes->code_size = 0; + sizes->readonly_size = 0; + + if (kernels.size() > 0) { + // all kernels will share the same read-only memory + sizes->readonly_size = + kernels[0]->getSize(HsaCode::MemorySegment::READONLY); + // check our assumption + for (int i = 1; i<kernels.size(); ++i) { + assert(sizes->readonly_size == + kernels[i]->getSize(HsaCode::MemorySegment::READONLY)); + } + } + + for (int i = 0; i < kernels.size(); ++i) { + HsaCode *k = kernels[i]; + // add one for terminating '\0' + sizes->string_table_size += k->name().size() + 1; + sizes->code_size += k->numInsts() * sizeof(GPUStaticInst*); + } + + sizes.copyOut(tc->getMemProxy()); + } + break; + + case HSA_GET_KINFO: + { + TypedBufferArg<HsaKernelInfo> + kinfo(buf_addr, sizeof(HsaKernelInfo) * kernels.size()); + + for (int i = 0; i < kernels.size(); ++i) { + HsaKernelInfo *ki = &kinfo[i]; + ki->name_offs = kernelInfo[i].name_offs; + ki->code_offs = 
kernelInfo[i].code_offs; + ki->sRegCount = kernelInfo[i].sRegCount; + ki->dRegCount = kernelInfo[i].dRegCount; + ki->cRegCount = kernelInfo[i].cRegCount; + ki->static_lds_size = kernelInfo[i].static_lds_size; + ki->private_mem_size = kernelInfo[i].private_mem_size; + ki->spill_mem_size = kernelInfo[i].spill_mem_size; + } + + kinfo.copyOut(tc->getMemProxy()); + } + break; + + case HSA_GET_STRINGS: + { + int string_table_size = 0; + for (int i = 0; i < kernels.size(); ++i) { + HsaCode *k = kernels[i]; + string_table_size += k->name().size() + 1; + } + + BufferArg buf(buf_addr, string_table_size); + char *bufp = (char*)buf.bufferPtr(); + + for (int i = 0; i < kernels.size(); ++i) { + HsaCode *k = kernels[i]; + const char *n = k->name().c_str(); + + // idiomatic string copy + while ((*bufp++ = *n++)); + } + + assert(bufp - (char *)buf.bufferPtr() == string_table_size); + + buf.copyOut(tc->getMemProxy()); + } + break; + + case HSA_GET_READONLY_DATA: + { + // we can pick any kernel --- they share the same + // readonly segment (this assumption is checked in GET_SIZES) + uint64_t size = + kernels.back()->getSize(HsaCode::MemorySegment::READONLY); + BufferArg data(buf_addr, size); + char *datap = (char *)data.bufferPtr(); + memcpy(datap, + kernels.back()->readonly_data, + size); + data.copyOut(tc->getMemProxy()); + } + break; + + case HSA_GET_CODE: + { + // set hsaCode pointer + hsaCode = buf_addr; + int code_size = 0; + + for (int i = 0; i < kernels.size(); ++i) { + HsaCode *k = kernels[i]; + code_size += k->numInsts() * sizeof(TheGpuISA::RawMachInst); + } + + TypedBufferArg<TheGpuISA::RawMachInst> buf(buf_addr, code_size); + TheGpuISA::RawMachInst *bufp = buf; + + int buf_idx = 0; + + for (int i = 0; i < kernels.size(); ++i) { + HsaCode *k = kernels[i]; + + for (int j = 0; j < k->numInsts(); ++j) { + bufp[buf_idx] = k->insts()->at(j); + ++buf_idx; + } + } + + buf.copyOut(tc->getMemProxy()); + } + break; + + case HSA_GET_CU_CNT: + { + BufferArg buf(buf_addr, sizeof(uint32_t)); + *((uint32_t*)buf.bufferPtr()) = dispatcher->getNumCUs(); + buf.copyOut(tc->getMemProxy()); + } + break; + + case HSA_GET_VSZ: + { + BufferArg buf(buf_addr, sizeof(uint32_t)); + *((uint32_t*)buf.bufferPtr()) = VSZ; + buf.copyOut(tc->getMemProxy()); + } + break; + + default: + fatal("ClDriver: bad ioctl %d\n", req); + } + + return 0; +} + +const char* +ClDriver::codeOffToKernelName(uint64_t code_ptr) +{ + assert(hsaCode); + uint32_t code_offs = code_ptr - hsaCode; + + for (int i = 0; i < kernels.size(); ++i) { + if (code_offs == kernelInfo[i].code_offs) { + return kernels[i]->name().c_str(); + } + } + + return nullptr; +} + +ClDriver* +ClDriverParams::create() +{ + return new ClDriver(this); +} diff --git a/src/gpu-compute/cl_driver.hh b/src/gpu-compute/cl_driver.hh new file mode 100644 index 000000000..03567bab5 --- /dev/null +++ b/src/gpu-compute/cl_driver.hh @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#ifndef __CL_DRIVER_HH__ +#define __CL_DRIVER_HH__ + +#include <vector> + +#include "gpu-compute/hsa_kernel_info.hh" +#include "sim/emul_driver.hh" + +class GpuDispatcher; +class HsaCode; +class LiveProcess; +class ThreadContext; + +struct ClDriverParams; + +class ClDriver final : public EmulatedDriver +{ + public: + ClDriver(ClDriverParams *p); + void handshake(GpuDispatcher *_dispatcher); + int open(LiveProcess *p, ThreadContext *tc, int mode, int flags); + int ioctl(LiveProcess *p, ThreadContext *tc, unsigned req); + const char* codeOffToKernelName(uint64_t code_ptr); + + private: + GpuDispatcher *dispatcher; + + std::vector<const std::string*> codeFiles; + + // All the kernels we know about + std::vector<HsaCode*> kernels; + std::vector<HsaCode*> functions; + + std::vector<HsaKernelInfo> kernelInfo; + + // maximum size necessary for function arguments + int maxFuncArgsSize; + // The host virtual address for the kernel code + uint64_t hsaCode; +}; + +#endif // __CL_DRIVER_HH__ diff --git a/src/gpu-compute/cl_event.hh b/src/gpu-compute/cl_event.hh new file mode 100644 index 000000000..75297a2d2 --- /dev/null +++ b/src/gpu-compute/cl_event.hh @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
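The ClDriver declared above is an EmulatedDriver: a user-level runtime opens its device file and issues the ioctl codes handled in cl_driver.cc (HSA_GET_SIZES, HSA_GET_KINFO, HSA_GET_STRINGS, HSA_GET_CODE, ...) to pull kernel metadata and code out of the loaded BRIG objects. A hedged sketch of how the driver and the GpuDispatcher from GPU.py might be attached in an SE-mode script; the device name, kernel file, and surrounding objects (process, cpu, shader) are assumptions, not part of this patch:

    # Illustrative sketch: register the driver with the emulated process and
    # point the dispatcher at the shader and driver. Names are examples.
    driver = ClDriver(filename='hsa', codefile=['kernels.brig'])
    dispatcher = GpuDispatcher(pio_addr=0x200000000, cl_driver=driver,
                               shader_pointer=shader, cpu=cpu)
    process.drivers = [driver]   # lets the app open the driver and ioctl() it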
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Authors: Marc Orr + */ + +#ifndef __GPU_CL_EVENT_HH__ +#define __GPU_CL_EVENT_HH__ + +struct HsaQueueEntry; + +class _cl_event { + public: + _cl_event() : done(false), hsaTaskPtr(nullptr), start(0), end(0) { } + + volatile bool done; + HsaQueueEntry *hsaTaskPtr; + uint64_t start; + uint64_t end; +}; + +#endif // __GPU_CL_EVENT_HH__ diff --git a/src/gpu-compute/code_enums.hh b/src/gpu-compute/code_enums.hh new file mode 100644 index 000000000..126cf6c50 --- /dev/null +++ b/src/gpu-compute/code_enums.hh @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Anthony Gutierrez + */ + +#ifndef __CODE_ENUMS_HH__ +#define __CODE_ENUMS_HH__ + +#define IS_OT_GLOBAL(a) ((a)>=Enums::OT_GLOBAL_READ \ + && (a)<=Enums::OT_GLOBAL_LDAS) +#define IS_OT_SHARED(a) ((a)>=Enums::OT_SHARED_READ \ + && (a)<=Enums::OT_SHARED_LDAS) +#define IS_OT_PRIVATE(a) ((a)>=Enums::OT_PRIVATE_READ \ + && (a)<=Enums::OT_PRIVATE_LDAS) +#define IS_OT_SPILL(a) ((a)>=Enums::OT_SPILL_READ \ + && (a)<=Enums::OT_SPILL_LDAS) +#define IS_OT_READONLY(a) ((a)>=Enums::OT_READONLY_READ \ + && (a)<=Enums::OT_READONLY_LDAS) +#define IS_OT_FLAT(a) ((a)>=Enums::OT_FLAT_READ && (a)<=Enums::OT_FLAT_LDAS) + +#define IS_OT_LDAS(a) ((a)==Enums::OT_GLOBAL_LDAS||(a)==Enums::OT_SHARED_LDAS \ + ||(a)==Enums::OT_PRIVATE_LDAS||(a)==Enums::OT_SPILL_LDAS \ + ||(a)==Enums::OT_READONLY_LDAS||(a)==Enums::OT_FLAT_LDAS) + +#define IS_OT_READ(a) ((a)==Enums::OT_GLOBAL_READ||(a)==Enums::OT_SHARED_READ \ + ||(a)==Enums::OT_PRIVATE_READ||(a)==Enums::OT_SPILL_READ \ + ||(a)==Enums::OT_READONLY_READ||(a)==Enums::OT_FLAT_READ) + +#define IS_OT_READ_GM(a) \ + ((a)==Enums::OT_GLOBAL_READ||(a)==Enums::OT_SPILL_READ \ + ||(a)==Enums::OT_READONLY_READ) + +#define IS_OT_READ_LM(a) ((a)==Enums::OT_SHARED_READ) + +#define IS_OT_READ_RM(a) ((a)==Enums::OT_READONLY_READ) + +#define IS_OT_READ_PM(a) ((a)==Enums::OT_PRIVATE_READ) + +#define IS_OT_WRITE(a) \ + ((a)==Enums::OT_GLOBAL_WRITE||(a)==Enums::OT_SHARED_WRITE \ + ||(a)==Enums::OT_PRIVATE_WRITE||(a)==Enums::OT_SPILL_WRITE \ + ||(a)==Enums::OT_READONLY_WRITE||(a)==Enums::OT_FLAT_WRITE) + +#define IS_OT_WRITE_GM(a) \ + ((a)==Enums::OT_GLOBAL_WRITE||(a)==Enums::OT_SPILL_WRITE \ + ||(a)==Enums::OT_READONLY_WRITE) + +#define IS_OT_WRITE_LM(a) ((a)==Enums::OT_SHARED_WRITE) + +#define IS_OT_WRITE_PM(a) ((a)==Enums::OT_PRIVATE_WRITE) + +#define IS_OT_ATOMIC(a) ((a)==Enums::OT_GLOBAL_ATOMIC \ + ||(a)==Enums::OT_SHARED_ATOMIC \ + ||(a)==Enums::OT_PRIVATE_ATOMIC \ + ||(a)==Enums::OT_SPILL_ATOMIC \ + ||(a)==Enums::OT_READONLY_ATOMIC \ + ||(a)==Enums::OT_FLAT_ATOMIC) + +#define IS_OT_ATOMIC_GM(a) ((a)==Enums::OT_GLOBAL_ATOMIC \ + ||(a)==Enums::OT_SPILL_ATOMIC \ + ||(a)==Enums::OT_READONLY_ATOMIC \ + ||(a)==Enums::OT_GLOBAL_MEMFENCE \ + ||(a)==Enums::OT_BOTH_MEMFENCE) + +#define IS_OT_ATOMIC_LM(a) ((a)==Enums::OT_SHARED_ATOMIC \ + ||(a)==Enums::OT_SHARED_MEMFENCE \ + ||(a)==Enums::OT_BOTH_MEMFENCE) + +#define IS_OT_ATOMIC_PM(a) ((a)==Enums::OT_PRIVATE_ATOMIC) + +#define IS_OT_HIST(a) ((a)==Enums::OT_GLOBAL_HIST \ + ||(a)==Enums::OT_SHARED_HIST \ + ||(a)==Enums::OT_PRIVATE_HIST \ + ||(a)==Enums::OT_SPILL_HIST \ + ||(a)==Enums::OT_READONLY_HIST \ + ||(a)==Enums::OT_FLAT_HIST) + +#define IS_OT_HIST_GM(a) ((a)==Enums::OT_GLOBAL_HIST \ + ||(a)==Enums::OT_SPILL_HIST \ + ||(a)==Enums::OT_READONLY_HIST) + +#define IS_OT_HIST_LM(a) ((a)==Enums::OT_SHARED_HIST) + +#define IS_OT_HIST_PM(a) ((a)==Enums::OT_PRIVATE_HIST) + +#endif // __CODE_ENUMS_HH__ diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc new file mode 100644 index 000000000..d3622007a --- /dev/null +++ b/src/gpu-compute/compute_unit.cc @@ -0,0 +1,1817 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos, Anthony Gutierrez + */ + +#include "gpu-compute/compute_unit.hh" + +#include "base/output.hh" +#include "debug/GPUDisp.hh" +#include "debug/GPUExec.hh" +#include "debug/GPUFetch.hh" +#include "debug/GPUMem.hh" +#include "debug/GPUPort.hh" +#include "debug/GPUPrefetch.hh" +#include "debug/GPUSync.hh" +#include "debug/GPUTLB.hh" +#include "gpu-compute/dispatcher.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/gpu_static_inst.hh" +#include "gpu-compute/ndrange.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/simple_pool_manager.hh" +#include "gpu-compute/vector_register_file.hh" +#include "gpu-compute/wavefront.hh" +#include "mem/page_table.hh" +#include "sim/process.hh" + +ComputeUnit::ComputeUnit(const Params *p) : MemObject(p), fetchStage(p), + scoreboardCheckStage(p), scheduleStage(p), execStage(p), + globalMemoryPipe(p), localMemoryPipe(p), rrNextMemID(0), rrNextALUWp(0), + cu_id(p->cu_id), vrf(p->vector_register_file), numSIMDs(p->num_SIMDs), + spBypassPipeLength(p->spbypass_pipe_length), + dpBypassPipeLength(p->dpbypass_pipe_length), + issuePeriod(p->issue_period), + numGlbMemUnits(p->num_global_mem_pipes), + numLocMemUnits(p->num_shared_mem_pipes), + perLaneTLB(p->perLaneTLB), prefetchDepth(p->prefetch_depth), + prefetchStride(p->prefetch_stride), prefetchType(p->prefetch_prev_type), + xact_cas_mode(p->xactCasMode), debugSegFault(p->debugSegFault), + functionalTLB(p->functionalTLB), localMemBarrier(p->localMemBarrier), + countPages(p->countPages), barrier_id(0), + vrfToCoalescerBusWidth(p->vrf_to_coalescer_bus_width), + coalescerToVrfBusWidth(p->coalescer_to_vrf_bus_width), + req_tick_latency(p->mem_req_latency * p->clk_domain->clockPeriod()), + resp_tick_latency(p->mem_resp_latency * p->clk_domain->clockPeriod()), + _masterId(p->system->getMasterId(name() + ".ComputeUnit")), + lds(*p->localDataStore), globalSeqNum(0), wavefrontSize(p->wfSize) +{ + // this check will be eliminated once we have wavefront size support added + fatal_if(p->wfSize != VSZ, "Wavefront size parameter does not match VSZ"); + // calculate how many cycles a vector load or store will need to transfer + // its data over the corresponding buses + numCyclesPerStoreTransfer = (uint32_t)ceil((double)(VSZ * sizeof(uint32_t)) + / 
(double)vrfToCoalescerBusWidth); + + numCyclesPerLoadTransfer = (VSZ * sizeof(uint32_t)) + / coalescerToVrfBusWidth; + + lastVaddrWF.resize(numSIMDs); + wfList.resize(numSIMDs); + + for (int j = 0; j < numSIMDs; ++j) { + lastVaddrWF[j].resize(p->n_wf); + + for (int i = 0; i < p->n_wf; ++i) { + lastVaddrWF[j][i].resize(VSZ); + + wfList[j].push_back(p->wavefronts[j * p->n_wf + i]); + wfList[j][i]->setParent(this); + + for (int k = 0; k < VSZ; ++k) { + lastVaddrWF[j][i][k] = 0; + } + } + } + + lastVaddrPhase.resize(numSIMDs); + + for (int i = 0; i < numSIMDs; ++i) { + lastVaddrPhase[i] = LastVaddrWave(); + } + + lastVaddrCU = LastVaddrWave(); + + lds.setParent(this); + + if (p->execPolicy == "OLDEST-FIRST") { + exec_policy = EXEC_POLICY::OLDEST; + } else if (p->execPolicy == "ROUND-ROBIN") { + exec_policy = EXEC_POLICY::RR; + } else { + fatal("Invalid WF execution policy (CU)\n"); + } + + memPort.resize(VSZ); + + // resize the tlbPort vectorArray + int tlbPort_width = perLaneTLB ? VSZ : 1; + tlbPort.resize(tlbPort_width); + + cuExitCallback = new CUExitCallback(this); + registerExitCallback(cuExitCallback); + + xactCasLoadMap.clear(); + lastExecCycle.resize(numSIMDs, 0); + + for (int i = 0; i < vrf.size(); ++i) { + vrf[i]->setParent(this); + } + + numVecRegsPerSimd = vrf[0]->numRegs(); +} + +ComputeUnit::~ComputeUnit() +{ + // Delete wavefront slots + + for (int j = 0; j < numSIMDs; ++j) + for (int i = 0; i < shader->n_wf; ++i) { + delete wfList[j][i]; + } + + readyList.clear(); + waveStatusList.clear(); + dispatchList.clear(); + vectorAluInstAvail.clear(); + delete cuExitCallback; + delete ldsPort; +} + +void +ComputeUnit::FillKernelState(Wavefront *w, NDRange *ndr) +{ + w->resizeRegFiles(ndr->q.cRegCount, ndr->q.sRegCount, ndr->q.dRegCount); + + w->workgroupsz[0] = ndr->q.wgSize[0]; + w->workgroupsz[1] = ndr->q.wgSize[1]; + w->workgroupsz[2] = ndr->q.wgSize[2]; + w->wg_sz = w->workgroupsz[0] * w->workgroupsz[1] * w->workgroupsz[2]; + w->gridsz[0] = ndr->q.gdSize[0]; + w->gridsz[1] = ndr->q.gdSize[1]; + w->gridsz[2] = ndr->q.gdSize[2]; + w->kernelArgs = ndr->q.args; + w->privSizePerItem = ndr->q.privMemPerItem; + w->spillSizePerItem = ndr->q.spillMemPerItem; + w->roBase = ndr->q.roMemStart; + w->roSize = ndr->q.roMemTotal; +} + +void +ComputeUnit::InitializeWFContext(WFContext *wfCtx, NDRange *ndr, int cnt, + int trueWgSize[], int trueWgSizeTotal, + LdsChunk *ldsChunk, uint64_t origSpillMemStart) +{ + wfCtx->cnt = cnt; + + VectorMask init_mask; + init_mask.reset(); + + for (int k = 0; k < VSZ; ++k) { + if (k + cnt * VSZ < trueWgSizeTotal) + init_mask[k] = 1; + } + + wfCtx->init_mask = init_mask.to_ullong(); + wfCtx->exec_mask = init_mask.to_ullong(); + + for (int i = 0; i < VSZ; ++i) { + wfCtx->bar_cnt[i] = 0; + } + + wfCtx->max_bar_cnt = 0; + wfCtx->old_barrier_cnt = 0; + wfCtx->barrier_cnt = 0; + + wfCtx->privBase = ndr->q.privMemStart; + ndr->q.privMemStart += ndr->q.privMemPerItem * VSZ; + + wfCtx->spillBase = ndr->q.spillMemStart; + ndr->q.spillMemStart += ndr->q.spillMemPerItem * VSZ; + + wfCtx->pc = 0; + wfCtx->rpc = UINT32_MAX; + + // set the wavefront context to have a pointer to this section of the LDS + wfCtx->ldsChunk = ldsChunk; + + // WG state + wfCtx->wg_id = ndr->globalWgId; + wfCtx->barrier_id = barrier_id; + + // Kernel wide state + wfCtx->ndr = ndr; +} + +void +ComputeUnit::updateEvents() { + + if (!timestampVec.empty()) { + uint32_t vecSize = timestampVec.size(); + uint32_t i = 0; + while (i < vecSize) { + if (timestampVec[i] <= shader->tick_cnt) { + std::pair<uint32_t, 
uint32_t> regInfo = regIdxVec[i]; + vrf[regInfo.first]->markReg(regInfo.second, sizeof(uint32_t), + statusVec[i]); + timestampVec.erase(timestampVec.begin() + i); + regIdxVec.erase(regIdxVec.begin() + i); + statusVec.erase(statusVec.begin() + i); + --vecSize; + --i; + } + ++i; + } + } + + for (int i = 0; i< numSIMDs; ++i) { + vrf[i]->updateEvents(); + } +} + + +void +ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[], + int trueWgSizeTotal) +{ + static int _n_wave = 0; + int cnt = wfCtx->cnt; + NDRange *ndr = wfCtx->ndr; + + // Fill in Kernel state + FillKernelState(w, ndr); + + w->kern_id = ndr->dispatchId; + w->dynwaveid = cnt; + w->init_mask = wfCtx->init_mask; + + for (int k = 0; k < VSZ; ++k) { + w->workitemid[0][k] = (k+cnt*VSZ) % trueWgSize[0]; + w->workitemid[1][k] = ((k + cnt * VSZ) / trueWgSize[0]) % trueWgSize[1]; + w->workitemid[2][k] = (k + cnt * VSZ) / (trueWgSize[0] * trueWgSize[1]); + + w->workitemFlatId[k] = w->workitemid[2][k] * trueWgSize[0] * + trueWgSize[1] + w->workitemid[1][k] * trueWgSize[0] + + w->workitemid[0][k]; + } + + w->old_barrier_cnt = wfCtx->old_barrier_cnt; + w->barrier_cnt = wfCtx->barrier_cnt; + w->barrier_slots = divCeil(trueWgSizeTotal, VSZ); + + for (int i = 0; i < VSZ; ++i) { + w->bar_cnt[i] = wfCtx->bar_cnt[i]; + } + + w->max_bar_cnt = wfCtx->max_bar_cnt; + w->privBase = wfCtx->privBase; + w->spillBase = wfCtx->spillBase; + + w->pushToReconvergenceStack(wfCtx->pc, wfCtx->rpc, wfCtx->exec_mask); + + // WG state + w->wg_id = wfCtx->wg_id; + w->dispatchid = wfCtx->ndr->dispatchId; + w->workgroupid[0] = w->wg_id % ndr->numWg[0]; + w->workgroupid[1] = (w->wg_id / ndr->numWg[0]) % ndr->numWg[1]; + w->workgroupid[2] = w->wg_id / (ndr->numWg[0] * ndr->numWg[1]); + + w->barrier_id = wfCtx->barrier_id; + w->stalledAtBarrier = false; + + // move this from the context into the actual wavefront + w->ldsChunk = wfCtx->ldsChunk; + + int32_t refCount M5_VAR_USED = + lds.increaseRefCounter(w->dispatchid, w->wg_id); + DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n", + cu_id, w->wg_id, refCount); + + w->instructionBuffer.clear(); + + if (w->pendingFetch) + w->dropFetch = true; + + // is this the last wavefront in the workgroup + // if set the spillWidth to be the remaining work-items + // so that the vector access is correct + if ((cnt + 1) * VSZ >= trueWgSizeTotal) { + w->spillWidth = trueWgSizeTotal - (cnt * VSZ); + } else { + w->spillWidth = VSZ; + } + + DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: " + "WF[%d][%d]\n", _n_wave, barrier_id, cu_id, w->simdId, w->wfSlotId); + + w->start(++_n_wave, ndr->q.code_ptr); +} + +void +ComputeUnit::StartWorkgroup(NDRange *ndr) +{ + // reserve the LDS capacity allocated to the work group + // disambiguated by the dispatch ID and workgroup ID, which should be + // globally unique + LdsChunk *ldsChunk = lds.reserveSpace(ndr->dispatchId, ndr->globalWgId, + ndr->q.ldsSize); + + // Send L1 cache acquire + // isKernel + isAcquire = Kernel Begin + if (shader->impl_kern_boundary_sync) { + GPUDynInstPtr gpuDynInst = std::make_shared<GPUDynInst>(nullptr, + nullptr, + nullptr, 0); + + gpuDynInst->useContinuation = false; + gpuDynInst->memoryOrder = Enums::MEMORY_ORDER_SC_ACQUIRE; + gpuDynInst->scope = Enums::MEMORY_SCOPE_SYSTEM; + injectGlobalMemFence(gpuDynInst, true); + } + + // Get true size of workgroup (after clamping to grid size) + int trueWgSize[3]; + int trueWgSizeTotal = 1; + + for (int d = 0; d < 3; ++d) { + trueWgSize[d] = std::min(ndr->q.wgSize[d], ndr->q.gdSize[d] - + 
ndr->wgId[d] * ndr->q.wgSize[d]); + + trueWgSizeTotal *= trueWgSize[d]; + } + + uint64_t origSpillMemStart = ndr->q.spillMemStart; + // calculate the number of 32-bit vector registers required by wavefront + int vregDemand = ndr->q.sRegCount + (2 * ndr->q.dRegCount); + int cnt = 0; + + // Assign WFs by spreading them across SIMDs, 1 WF per SIMD at a time + for (int m = 0; m < shader->n_wf * numSIMDs; ++m) { + Wavefront *w = wfList[m % numSIMDs][m / numSIMDs]; + // Check if this wavefront slot is available: + // It must be stopped and not waiting + // for a release to complete S_RETURNING + if (w->status == Wavefront::S_STOPPED) { + // if we have scheduled all work items then stop + // scheduling wavefronts + if (cnt * VSZ >= trueWgSizeTotal) + break; + + // reserve vector registers for the scheduled wavefront + assert(vectorRegsReserved[m % numSIMDs] <= numVecRegsPerSimd); + uint32_t normSize = 0; + + w->startVgprIndex = vrf[m % numSIMDs]->manager-> + allocateRegion(vregDemand, &normSize); + + w->reservedVectorRegs = normSize; + vectorRegsReserved[m % numSIMDs] += w->reservedVectorRegs; + + WFContext wfCtx; + + InitializeWFContext(&wfCtx, ndr, cnt, trueWgSize, trueWgSizeTotal, + ldsChunk, origSpillMemStart); + + StartWF(w, &wfCtx, trueWgSize, trueWgSizeTotal); + ++cnt; + } + } + ++barrier_id; +} + +int +ComputeUnit::ReadyWorkgroup(NDRange *ndr) +{ + // Get true size of workgroup (after clamping to grid size) + int trueWgSize[3]; + int trueWgSizeTotal = 1; + + for (int d = 0; d < 3; ++d) { + trueWgSize[d] = std::min(ndr->q.wgSize[d], ndr->q.gdSize[d] - + ndr->wgId[d] * ndr->q.wgSize[d]); + + trueWgSizeTotal *= trueWgSize[d]; + DPRINTF(GPUDisp, "trueWgSize[%d] = %d\n", d, trueWgSize[d]); + } + + DPRINTF(GPUDisp, "trueWgSizeTotal = %d\n", trueWgSizeTotal); + + // calculate the number of 32-bit vector registers required by each + // work item of the work group + int vregDemandPerWI = ndr->q.sRegCount + (2 * ndr->q.dRegCount); + bool vregAvail = true; + int numWfs = (trueWgSizeTotal + VSZ - 1) / VSZ; + int freeWfSlots = 0; + // check if the total number of VGPRs required by all WFs of the WG + // fit in the VRFs of all SIMD units + assert((numWfs * vregDemandPerWI) <= (numSIMDs * numVecRegsPerSimd)); + int numMappedWfs = 0; + std::vector<int> numWfsPerSimd; + numWfsPerSimd.resize(numSIMDs, 0); + // find how many free WF slots we have across all SIMDs + for (int j = 0; j < shader->n_wf; ++j) { + for (int i = 0; i < numSIMDs; ++i) { + if (wfList[i][j]->status == Wavefront::S_STOPPED) { + // count the number of free WF slots + ++freeWfSlots; + if (numMappedWfs < numWfs) { + // count the WFs to be assigned per SIMD + numWfsPerSimd[i]++; + } + numMappedWfs++; + } + } + } + + // if there are enough free WF slots then find if there are enough + // free VGPRs per SIMD based on the WF->SIMD mapping + if (freeWfSlots >= numWfs) { + for (int j = 0; j < numSIMDs; ++j) { + // find if there are enough free VGPR regions in the SIMD's VRF + // to accommodate the WFs of the new WG that would be mapped to + // this SIMD unit + vregAvail = vrf[j]->manager->canAllocate(numWfsPerSimd[j], + vregDemandPerWI); + + // stop searching if there is at least one SIMD + // whose VRF does not have enough free VGPR pools. 
+ // This is because a WG is scheduled only if ALL + // of its WFs can be scheduled + if (!vregAvail) + break; + } + } + + DPRINTF(GPUDisp, "Free WF slots = %d, VGPR Availability = %d\n", + freeWfSlots, vregAvail); + + if (!vregAvail) { + ++numTimesWgBlockedDueVgprAlloc; + } + + // Return true if enough WF slots to submit workgroup and if there are + // enough VGPRs to schedule all WFs to their SIMD units + if (!lds.canReserve(ndr->q.ldsSize)) { + wgBlockedDueLdsAllocation++; + } + + // Return true if (a) there are enough free WF slots to submit + // workgrounp and (b) if there are enough VGPRs to schedule all WFs to their + // SIMD units and (c) if there is enough space in LDS + return freeWfSlots >= numWfs && vregAvail && lds.canReserve(ndr->q.ldsSize); +} + +int +ComputeUnit::AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots) +{ + DPRINTF(GPUSync, "CU%d: Checking for All At Barrier\n", cu_id); + int ccnt = 0; + + for (int i_simd = 0; i_simd < numSIMDs; ++i_simd) { + for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf) { + Wavefront *w = wfList[i_simd][i_wf]; + + if (w->status == Wavefront::S_RUNNING) { + DPRINTF(GPUSync, "Checking WF[%d][%d]\n", i_simd, i_wf); + + DPRINTF(GPUSync, "wf->barrier_id = %d, _barrier_id = %d\n", + w->barrier_id, _barrier_id); + + DPRINTF(GPUSync, "wf->barrier_cnt %d, bcnt = %d\n", + w->barrier_cnt, bcnt); + } + + if (w->status == Wavefront::S_RUNNING && + w->barrier_id == _barrier_id && w->barrier_cnt == bcnt && + !w->outstanding_reqs) { + ++ccnt; + + DPRINTF(GPUSync, "WF[%d][%d] at barrier, increment ccnt to " + "%d\n", i_simd, i_wf, ccnt); + } + } + } + + DPRINTF(GPUSync, "CU%d: returning allAtBarrier ccnt = %d, bslots = %d\n", + cu_id, ccnt, bslots); + + return ccnt == bslots; +} + +// Check if the current wavefront is blocked on additional resources. +bool +ComputeUnit::cedeSIMD(int simdId, int wfSlotId) +{ + bool cede = false; + + // If --xact-cas-mode option is enabled in run.py, then xact_cas_ld + // magic instructions will impact the scheduling of wavefronts + if (xact_cas_mode) { + /* + * When a wavefront calls xact_cas_ld, it adds itself to a per address + * queue. All per address queues are managed by the xactCasLoadMap. + * + * A wavefront is not blocked if: it is not in ANY per address queue or + * if it is at the head of a per address queue. + */ + for (auto itMap : xactCasLoadMap) { + std::list<waveIdentifier> curWaveIDQueue = itMap.second.waveIDQueue; + + if (!curWaveIDQueue.empty()) { + for (auto it : curWaveIDQueue) { + waveIdentifier cur_wave = it; + + if (cur_wave.simdId == simdId && + cur_wave.wfSlotId == wfSlotId) { + // 2 possibilities + // 1: this WF has a green light + // 2: another WF has a green light + waveIdentifier owner_wave = curWaveIDQueue.front(); + + if (owner_wave.simdId != cur_wave.simdId || + owner_wave.wfSlotId != cur_wave.wfSlotId) { + // possibility 2 + cede = true; + break; + } else { + // possibility 1 + break; + } + } + } + } + } + } + + return cede; +} + +// Execute one clock worth of work on the ComputeUnit. 
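+// Note (descriptive comment, added for clarity): the stages below are invoked +// back to front (memory pipes and execute first, fetch last) so that work +// produced by an earlier pipeline stage in a given cycle is not consumed by a +// later stage until the following cycle.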
+void +ComputeUnit::exec() +{ + updateEvents(); + // Execute pipeline stages in reverse order to simulate + // the pipeline latency + globalMemoryPipe.exec(); + localMemoryPipe.exec(); + execStage.exec(); + scheduleStage.exec(); + scoreboardCheckStage.exec(); + fetchStage.exec(); + + totalCycles++; +} + +void +ComputeUnit::init() +{ + // Initialize CU Bus models + glbMemToVrfBus.init(&shader->tick_cnt, 1); + locMemToVrfBus.init(&shader->tick_cnt, 1); + nextGlbMemBus = 0; + nextLocMemBus = 0; + fatal_if(numGlbMemUnits > 1, + "No support for multiple Global Memory Pipelines exists!!!"); + vrfToGlobalMemPipeBus.resize(numGlbMemUnits); + for (int j = 0; j < numGlbMemUnits; ++j) { + vrfToGlobalMemPipeBus[j] = WaitClass(); + vrfToGlobalMemPipeBus[j].init(&shader->tick_cnt, 1); + } + + fatal_if(numLocMemUnits > 1, + "No support for multiple Local Memory Pipelines exists!!!"); + vrfToLocalMemPipeBus.resize(numLocMemUnits); + for (int j = 0; j < numLocMemUnits; ++j) { + vrfToLocalMemPipeBus[j] = WaitClass(); + vrfToLocalMemPipeBus[j].init(&shader->tick_cnt, 1); + } + vectorRegsReserved.resize(numSIMDs, 0); + aluPipe.resize(numSIMDs); + wfWait.resize(numSIMDs + numLocMemUnits + numGlbMemUnits); + + for (int i = 0; i < numSIMDs + numLocMemUnits + numGlbMemUnits; ++i) { + wfWait[i] = WaitClass(); + wfWait[i].init(&shader->tick_cnt, 1); + } + + for (int i = 0; i < numSIMDs; ++i) { + aluPipe[i] = WaitClass(); + aluPipe[i].init(&shader->tick_cnt, 1); + } + + // Setup space for call args + for (int j = 0; j < numSIMDs; ++j) { + for (int i = 0; i < shader->n_wf; ++i) { + wfList[j][i]->initCallArgMem(shader->funcargs_size); + } + } + + // Initializing pipeline resources + readyList.resize(numSIMDs + numGlbMemUnits + numLocMemUnits); + waveStatusList.resize(numSIMDs); + + for (int j = 0; j < numSIMDs; ++j) { + for (int i = 0; i < shader->n_wf; ++i) { + waveStatusList[j].push_back( + std::make_pair(wfList[j][i], BLOCKED)); + } + } + + for (int j = 0; j < (numSIMDs + numGlbMemUnits + numLocMemUnits); ++j) { + dispatchList.push_back(std::make_pair((Wavefront*)nullptr, EMPTY)); + } + + fetchStage.init(this); + scoreboardCheckStage.init(this); + scheduleStage.init(this); + execStage.init(this); + globalMemoryPipe.init(this); + localMemoryPipe.init(this); + // initialize state for statistics calculation + vectorAluInstAvail.resize(numSIMDs, false); + shrMemInstAvail = 0; + glbMemInstAvail = 0; +} + +bool +ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt) +{ + // Ruby has completed the memory op. 
Schedule the mem_resp_event at the + // appropriate cycle to process the timing memory response + // This delay represents the pipeline delay + SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState); + int index = sender_state->port_index; + GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst; + + // Is the packet returned a Kernel End or Barrier + if (pkt->req->isKernel() && pkt->req->isRelease()) { + Wavefront *w = + computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId]; + + // Check if we are waiting on Kernel End Release + if (w->status == Wavefront::S_RETURNING) { + DPRINTF(GPUDisp, "CU%d: WF[%d][%d][wv=%d]: WG id completed %d\n", + computeUnit->cu_id, w->simdId, w->wfSlotId, + w->wfDynId, w->kern_id); + + computeUnit->shader->dispatcher->notifyWgCompl(w); + w->status = Wavefront::S_STOPPED; + } else { + w->outstanding_reqs--; + } + + DPRINTF(GPUSync, "CU%d: WF[%d][%d]: barrier_cnt = %d\n", + computeUnit->cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId, w->barrier_cnt); + + if (gpuDynInst->useContinuation) { + assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE); + gpuDynInst->execContinuation(gpuDynInst->staticInstruction(), + gpuDynInst); + } + + delete pkt->senderState; + delete pkt->req; + delete pkt; + return true; + } else if (pkt->req->isKernel() && pkt->req->isAcquire()) { + if (gpuDynInst->useContinuation) { + assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE); + gpuDynInst->execContinuation(gpuDynInst->staticInstruction(), + gpuDynInst); + } + + delete pkt->senderState; + delete pkt->req; + delete pkt; + return true; + } + + ComputeUnit::DataPort::MemRespEvent *mem_resp_event = + new ComputeUnit::DataPort::MemRespEvent(computeUnit->memPort[index], + pkt); + + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x received!\n", + computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, + index, pkt->req->getPaddr()); + + computeUnit->schedule(mem_resp_event, + curTick() + computeUnit->resp_tick_latency); + return true; +} + +void +ComputeUnit::DataPort::recvReqRetry() +{ + int len = retries.size(); + + assert(len > 0); + + for (int i = 0; i < len; ++i) { + PacketPtr pkt = retries.front().first; + GPUDynInstPtr gpuDynInst M5_VAR_USED = retries.front().second; + DPRINTF(GPUMem, "CU%d: WF[%d][%d]: retry mem inst addr %#x\n", + computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, + pkt->req->getPaddr()); + + /** Currently Ruby can return false due to conflicts for the particular + * cache block or address. Thus other requests should be allowed to + * pass and the data port should expect multiple retries. 
*/ + if (!sendTimingReq(pkt)) { + DPRINTF(GPUMem, "failed again!\n"); + break; + } else { + DPRINTF(GPUMem, "successful!\n"); + retries.pop_front(); + } + } +} + +bool +ComputeUnit::SQCPort::recvTimingResp(PacketPtr pkt) +{ + computeUnit->fetchStage.processFetchReturn(pkt); + + return true; +} + +void +ComputeUnit::SQCPort::recvReqRetry() +{ + int len = retries.size(); + + assert(len > 0); + + for (int i = 0; i < len; ++i) { + PacketPtr pkt = retries.front().first; + Wavefront *wavefront M5_VAR_USED = retries.front().second; + DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: retrying FETCH addr %#x\n", + computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, + pkt->req->getPaddr()); + if (!sendTimingReq(pkt)) { + DPRINTF(GPUFetch, "failed again!\n"); + break; + } else { + DPRINTF(GPUFetch, "successful!\n"); + retries.pop_front(); + } + } +} + +void +ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt) +{ + // There must be a way around this check to do the globalMemStart... + Addr tmp_vaddr = pkt->req->getVaddr(); + + updatePageDivergenceDist(tmp_vaddr); + + pkt->req->setVirt(pkt->req->getAsid(), tmp_vaddr, pkt->req->getSize(), + pkt->req->getFlags(), pkt->req->masterId(), + pkt->req->getPC()); + + // figure out the type of the request to set read/write + BaseTLB::Mode TLB_mode; + assert(pkt->isRead() || pkt->isWrite()); + + // Check write before read for atomic operations + // since atomic operations should use BaseTLB::Write + if (pkt->isWrite()){ + TLB_mode = BaseTLB::Write; + } else if (pkt->isRead()) { + TLB_mode = BaseTLB::Read; + } else { + fatal("pkt is not a read nor a write\n"); + } + + tlbCycles -= curTick(); + ++tlbRequests; + + int tlbPort_index = perLaneTLB ? index : 0; + + if (shader->timingSim) { + if (debugSegFault) { + Process *p = shader->gpuTc->getProcessPtr(); + Addr vaddr = pkt->req->getVaddr(); + unsigned size = pkt->getSize(); + + if ((vaddr + size - 1) % 64 < vaddr % 64) { + panic("CU%d: WF[%d][%d]: Access to addr %#x is unaligned!\n", + cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, vaddr); + } + + Addr paddr; + + if (!p->pTable->translate(vaddr, paddr)) { + if (!p->fixupStackFault(vaddr)) { + panic("CU%d: WF[%d][%d]: Fault on addr %#x!\n", + cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, + vaddr); + } + } + } + + // This is the SenderState needed upon return + pkt->senderState = new DTLBPort::SenderState(gpuDynInst, index); + + // This is the senderState needed by the TLB hierarchy to function + TheISA::GpuTLB::TranslationState *translation_state = + new TheISA::GpuTLB::TranslationState(TLB_mode, shader->gpuTc, false, + pkt->senderState); + + pkt->senderState = translation_state; + + if (functionalTLB) { + tlbPort[tlbPort_index]->sendFunctional(pkt); + + // update the hitLevel distribution + int hit_level = translation_state->hitLevel; + assert(hit_level != -1); + hitsPerTLBLevel[hit_level]++; + + // New SenderState for the memory access + X86ISA::GpuTLB::TranslationState *sender_state = + safe_cast<X86ISA::GpuTLB::TranslationState*>(pkt->senderState); + + delete sender_state->tlbEntry; + delete sender_state->saved; + delete sender_state; + + assert(pkt->req->hasPaddr()); + assert(pkt->req->hasSize()); + + uint8_t *tmpData = pkt->getPtr<uint8_t>(); + + // this is necessary because the GPU TLB receives packets instead + // of requests. when the translation is complete, all relevent + // fields in the request will be populated, but not in the packet. + // here we create the new packet so we can set the size, addr, + // and proper flags. 
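+            // the original request (which now carries the translated physical +            // address) is reused; the old packet is freed and its data pointer +            // (saved in tmpData above) is re-attached to the rebuilt packet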
+ PacketPtr oldPkt = pkt; + pkt = new Packet(oldPkt->req, oldPkt->cmd); + delete oldPkt; + pkt->dataStatic(tmpData); + + + // New SenderState for the memory access + pkt->senderState = new ComputeUnit::DataPort::SenderState(gpuDynInst, + index, nullptr); + + gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index); + gpuDynInst->tlbHitLevel[index] = hit_level; + + + // translation is done. Schedule the mem_req_event at the + // appropriate cycle to send the timing memory request to ruby + ComputeUnit::DataPort::MemReqEvent *mem_req_event = + new ComputeUnit::DataPort::MemReqEvent(memPort[index], pkt); + + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data " + "scheduled\n", cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId, index, pkt->req->getPaddr()); + + schedule(mem_req_event, curTick() + req_tick_latency); + } else if (tlbPort[tlbPort_index]->isStalled()) { + assert(tlbPort[tlbPort_index]->retries.size() > 0); + + DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x " + "failed!\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, + tmp_vaddr); + + tlbPort[tlbPort_index]->retries.push_back(pkt); + } else if (!tlbPort[tlbPort_index]->sendTimingReq(pkt)) { + // Stall the data port; + // No more packet will be issued till + // ruby indicates resources are freed by + // a recvReqRetry() call back on this port. + tlbPort[tlbPort_index]->stallPort(); + + DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x " + "failed!\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, + tmp_vaddr); + + tlbPort[tlbPort_index]->retries.push_back(pkt); + } else { + DPRINTF(GPUTLB, + "CU%d: WF[%d][%d]: Translation for addr %#x sent!\n", + cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, tmp_vaddr); + } + } else { + if (pkt->cmd == MemCmd::MemFenceReq) { + gpuDynInst->statusBitVector = VectorMask(0); + } else { + gpuDynInst->statusBitVector &= (~(1ll << index)); + } + + // New SenderState for the memory access + delete pkt->senderState; + + // Because it's atomic operation, only need TLB translation state + pkt->senderState = new TheISA::GpuTLB::TranslationState(TLB_mode, + shader->gpuTc); + + tlbPort[tlbPort_index]->sendFunctional(pkt); + + // the addr of the packet is not modified, so we need to create a new + // packet, or otherwise the memory access will have the old virtual + // address sent in the translation packet, instead of the physical + // address returned by the translation. + PacketPtr new_pkt = new Packet(pkt->req, pkt->cmd); + new_pkt->dataStatic(pkt->getPtr<uint8_t>()); + + // Translation is done. It is safe to send the packet to memory. 
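+        // functional (non-timing) accesses are always issued through data +        // port 0, regardless of the lane index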
+ memPort[0]->sendFunctional(new_pkt); + + DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index %d: addr %#x\n", cu_id, + gpuDynInst->simdId, gpuDynInst->wfSlotId, index, + new_pkt->req->getPaddr()); + + // safe_cast the senderState + TheISA::GpuTLB::TranslationState *sender_state = + safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState); + + delete sender_state->tlbEntry; + delete new_pkt; + delete pkt->senderState; + delete pkt->req; + delete pkt; + } +} + +void +ComputeUnit::sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt) +{ + ComputeUnit::DataPort::MemReqEvent *mem_req_event = + new ComputeUnit::DataPort::MemReqEvent(memPort[index], pkt); + + + // New SenderState for the memory access + pkt->senderState = new ComputeUnit::DataPort::SenderState(gpuDynInst, index, + nullptr); + + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n", + cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, index, + pkt->req->getPaddr()); + + schedule(mem_req_event, curTick() + req_tick_latency); +} + +void +ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst, bool kernelLaunch, + Request* req) +{ + if (!req) { + req = new Request(0, 0, 0, 0, masterId(), 0, gpuDynInst->wfDynId, -1); + } + req->setPaddr(0); + if (kernelLaunch) { + req->setFlags(Request::KERNEL); + } + + gpuDynInst->s_type = SEG_GLOBAL; + + // for non-kernel MemFence operations, memorder flags are set depending + // on which type of request is currently being sent, so this + // should be set by the caller (e.g. if an inst has acq-rel + // semantics, it will send one acquire req an one release req) + gpuDynInst->setRequestFlags(req, kernelLaunch); + + // a mem fence must correspond to an acquire/release request + assert(req->isAcquire() || req->isRelease()); + + // create packet + PacketPtr pkt = new Packet(req, MemCmd::MemFenceReq); + + // set packet's sender state + pkt->senderState = + new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr); + + // send the packet + sendSyncRequest(gpuDynInst, 0, pkt); +} + +const char* +ComputeUnit::DataPort::MemRespEvent::description() const +{ + return "ComputeUnit memory response event"; +} + +void +ComputeUnit::DataPort::MemRespEvent::process() +{ + DataPort::SenderState *sender_state = + safe_cast<DataPort::SenderState*>(pkt->senderState); + + GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst; + ComputeUnit *compute_unit = dataPort->computeUnit; + + assert(gpuDynInst); + + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Response for addr %#x, index %d\n", + compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, + pkt->req->getPaddr(), dataPort->index); + + Addr paddr = pkt->req->getPaddr(); + + if (pkt->cmd != MemCmd::MemFenceResp) { + int index = gpuDynInst->memStatusVector[paddr].back(); + + DPRINTF(GPUMem, "Response for addr %#x, index %d\n", + pkt->req->getPaddr(), index); + + gpuDynInst->memStatusVector[paddr].pop_back(); + gpuDynInst->pAddr = pkt->req->getPaddr(); + + if (pkt->isRead() || pkt->isWrite()) { + + if (gpuDynInst->n_reg <= MAX_REGS_FOR_NON_VEC_MEM_INST) { + gpuDynInst->statusBitVector &= (~(1ULL << index)); + } else { + assert(gpuDynInst->statusVector[index] > 0); + gpuDynInst->statusVector[index]--; + + if (!gpuDynInst->statusVector[index]) + gpuDynInst->statusBitVector &= (~(1ULL << index)); + } + + DPRINTF(GPUMem, "bitvector is now %#x\n", + gpuDynInst->statusBitVector); + + if (gpuDynInst->statusBitVector == VectorMask(0)) { + auto iter = gpuDynInst->memStatusVector.begin(); + auto end = gpuDynInst->memStatusVector.end(); + + while 
(iter != end) { + assert(iter->second.empty()); + ++iter; + } + + gpuDynInst->memStatusVector.clear(); + + if (gpuDynInst->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST) + gpuDynInst->statusVector.clear(); + + if (gpuDynInst->m_op == Enums::MO_LD || MO_A(gpuDynInst->m_op) + || MO_ANR(gpuDynInst->m_op)) { + assert(compute_unit->globalMemoryPipe.isGMLdRespFIFOWrRdy()); + + compute_unit->globalMemoryPipe.getGMLdRespFIFO() + .push(gpuDynInst); + } else { + assert(compute_unit->globalMemoryPipe.isGMStRespFIFOWrRdy()); + + compute_unit->globalMemoryPipe.getGMStRespFIFO() + .push(gpuDynInst); + } + + DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n", + compute_unit->cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId); + + // after clearing the status vectors, + // see if there is a continuation to perform + // the continuation may generate more work for + // this memory request + if (gpuDynInst->useContinuation) { + assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE); + gpuDynInst->execContinuation(gpuDynInst->staticInstruction(), + gpuDynInst); + } + } + } + } else { + gpuDynInst->statusBitVector = VectorMask(0); + + if (gpuDynInst->useContinuation) { + assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE); + gpuDynInst->execContinuation(gpuDynInst->staticInstruction(), + gpuDynInst); + } + } + + delete pkt->senderState; + delete pkt->req; + delete pkt; +} + +ComputeUnit* +ComputeUnitParams::create() +{ + return new ComputeUnit(this); +} + +bool +ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt) +{ + Addr line = pkt->req->getPaddr(); + + DPRINTF(GPUTLB, "CU%d: DTLBPort received %#x->%#x\n", computeUnit->cu_id, + pkt->req->getVaddr(), line); + + assert(pkt->senderState); + computeUnit->tlbCycles += curTick(); + + // pop off the TLB translation state + TheISA::GpuTLB::TranslationState *translation_state = + safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState); + + // no PageFaults are permitted for data accesses + if (!translation_state->tlbEntry->valid) { + DTLBPort::SenderState *sender_state = + safe_cast<DTLBPort::SenderState*>(translation_state->saved); + + Wavefront *w M5_VAR_USED = + computeUnit->wfList[sender_state->_gpuDynInst->simdId] + [sender_state->_gpuDynInst->wfSlotId]; + + DPRINTFN("Wave %d couldn't tranlate vaddr %#x\n", w->wfDynId, + pkt->req->getVaddr()); + } + + assert(translation_state->tlbEntry->valid); + + // update the hitLevel distribution + int hit_level = translation_state->hitLevel; + computeUnit->hitsPerTLBLevel[hit_level]++; + + delete translation_state->tlbEntry; + assert(!translation_state->ports.size()); + pkt->senderState = translation_state->saved; + + // for prefetch pkt + BaseTLB::Mode TLB_mode = translation_state->tlbMode; + + delete translation_state; + + // use the original sender state to know how to close this transaction + DTLBPort::SenderState *sender_state = + safe_cast<DTLBPort::SenderState*>(pkt->senderState); + + GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst; + int mp_index = sender_state->portIndex; + Addr vaddr = pkt->req->getVaddr(); + gpuDynInst->memStatusVector[line].push_back(mp_index); + gpuDynInst->tlbHitLevel[mp_index] = hit_level; + + MemCmd requestCmd; + + if (pkt->cmd == MemCmd::ReadResp) { + requestCmd = MemCmd::ReadReq; + } else if (pkt->cmd == MemCmd::WriteResp) { + requestCmd = MemCmd::WriteReq; + } else if (pkt->cmd == MemCmd::SwapResp) { + requestCmd = MemCmd::SwapReq; + } else { + panic("unsupported response to request conversion %s\n", + pkt->cmd.toString()); + } + + if (computeUnit->prefetchDepth) { + int 
simdId = gpuDynInst->simdId; + int wfSlotId = gpuDynInst->wfSlotId; + Addr last = 0; + + switch(computeUnit->prefetchType) { + case Enums::PF_CU: + last = computeUnit->lastVaddrCU[mp_index]; + break; + case Enums::PF_PHASE: + last = computeUnit->lastVaddrPhase[simdId][mp_index]; + break; + case Enums::PF_WF: + last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index]; + default: + break; + } + + DPRINTF(GPUPrefetch, "CU[%d][%d][%d][%d]: %#x was last\n", + computeUnit->cu_id, simdId, wfSlotId, mp_index, last); + + int stride = last ? (roundDown(vaddr, TheISA::PageBytes) - + roundDown(last, TheISA::PageBytes)) >> TheISA::PageShift + : 0; + + DPRINTF(GPUPrefetch, "Stride is %d\n", stride); + + computeUnit->lastVaddrCU[mp_index] = vaddr; + computeUnit->lastVaddrPhase[simdId][mp_index] = vaddr; + computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr; + + stride = (computeUnit->prefetchType == Enums::PF_STRIDE) ? + computeUnit->prefetchStride: stride; + + DPRINTF(GPUPrefetch, "%#x to: CU[%d][%d][%d][%d]\n", vaddr, + computeUnit->cu_id, simdId, wfSlotId, mp_index); + + DPRINTF(GPUPrefetch, "Prefetching from %#x:", vaddr); + + // Prefetch Next few pages atomically + for (int pf = 1; pf <= computeUnit->prefetchDepth; ++pf) { + DPRINTF(GPUPrefetch, "%d * %d: %#x\n", pf, stride, + vaddr+stride*pf*TheISA::PageBytes); + + if (!stride) + break; + + Request *prefetch_req = new Request(0, vaddr + stride * pf * + TheISA::PageBytes, + sizeof(uint8_t), 0, + computeUnit->masterId(), + 0, 0, 0); + + PacketPtr prefetch_pkt = new Packet(prefetch_req, requestCmd); + uint8_t foo = 0; + prefetch_pkt->dataStatic(&foo); + + // Because it's atomic operation, only need TLB translation state + prefetch_pkt->senderState = + new TheISA::GpuTLB::TranslationState(TLB_mode, + computeUnit->shader->gpuTc, + true); + + // Currently prefetches are zero-latency, hence the sendFunctional + sendFunctional(prefetch_pkt); + + /* safe_cast the senderState */ + TheISA::GpuTLB::TranslationState *tlb_state = + safe_cast<TheISA::GpuTLB::TranslationState*>( + prefetch_pkt->senderState); + + + delete tlb_state->tlbEntry; + delete tlb_state; + delete prefetch_pkt->req; + delete prefetch_pkt; + } + } + + // First we must convert the response cmd back to a request cmd so that + // the request can be sent through the cu's master port + PacketPtr new_pkt = new Packet(pkt->req, requestCmd); + new_pkt->dataStatic(pkt->getPtr<uint8_t>()); + delete pkt->senderState; + delete pkt; + + // New SenderState for the memory access + new_pkt->senderState = + new ComputeUnit::DataPort::SenderState(gpuDynInst, mp_index, + nullptr); + + // translation is done. 
Schedule the mem_req_event at the appropriate + // cycle to send the timing memory request to ruby + ComputeUnit::DataPort::MemReqEvent *mem_req_event = + new ComputeUnit::DataPort::MemReqEvent(computeUnit->memPort[mp_index], + new_pkt); + + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data scheduled\n", + computeUnit->cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId, mp_index, new_pkt->req->getPaddr()); + + computeUnit->schedule(mem_req_event, curTick() + + computeUnit->req_tick_latency); + + return true; +} + +const char* +ComputeUnit::DataPort::MemReqEvent::description() const +{ + return "ComputeUnit memory request event"; +} + +void +ComputeUnit::DataPort::MemReqEvent::process() +{ + SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState); + GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst; + ComputeUnit *compute_unit M5_VAR_USED = dataPort->computeUnit; + + if (!(dataPort->sendTimingReq(pkt))) { + dataPort->retries.push_back(std::make_pair(pkt, gpuDynInst)); + + DPRINTF(GPUPort, + "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n", + compute_unit->cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId, dataPort->index, + pkt->req->getPaddr()); + } else { + DPRINTF(GPUPort, + "CU%d: WF[%d][%d]: index %d, addr %#x data req sent!\n", + compute_unit->cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId, dataPort->index, + pkt->req->getPaddr()); + } +} + +/* + * The initial translation request could have been rejected, + * if the <retries> queue is not empty. Retry sending the translation + * request. sendRetry() is called from the peer port whenever + * a translation completes. + */ +void +ComputeUnit::DTLBPort::recvReqRetry() +{ + int len = retries.size(); + + DPRINTF(GPUTLB, "CU%d: DTLB recvReqRetry - %d pending requests\n", + computeUnit->cu_id, len); + + assert(len > 0); + assert(isStalled()); + // recvReqRetry is an indication that the resource on which this + // port was stalling on is freed. So, remove the stall first + unstallPort(); + + for (int i = 0; i < len; ++i) { + PacketPtr pkt = retries.front(); + Addr vaddr M5_VAR_USED = pkt->req->getVaddr(); + DPRINTF(GPUTLB, "CU%d: retrying D-translation for address %#x", + computeUnit->cu_id, vaddr); + + if (!sendTimingReq(pkt)) { + // Stall port + stallPort(); + DPRINTF(GPUTLB, ": failed again\n"); + break; + } else { + DPRINTF(GPUTLB, ": successful\n"); + retries.pop_front(); + } + } +} + +bool +ComputeUnit::ITLBPort::recvTimingResp(PacketPtr pkt) +{ + Addr line M5_VAR_USED = pkt->req->getPaddr(); + DPRINTF(GPUTLB, "CU%d: ITLBPort received %#x->%#x\n", + computeUnit->cu_id, pkt->req->getVaddr(), line); + + assert(pkt->senderState); + + // pop off the TLB translation state + TheISA::GpuTLB::TranslationState *translation_state = + safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState); + + bool success = translation_state->tlbEntry->valid; + delete translation_state->tlbEntry; + assert(!translation_state->ports.size()); + pkt->senderState = translation_state->saved; + delete translation_state; + + // use the original sender state to know how to close this transaction + ITLBPort::SenderState *sender_state = + safe_cast<ITLBPort::SenderState*>(pkt->senderState); + + // get the wavefront associated with this translation request + Wavefront *wavefront = sender_state->wavefront; + delete pkt->senderState; + + if (success) { + // pkt is reused in fetch(), don't delete it here. 
However, we must + // reset the command to be a request so that it can be sent through + // the cu's master port + assert(pkt->cmd == MemCmd::ReadResp); + pkt->cmd = MemCmd::ReadReq; + + computeUnit->fetchStage.fetch(pkt, wavefront); + } else { + if (wavefront->dropFetch) { + assert(wavefront->instructionBuffer.empty()); + wavefront->dropFetch = false; + } + + wavefront->pendingFetch = 0; + } + + return true; +} + +/* + * The initial translation request could have been rejected, if + * <retries> queue is not empty. Retry sending the translation + * request. sendRetry() is called from the peer port whenever + * a translation completes. + */ +void +ComputeUnit::ITLBPort::recvReqRetry() +{ + + int len = retries.size(); + DPRINTF(GPUTLB, "CU%d: ITLB recvReqRetry - %d pending requests\n", + computeUnit->cu_id, len); + + assert(len > 0); + assert(isStalled()); + + // recvReqRetry is an indication that the resource on which this + // port was stalling on is freed. So, remove the stall first + unstallPort(); + + for (int i = 0; i < len; ++i) { + PacketPtr pkt = retries.front(); + Addr vaddr M5_VAR_USED = pkt->req->getVaddr(); + DPRINTF(GPUTLB, "CU%d: retrying I-translation for address %#x", + computeUnit->cu_id, vaddr); + + if (!sendTimingReq(pkt)) { + stallPort(); // Stall port + DPRINTF(GPUTLB, ": failed again\n"); + break; + } else { + DPRINTF(GPUTLB, ": successful\n"); + retries.pop_front(); + } + } +} + +void +ComputeUnit::regStats() +{ + tlbCycles + .name(name() + ".tlb_cycles") + .desc("total number of cycles for all uncoalesced requests") + ; + + tlbRequests + .name(name() + ".tlb_requests") + .desc("number of uncoalesced requests") + ; + + tlbLatency + .name(name() + ".avg_translation_latency") + .desc("Avg. translation latency for data translations") + ; + + tlbLatency = tlbCycles / tlbRequests; + + hitsPerTLBLevel + .init(4) + .name(name() + ".TLB_hits_distribution") + .desc("TLB hits distribution (0 for page table, x for Lx-TLB)") + ; + + // fixed number of TLB levels + for (int i = 0; i < 4; ++i) { + if (!i) + hitsPerTLBLevel.subname(i,"page_table"); + else + hitsPerTLBLevel.subname(i, csprintf("L%d_TLB",i)); + } + + execRateDist + .init(0, 10, 2) + .name(name() + ".inst_exec_rate") + .desc("Instruction Execution Rate: Number of executed vector " + "instructions per cycle") + ; + + ldsBankConflictDist + .init(0, VSZ, 2) + .name(name() + ".lds_bank_conflicts") + .desc("Number of bank conflicts per LDS memory packet") + ; + + ldsBankAccesses + .name(name() + ".lds_bank_access_cnt") + .desc("Total number of LDS bank accesses") + ; + + pageDivergenceDist + // A wavefront can touch 1 to VSZ pages per memory instruction. + // The number of pages per bin can be configured (here it's 4). + .init(1, VSZ, 4) + .name(name() + ".page_divergence_dist") + .desc("pages touched per wf (over all mem. instr.)") + ; + + controlFlowDivergenceDist + .init(1, VSZ, 4) + .name(name() + ".warp_execution_dist") + .desc("number of lanes active per instruction (over all instructions)") + ; + + activeLanesPerGMemInstrDist + .init(1, VSZ, 4) + .name(name() + ".gmem_lanes_execution_dist") + .desc("number of active lanes per global memory instruction") + ; + + activeLanesPerLMemInstrDist + .init(1, VSZ, 4) + .name(name() + ".lmem_lanes_execution_dist") + .desc("number of active lanes per local memory instruction") + ; + + numInstrExecuted + .name(name() + ".num_instr_executed") + .desc("number of instructions executed") + ; + + numVecOpsExecuted + .name(name() + ".num_vec_ops_executed") + .desc("number of vec ops executed (e.g. 
VSZ/inst)") + ; + + totalCycles + .name(name() + ".num_total_cycles") + .desc("number of cycles the CU ran for") + ; + + ipc + .name(name() + ".ipc") + .desc("Instructions per cycle (this CU only)") + ; + + vpc + .name(name() + ".vpc") + .desc("Vector Operations per cycle (this CU only)") + ; + + numALUInstsExecuted + .name(name() + ".num_alu_insts_executed") + .desc("Number of dynamic non-GM memory insts executed") + ; + + wgBlockedDueLdsAllocation + .name(name() + ".wg_blocked_due_lds_alloc") + .desc("Workgroup blocked due to LDS capacity") + ; + + ipc = numInstrExecuted / totalCycles; + vpc = numVecOpsExecuted / totalCycles; + + numTimesWgBlockedDueVgprAlloc + .name(name() + ".times_wg_blocked_due_vgpr_alloc") + .desc("Number of times WGs are blocked due to VGPR allocation per SIMD") + ; + + dynamicGMemInstrCnt + .name(name() + ".global_mem_instr_cnt") + .desc("dynamic global memory instructions count") + ; + + dynamicLMemInstrCnt + .name(name() + ".local_mem_instr_cnt") + .desc("dynamic local memory intruction count") + ; + + numALUInstsExecuted = numInstrExecuted - dynamicGMemInstrCnt - + dynamicLMemInstrCnt; + + completedWfs + .name(name() + ".num_completed_wfs") + .desc("number of completed wavefronts") + ; + + numCASOps + .name(name() + ".num_CAS_ops") + .desc("number of compare and swap operations") + ; + + numFailedCASOps + .name(name() + ".num_failed_CAS_ops") + .desc("number of compare and swap operations that failed") + ; + + // register stats of pipeline stages + fetchStage.regStats(); + scoreboardCheckStage.regStats(); + scheduleStage.regStats(); + execStage.regStats(); + + // register stats of memory pipeline + globalMemoryPipe.regStats(); + localMemoryPipe.regStats(); +} + +void +ComputeUnit::updatePageDivergenceDist(Addr addr) +{ + Addr virt_page_addr = roundDown(addr, TheISA::PageBytes); + + if (!pagesTouched.count(virt_page_addr)) + pagesTouched[virt_page_addr] = 1; + else + pagesTouched[virt_page_addr]++; +} + +void +ComputeUnit::CUExitCallback::process() +{ + if (computeUnit->countPages) { + std::ostream *page_stat_file = + simout.create(computeUnit->name().c_str()); + + *page_stat_file << "page, wavefront accesses, workitem accesses" << + std::endl; + + for (auto iter : computeUnit->pageAccesses) { + *page_stat_file << std::hex << iter.first << ","; + *page_stat_file << std::dec << iter.second.first << ","; + *page_stat_file << std::dec << iter.second.second << std::endl; + } + } + } + +bool +ComputeUnit::isDone() const +{ + for (int i = 0; i < numSIMDs; ++i) { + if (!isSimdDone(i)) { + return false; + } + } + + bool glbMemBusRdy = true; + for (int j = 0; j < numGlbMemUnits; ++j) { + glbMemBusRdy &= vrfToGlobalMemPipeBus[j].rdy(); + } + bool locMemBusRdy = true; + for (int j = 0; j < numLocMemUnits; ++j) { + locMemBusRdy &= vrfToLocalMemPipeBus[j].rdy(); + } + + if (!globalMemoryPipe.isGMLdRespFIFOWrRdy() || + !globalMemoryPipe.isGMStRespFIFOWrRdy() || + !globalMemoryPipe.isGMReqFIFOWrRdy() || !localMemoryPipe.isLMReqFIFOWrRdy() + || !localMemoryPipe.isLMRespFIFOWrRdy() || !locMemToVrfBus.rdy() || + !glbMemToVrfBus.rdy() || !locMemBusRdy || !glbMemBusRdy) { + return false; + } + + return true; +} + +int32_t +ComputeUnit::getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const +{ + return lds.getRefCounter(dispatchId, wgId); +} + +bool +ComputeUnit::isSimdDone(uint32_t simdId) const +{ + assert(simdId < numSIMDs); + + for (int i=0; i < numGlbMemUnits; ++i) { + if (!vrfToGlobalMemPipeBus[i].rdy()) + return false; + } + for (int i=0; i < numLocMemUnits; ++i) 
{ + if (!vrfToLocalMemPipeBus[i].rdy()) + return false; + } + if (!aluPipe[simdId].rdy()) { + return false; + } + + for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf){ + if (wfList[simdId][i_wf]->status != Wavefront::S_STOPPED) { + return false; + } + } + + return true; +} + +/** + * send a general request to the LDS + * make sure to look at the return value here as your request might be + * NACK'd and returning false means that you have to have some backup plan + */ +bool +ComputeUnit::sendToLds(GPUDynInstPtr gpuDynInst) +{ + // this is just a request to carry the GPUDynInstPtr + // back and forth + Request *newRequest = new Request(); + newRequest->setPaddr(0x0); + + // ReadReq is not evaluted by the LDS but the Packet ctor requires this + PacketPtr newPacket = new Packet(newRequest, MemCmd::ReadReq); + + // This is the SenderState needed upon return + newPacket->senderState = new LDSPort::SenderState(gpuDynInst); + + return ldsPort->sendTimingReq(newPacket); +} + +/** + * get the result of packets sent to the LDS when they return + */ +bool +ComputeUnit::LDSPort::recvTimingResp(PacketPtr packet) +{ + const ComputeUnit::LDSPort::SenderState *senderState = + dynamic_cast<ComputeUnit::LDSPort::SenderState *>(packet->senderState); + + fatal_if(!senderState, "did not get the right sort of sender state"); + + GPUDynInstPtr gpuDynInst = senderState->getMemInst(); + + delete packet->senderState; + delete packet->req; + delete packet; + + computeUnit->localMemoryPipe.getLMRespFIFO().push(gpuDynInst); + return true; +} + +/** + * attempt to send this packet, either the port is already stalled, the request + * is nack'd and must stall or the request goes through + * when a request cannot be sent, add it to the retries queue + */ +bool +ComputeUnit::LDSPort::sendTimingReq(PacketPtr pkt) +{ + ComputeUnit::LDSPort::SenderState *sender_state = + dynamic_cast<ComputeUnit::LDSPort::SenderState*>(pkt->senderState); + fatal_if(!sender_state, "packet without a valid sender state"); + + GPUDynInstPtr gpuDynInst M5_VAR_USED = sender_state->getMemInst(); + + if (isStalled()) { + fatal_if(retries.empty(), "must have retries waiting to be stalled"); + + retries.push(pkt); + + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: LDS send failed!\n", + computeUnit->cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId); + return false; + } else if (!MasterPort::sendTimingReq(pkt)) { + // need to stall the LDS port until a recvReqRetry() is received + // this indicates that there is more space + stallPort(); + retries.push(pkt); + + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req failed!\n", + computeUnit->cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId, pkt->req->getPaddr()); + return false; + } else { + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req sent!\n", + computeUnit->cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId, pkt->req->getPaddr()); + return true; + } +} + +/** + * the bus is telling the port that there is now space so retrying stalled + * requests should work now + * this allows the port to have a request be nack'd and then have the receiver + * say when there is space, rather than simply retrying the send every cycle + */ +void +ComputeUnit::LDSPort::recvReqRetry() +{ + auto queueSize = retries.size(); + + DPRINTF(GPUPort, "CU%d: LDSPort recvReqRetry - %d pending requests\n", + computeUnit->cu_id, queueSize); + + fatal_if(queueSize < 1, + "why was there a recvReqRetry() with no pending reqs?"); + fatal_if(!isStalled(), + "recvReqRetry() happened when the port was not stalled"); + + unstallPort(); + + while 
(!retries.empty()) { + PacketPtr packet = retries.front(); + + DPRINTF(GPUPort, "CU%d: retrying LDS send\n", computeUnit->cu_id); + + if (!MasterPort::sendTimingReq(packet)) { + // Stall port + stallPort(); + DPRINTF(GPUPort, ": LDS send failed again\n"); + break; + } else { + DPRINTF(GPUTLB, ": LDS send successful\n"); + retries.pop(); + } + } +} diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh new file mode 100644 index 000000000..f47c27a0a --- /dev/null +++ b/src/gpu-compute/compute_unit.hh @@ -0,0 +1,767 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: John Kalamatianos, Anthony Gutierrez + */ + +#ifndef __COMPUTE_UNIT_HH__ +#define __COMPUTE_UNIT_HH__ + +#include <deque> +#include <map> +#include <unordered_map> +#include <vector> + +#include "base/callback.hh" +#include "base/statistics.hh" +#include "base/types.hh" +#include "enums/PrefetchType.hh" +#include "gpu-compute/exec_stage.hh" +#include "gpu-compute/fetch_stage.hh" +#include "gpu-compute/global_memory_pipeline.hh" +#include "gpu-compute/local_memory_pipeline.hh" +#include "gpu-compute/qstruct.hh" +#include "gpu-compute/schedule_stage.hh" +#include "gpu-compute/scoreboard_check_stage.hh" +#include "mem/mem_object.hh" +#include "mem/port.hh" + +static const int MAX_REGS_FOR_NON_VEC_MEM_INST = 1; +static const int MAX_WIDTH_FOR_MEM_INST = 32; + +class NDRange; +class Shader; +class VectorRegisterFile; + +struct ComputeUnitParams; + +enum EXEC_POLICY +{ + OLDEST = 0, + RR +}; + +// List of execution units +enum EXEC_UNIT +{ + SIMD0 = 0, + SIMD1, + SIMD2, + SIMD3, + GLBMEM_PIPE, + LDSMEM_PIPE, + NUM_UNITS +}; + +enum TLB_CACHE +{ + TLB_MISS_CACHE_MISS = 0, + TLB_MISS_CACHE_HIT, + TLB_HIT_CACHE_MISS, + TLB_HIT_CACHE_HIT +}; + +class ComputeUnit : public MemObject +{ + public: + FetchStage fetchStage; + ScoreboardCheckStage scoreboardCheckStage; + ScheduleStage scheduleStage; + ExecStage execStage; + GlobalMemPipeline globalMemoryPipe; + LocalMemPipeline localMemoryPipe; + + // Buffers used to communicate between various pipeline stages + + // List of waves which are ready to be scheduled. + // Each execution resource has a ready list. readyList is + // used to communicate between scoreboardCheck stage and + // schedule stage + // TODO: make enum to index readyList + std::vector<std::vector<Wavefront*>> readyList; + + // Stores the status of waves. A READY implies the + // wave is ready to be scheduled this cycle and + // is already present in the readyList. waveStatusList is + // used to communicate between scoreboardCheck stage and + // schedule stage + // TODO: convert std::pair to a class to increase readability + std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>> waveStatusList; + + // List of waves which will be dispatched to + // each execution resource. A FILLED implies + // dispatch list is non-empty and + // execution unit has something to execute + // this cycle. Currently, the dispatch list of + // an execution resource can hold only one wave because + // an execution resource can execute only one wave in a cycle. 
+ // dispatchList is used to communicate between schedule + // and exec stage + // TODO: convert std::pair to a class to increase readability + std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> dispatchList; + + int rrNextMemID; // used by RR WF exec policy to cycle through WF's + int rrNextALUWp; + typedef ComputeUnitParams Params; + std::vector<std::vector<Wavefront*>> wfList; + int cu_id; + + // array of vector register files, one per SIMD + std::vector<VectorRegisterFile*> vrf; + // Number of vector ALU units (SIMDs) in CU + int numSIMDs; + // number of pipe stages for bypassing data to next dependent single + // precision vector instruction inside the vector ALU pipeline + int spBypassPipeLength; + // number of pipe stages for bypassing data to next dependent double + // precision vector instruction inside the vector ALU pipeline + int dpBypassPipeLength; + // number of cycles per issue period + int issuePeriod; + + // Number of global and local memory execution resources in CU + int numGlbMemUnits; + int numLocMemUnits; + // tracks the last cycle a vector instruction was executed on a SIMD + std::vector<uint64_t> lastExecCycle; + + // true if we allow a separate TLB per lane + bool perLaneTLB; + // if 0, TLB prefetching is off. + int prefetchDepth; + // if fixed-stride prefetching, this is the stride. + int prefetchStride; + + class LastVaddrWave + { + public: + Addr vaddrs[VSZ]; + Addr& operator[](int idx) { + return vaddrs[idx]; + } + + LastVaddrWave() { + for (int i = 0; i < VSZ; ++i) + vaddrs[i] = 0; + } + }; + + LastVaddrWave lastVaddrCU; + std::vector<LastVaddrWave> lastVaddrPhase; + std::vector<std::vector<std::vector<Addr>>> lastVaddrWF; + Enums::PrefetchType prefetchType; + EXEC_POLICY exec_policy; + + bool xact_cas_mode; + bool debugSegFault; + bool functionalTLB; + bool localMemBarrier; + + /* + * for Counting page accesses + * + * cuExitCallback inherits from Callback. When you register a callback + * function as an exit callback, it will get added to an exit callback + * queue, such that on simulation exit, all callbacks in the callback + * queue will have their process() function called. 
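+     * For this CU, CUExitCallback::process() dumps the per-page access +     * counts accumulated in pageAccesses when countPages is enabled.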
+ */ + bool countPages; + + Shader *shader; + uint32_t barrier_id; + // vector of Vector ALU (MACC) pipelines + std::vector<WaitClass> aluPipe; + // minimum issue period per SIMD unit (in cycles) + std::vector<WaitClass> wfWait; + + // Resource control for Vector Register File->Global Memory pipe buses + std::vector<WaitClass> vrfToGlobalMemPipeBus; + // Resource control for Vector Register File->Local Memory pipe buses + std::vector<WaitClass> vrfToLocalMemPipeBus; + int nextGlbMemBus; + int nextLocMemBus; + // Resource control for global memory to VRF data/address bus + WaitClass glbMemToVrfBus; + // Resource control for local memory to VRF data/address bus + WaitClass locMemToVrfBus; + + uint32_t vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes + uint32_t coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes + uint32_t numCyclesPerStoreTransfer; // number of cycles per vector store + uint32_t numCyclesPerLoadTransfer; // number of cycles per vector load + + Tick req_tick_latency; + Tick resp_tick_latency; + + // number of vector registers being reserved for each SIMD unit + std::vector<int> vectorRegsReserved; + // number of vector registers per SIMD unit + uint32_t numVecRegsPerSimd; + // Support for scheduling VGPR status update events + std::vector<std::pair<uint32_t, uint32_t> > regIdxVec; + std::vector<uint64_t> timestampVec; + std::vector<uint8_t> statusVec; + + void + registerEvent(uint32_t simdId, + uint32_t regIdx, + uint32_t operandSize, + uint64_t when, + uint8_t newStatus) { + regIdxVec.push_back(std::make_pair(simdId, regIdx)); + timestampVec.push_back(when); + statusVec.push_back(newStatus); + if (operandSize > 4) { + regIdxVec.push_back(std::make_pair(simdId, + ((regIdx + 1) % + numVecRegsPerSimd))); + timestampVec.push_back(when); + statusVec.push_back(newStatus); + } + } + + void updateEvents(); + + // this hash map will keep track of page divergence + // per memory instruction per wavefront. The hash map + // is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc. 
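+    // maps a virtual page address (rounded down to the page boundary) to the +    // number of times that page was touched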
+ std::map<Addr, int> pagesTouched; + + ComputeUnit(const Params *p); + ~ComputeUnit(); + int spBypassLength() { return spBypassPipeLength; }; + int dpBypassLength() { return dpBypassPipeLength; }; + int storeBusLength() { return numCyclesPerStoreTransfer; }; + int loadBusLength() { return numCyclesPerLoadTransfer; }; + int wfSize() const { return wavefrontSize; }; + + void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs); + void exec(); + void initiateFetch(Wavefront *wavefront); + void fetch(PacketPtr pkt, Wavefront *wavefront); + void FillKernelState(Wavefront *w, NDRange *ndr); + + void StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[], + int trueWgSizeTotal); + + void InitializeWFContext(WFContext *wfCtx, NDRange *ndr, int cnt, + int trueWgSize[], int trueWgSizeTotal, + LdsChunk *ldsChunk, uint64_t origSpillMemStart); + + void StartWorkgroup(NDRange *ndr); + int ReadyWorkgroup(NDRange *ndr); + + bool isVecAlu(int unitId) { return unitId >= SIMD0 && unitId <= SIMD3; } + bool isGlbMem(int unitId) { return unitId == GLBMEM_PIPE; } + bool isShrMem(int unitId) { return unitId == LDSMEM_PIPE; } + int GlbMemUnitId() { return GLBMEM_PIPE; } + int ShrMemUnitId() { return LDSMEM_PIPE; } + int nextGlbRdBus() { return (++nextGlbMemBus) % numGlbMemUnits; } + int nextLocRdBus() { return (++nextLocMemBus) % numLocMemUnits; } + /* This function cycles through all the wavefronts in all the phases to see + * if all of the wavefronts which should be associated with one barrier + * (denoted with _barrier_id), are all at the same barrier in the program + * (denoted by bcnt). When the number at the barrier matches bslots, then + * return true. + */ + int AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots); + bool cedeSIMD(int simdId, int wfSlotId); + + template<typename c0, typename c1> void doSmReturn(GPUDynInstPtr gpuDynInst); + virtual void init(); + void sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt); + void sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt); + void injectGlobalMemFence(GPUDynInstPtr gpuDynInst, + bool kernelLaunch=true, + RequestPtr req=nullptr); + void handleMemPacket(PacketPtr pkt, int memport_index); + bool processTimingPacket(PacketPtr pkt); + void processFetchReturn(PacketPtr pkt); + void updatePageDivergenceDist(Addr addr); + + MasterID masterId() { return _masterId; } + + bool isDone() const; + bool isSimdDone(uint32_t) const; + + protected: + MasterID _masterId; + + LdsState &lds; + + public: + // the following stats compute the avg. TLB accesslatency per + // uncoalesced request (only for data) + Stats::Scalar tlbRequests; + Stats::Scalar tlbCycles; + Stats::Formula tlbLatency; + // hitsPerTLBLevel[x] are the hits in Level x TLB. x = 0 is the page table. + Stats::Vector hitsPerTLBLevel; + + Stats::Scalar ldsBankAccesses; + Stats::Distribution ldsBankConflictDist; + + // over all memory instructions executed over all wavefronts + // how many touched 0-4 pages, 4-8, ..., 60-64 pages + Stats::Distribution pageDivergenceDist; + Stats::Scalar dynamicGMemInstrCnt; + Stats::Scalar dynamicLMemInstrCnt; + + Stats::Scalar wgBlockedDueLdsAllocation; + // Number of instructions executed, i.e. 
if 64 (or 32 or 7) lanes are active + // when the instruction is committed, this number is still incremented by 1 + Stats::Scalar numInstrExecuted; + // Number of cycles among successive instruction executions across all + // wavefronts of the same CU + Stats::Distribution execRateDist; + // number of individual vector operations executed + Stats::Scalar numVecOpsExecuted; + // Total cycles that something is running on the GPU + Stats::Scalar totalCycles; + Stats::Formula vpc; // vector ops per cycle + Stats::Formula ipc; // vector instructions per cycle + Stats::Distribution controlFlowDivergenceDist; + Stats::Distribution activeLanesPerGMemInstrDist; + Stats::Distribution activeLanesPerLMemInstrDist; + // number of vector ALU instructions received + Stats::Formula numALUInstsExecuted; + // number of times a WG can not start due to lack of free VGPRs in SIMDs + Stats::Scalar numTimesWgBlockedDueVgprAlloc; + Stats::Scalar numCASOps; + Stats::Scalar numFailedCASOps; + Stats::Scalar completedWfs; + // flag per vector SIMD unit that is set when there is at least one + // WV that has a vector ALU instruction as the oldest in its + // Instruction Buffer: Defined in the Scoreboard stage, consumed + // by the Execute stage. + std::vector<bool> vectorAluInstAvail; + // number of available (oldest) LDS instructions that could have + // been issued to the LDS at a specific issue slot + int shrMemInstAvail; + // number of available Global memory instructions that could have + // been issued to TCP at a specific issue slot + int glbMemInstAvail; + + void + regStats(); + + LdsState & + getLds() const + { + return lds; + } + + int32_t + getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const; + + bool + sendToLds(GPUDynInstPtr gpuDynInst) __attribute__((warn_unused_result)); + + typedef std::unordered_map<Addr, std::pair<int, int>> pageDataStruct; + pageDataStruct pageAccesses; + + class CUExitCallback : public Callback + { + private: + ComputeUnit *computeUnit; + + public: + virtual ~CUExitCallback() { } + + CUExitCallback(ComputeUnit *_cu) + { + computeUnit = _cu; + } + + virtual void + process(); + }; + + CUExitCallback *cuExitCallback; + + /** Data access Port **/ + class DataPort : public MasterPort + { + public: + DataPort(const std::string &_name, ComputeUnit *_cu, PortID _index) + : MasterPort(_name, _cu), computeUnit(_cu), + index(_index) { } + + bool snoopRangeSent; + + struct SenderState : public Packet::SenderState + { + GPUDynInstPtr _gpuDynInst; + int port_index; + Packet::SenderState *saved; + + SenderState(GPUDynInstPtr gpuDynInst, PortID _port_index, + Packet::SenderState *sender_state=nullptr) + : _gpuDynInst(gpuDynInst), + port_index(_port_index), + saved(sender_state) { } + }; + + class MemReqEvent : public Event + { + private: + DataPort *dataPort; + PacketPtr pkt; + + public: + MemReqEvent(DataPort *_data_port, PacketPtr _pkt) + : Event(), dataPort(_data_port), pkt(_pkt) + { + setFlags(Event::AutoDelete); + } + + void process(); + const char *description() const; + }; + + class MemRespEvent : public Event + { + private: + DataPort *dataPort; + PacketPtr pkt; + + public: + MemRespEvent(DataPort *_data_port, PacketPtr _pkt) + : Event(), dataPort(_data_port), pkt(_pkt) + { + setFlags(Event::AutoDelete); + } + + void process(); + const char *description() const; + }; + + std::deque<std::pair<PacketPtr, GPUDynInstPtr>> retries; + + protected: + ComputeUnit *computeUnit; + int index; + + virtual bool recvTimingResp(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt) { 
return 0; } + virtual void recvFunctional(PacketPtr pkt) { } + virtual void recvRangeChange() { } + virtual void recvReqRetry(); + + virtual void + getDeviceAddressRanges(AddrRangeList &resp, bool &snoop) + { + resp.clear(); + snoop = true; + } + + }; + + // Instruction cache access port + class SQCPort : public MasterPort + { + public: + SQCPort(const std::string &_name, ComputeUnit *_cu, PortID _index) + : MasterPort(_name, _cu), computeUnit(_cu), + index(_index) { } + + bool snoopRangeSent; + + struct SenderState : public Packet::SenderState + { + Wavefront *wavefront; + Packet::SenderState *saved; + + SenderState(Wavefront *_wavefront, Packet::SenderState + *sender_state=nullptr) + : wavefront(_wavefront), saved(sender_state) { } + }; + + std::deque<std::pair<PacketPtr, Wavefront*>> retries; + + protected: + ComputeUnit *computeUnit; + int index; + + virtual bool recvTimingResp(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt) { return 0; } + virtual void recvFunctional(PacketPtr pkt) { } + virtual void recvRangeChange() { } + virtual void recvReqRetry(); + + virtual void + getDeviceAddressRanges(AddrRangeList &resp, bool &snoop) + { + resp.clear(); + snoop = true; + } + }; + + /** Data TLB port **/ + class DTLBPort : public MasterPort + { + public: + DTLBPort(const std::string &_name, ComputeUnit *_cu, PortID _index) + : MasterPort(_name, _cu), computeUnit(_cu), + index(_index), stalled(false) + { } + + bool isStalled() { return stalled; } + void stallPort() { stalled = true; } + void unstallPort() { stalled = false; } + + /** + * here we queue all the translation requests that were + * not successfully sent. + */ + std::deque<PacketPtr> retries; + + /** SenderState is information carried along with the packet + * throughout the TLB hierarchy + */ + struct SenderState: public Packet::SenderState + { + // the memInst that this is associated with + GPUDynInstPtr _gpuDynInst; + + // the lane in the memInst this is associated with, so we send + // the memory request down the right port + int portIndex; + + // constructor used for packets involved in timing accesses + SenderState(GPUDynInstPtr gpuDynInst, PortID port_index) + : _gpuDynInst(gpuDynInst), portIndex(port_index) { } + + }; + + protected: + ComputeUnit *computeUnit; + int index; + bool stalled; + + virtual bool recvTimingResp(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt) { return 0; } + virtual void recvFunctional(PacketPtr pkt) { } + virtual void recvRangeChange() { } + virtual void recvReqRetry(); + }; + + class ITLBPort : public MasterPort + { + public: + ITLBPort(const std::string &_name, ComputeUnit *_cu) + : MasterPort(_name, _cu), computeUnit(_cu), stalled(false) { } + + + bool isStalled() { return stalled; } + void stallPort() { stalled = true; } + void unstallPort() { stalled = false; } + + /** + * here we queue all the translation requests that were + * not successfully sent. 
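+         *
+         * For illustration, a sender is expected to follow the usual
+         * stall/retry protocol (this mirrors the send path in fetch_unit.cc
+         * later in this patch):
+         *
+         *   if (port->isStalled()) {
+         *       port->retries.push_back(pkt);   // already waiting on a retry
+         *   } else if (!port->sendTimingReq(pkt)) {
+         *       port->stallPort();              // TLB ran out of resources
+         *       port->retries.push_back(pkt);
+         *   }
+         *
+         * recvReqRetry() is then expected to unstall the port and replay the
+         * queued packets in order.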
+ */ + std::deque<PacketPtr> retries; + + /** SenderState is information carried along with the packet + * throughout the TLB hierarchy + */ + struct SenderState: public Packet::SenderState + { + // The wavefront associated with this request + Wavefront *wavefront; + + SenderState(Wavefront *_wavefront) : wavefront(_wavefront) { } + }; + + protected: + ComputeUnit *computeUnit; + bool stalled; + + virtual bool recvTimingResp(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt) { return 0; } + virtual void recvFunctional(PacketPtr pkt) { } + virtual void recvRangeChange() { } + virtual void recvReqRetry(); + }; + + /** + * the port intended to communicate between the CU and its LDS + */ + class LDSPort : public MasterPort + { + public: + LDSPort(const std::string &_name, ComputeUnit *_cu, PortID _id) + : MasterPort(_name, _cu, _id), computeUnit(_cu) + { + } + + bool isStalled() const { return stalled; } + void stallPort() { stalled = true; } + void unstallPort() { stalled = false; } + + /** + * here we queue all the requests that were + * not successfully sent. + */ + std::queue<PacketPtr> retries; + + /** + * SenderState is information carried along with the packet, esp. the + * GPUDynInstPtr + */ + class SenderState: public Packet::SenderState + { + protected: + // The actual read/write/atomic request that goes with this command + GPUDynInstPtr _gpuDynInst = nullptr; + + public: + SenderState(GPUDynInstPtr gpuDynInst): + _gpuDynInst(gpuDynInst) + { + } + + GPUDynInstPtr + getMemInst() const + { + return _gpuDynInst; + } + }; + + virtual bool + sendTimingReq(PacketPtr pkt); + + protected: + + bool stalled = false; ///< whether or not it is stalled + + ComputeUnit *computeUnit; + + virtual bool + recvTimingResp(PacketPtr pkt); + + virtual Tick + recvAtomic(PacketPtr pkt) { return 0; } + + virtual void + recvFunctional(PacketPtr pkt) + { + } + + virtual void + recvRangeChange() + { + } + + virtual void + recvReqRetry(); + }; + + /** The port to access the Local Data Store + * Can be connected to a LDS object + */ + LDSPort *ldsPort = nullptr; + + LDSPort * + getLdsPort() const + { + return ldsPort; + } + + /** The memory port for SIMD data accesses. + * Can be connected to PhysMem for Ruby for timing simulations + */ + std::vector<DataPort*> memPort; + // port to the TLB hierarchy (i.e., the L1 TLB) + std::vector<DTLBPort*> tlbPort; + // port to the SQC (i.e. 
the I-cache) + SQCPort *sqcPort; + // port to the SQC TLB (there's a separate TLB for each I-cache) + ITLBPort *sqcTLBPort; + + virtual BaseMasterPort& + getMasterPort(const std::string &if_name, PortID idx) + { + if (if_name == "memory_port") { + memPort[idx] = new DataPort(csprintf("%s-port%d", name(), idx), + this, idx); + return *memPort[idx]; + } else if (if_name == "translation_port") { + tlbPort[idx] = new DTLBPort(csprintf("%s-port%d", name(), idx), + this, idx); + return *tlbPort[idx]; + } else if (if_name == "sqc_port") { + sqcPort = new SQCPort(csprintf("%s-port%d", name(), idx), + this, idx); + return *sqcPort; + } else if (if_name == "sqc_tlb_port") { + sqcTLBPort = new ITLBPort(csprintf("%s-port", name()), this); + return *sqcTLBPort; + } else if (if_name == "ldsPort") { + if (ldsPort) { + fatal("an LDS port was already allocated"); + } + ldsPort = new LDSPort(csprintf("%s-port", name()), this, idx); + return *ldsPort; + } else { + panic("incorrect port name"); + } + } + + // xact_cas_load() + class waveIdentifier + { + public: + waveIdentifier() { } + waveIdentifier(int _simdId, int _wfSlotId) + : simdId(_simdId), wfSlotId(_wfSlotId) { } + + int simdId; + int wfSlotId; + }; + + class waveQueue + { + public: + std::list<waveIdentifier> waveIDQueue; + }; + std::map<unsigned, waveQueue> xactCasLoadMap; + + uint64_t getAndIncSeqNum() { return globalSeqNum++; } + + private: + uint64_t globalSeqNum; + int wavefrontSize; +}; + +#endif // __COMPUTE_UNIT_HH__ diff --git a/src/gpu-compute/condition_register_state.cc b/src/gpu-compute/condition_register_state.cc new file mode 100644 index 000000000..f3f2d2927 --- /dev/null +++ b/src/gpu-compute/condition_register_state.cc @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: John Kalamatianos + */ + +#include "gpu-compute/condition_register_state.hh" + +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_static_inst.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/wavefront.hh" + +ConditionRegisterState::ConditionRegisterState() +{ + computeUnit = nullptr; + c_reg.clear(); + busy.clear(); +} + +void +ConditionRegisterState::setParent(ComputeUnit *_computeUnit) +{ + computeUnit = _computeUnit; + _name = computeUnit->name() + ".CondRegState"; +} + +void +ConditionRegisterState::init(uint32_t _size) +{ + c_reg.resize(_size); + busy.resize(_size, 0); +} + +void +ConditionRegisterState::exec(GPUStaticInst *ii, Wavefront *w) +{ + // iterate over all operands + for (auto i = 0; i < ii->getNumOperands(); ++i) { + // is this a condition register destination operand? + if (ii->isCondRegister(i) && ii->isDstOperand(i)) { + // mark the register as busy + markReg(ii->getRegisterIndex(i), 1); + uint32_t pipeLen = w->computeUnit->spBypassLength(); + + // schedule an event for marking the register as ready + w->computeUnit-> + registerEvent(w->simdId, ii->getRegisterIndex(i), + ii->getOperandSize(i), + w->computeUnit->shader->tick_cnt + + w->computeUnit->shader->ticks(pipeLen), 0); + } + } +} diff --git a/src/gpu-compute/condition_register_state.hh b/src/gpu-compute/condition_register_state.hh new file mode 100644 index 000000000..139874a66 --- /dev/null +++ b/src/gpu-compute/condition_register_state.hh @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: John Kalamatianos + */ + +#ifndef __CONDITION_REGISTER_STATE_HH__ +#define __CONDITION_REGISTER_STATE_HH__ + +#include <string> +#include <vector> + +#include "gpu-compute/misc.hh" + +class ComputeUnit; +class GPUStaticInst; +class Shader; +class Wavefront; + +// Condition Register State (used only when executing HSAIL) +class ConditionRegisterState +{ + public: + ConditionRegisterState(); + void init(uint32_t _size); + const std::string name() const { return _name; } + void setParent(ComputeUnit *_computeUnit); + void regStats() { } + + template<typename T> + T + read(int regIdx, int threadId) + { + bool tmp = c_reg[regIdx][threadId]; + T *p0 = (T*)(&tmp); + + return *p0; + } + + template<typename T> + void + write(int regIdx, int threadId, T value) + { + c_reg[regIdx][threadId] = (bool)(value & 0x01); + } + + void + markReg(int regIdx, uint8_t value) + { + busy.at(regIdx) = value; + } + + uint8_t + regBusy(int idx) + { + uint8_t status = busy.at(idx); + return status; + } + + int numRegs() { return c_reg.size(); } + void exec(GPUStaticInst *ii, Wavefront *w); + + private: + ComputeUnit* computeUnit; + std::string _name; + // Condition Register state + std::vector<VectorMask> c_reg; + // flag indicating if a register is busy + std::vector<uint8_t> busy; +}; + +#endif diff --git a/src/gpu-compute/dispatcher.cc b/src/gpu-compute/dispatcher.cc new file mode 100644 index 000000000..55e4be72a --- /dev/null +++ b/src/gpu-compute/dispatcher.cc @@ -0,0 +1,394 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Brad Beckmann, Marc Orr + */ + + +#include "gpu-compute/dispatcher.hh" + +#include "cpu/base.hh" +#include "debug/GPUDisp.hh" +#include "gpu-compute/cl_driver.hh" +#include "gpu-compute/cl_event.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/wavefront.hh" +#include "mem/packet_access.hh" + +GpuDispatcher *GpuDispatcher::instance = nullptr; + +GpuDispatcher::GpuDispatcher(const Params *p) + : DmaDevice(p), _masterId(p->system->getMasterId(name() + ".disp")), + pioAddr(p->pio_addr), pioSize(4096), pioDelay(p->pio_latency), + dispatchCount(0), dispatchActive(false), cpu(p->cpu), + shader(p->shader_pointer), driver(p->cl_driver), tickEvent(this) +{ + shader->handshake(this); + driver->handshake(this); + + ndRange.wg_disp_rem = false; + ndRange.globalWgId = 0; + + schedule(&tickEvent, 0); + + // translation port for the dispatcher + tlbPort = new TLBPort(csprintf("%s-port%d", name()), this); + + num_kernelLaunched + .name(name() + ".num_kernel_launched") + .desc("number of kernel launched") + ; +} + +GpuDispatcher *GpuDispatcherParams::create() +{ + GpuDispatcher *dispatcher = new GpuDispatcher(this); + GpuDispatcher::setInstance(dispatcher); + + return GpuDispatcher::getInstance(); +} + +void +GpuDispatcher::serialize(CheckpointOut &cp) const +{ + Tick event_tick = 0; + + if (ndRange.wg_disp_rem) + fatal("Checkpointing not supported during active workgroup execution"); + + if (tickEvent.scheduled()) + event_tick = tickEvent.when(); + + SERIALIZE_SCALAR(event_tick); + +} + +void +GpuDispatcher::unserialize(CheckpointIn &cp) +{ + Tick event_tick; + + if (tickEvent.scheduled()) + deschedule(&tickEvent); + + UNSERIALIZE_SCALAR(event_tick); + + if (event_tick) + schedule(&tickEvent, event_tick); +} + +AddrRangeList +GpuDispatcher::getAddrRanges() const +{ + AddrRangeList ranges; + + DPRINTF(GPUDisp, "dispatcher registering addr range at %#x size %#x\n", + pioAddr, pioSize); + + ranges.push_back(RangeSize(pioAddr, pioSize)); + + return ranges; +} + +Tick +GpuDispatcher::read(PacketPtr pkt) +{ + assert(pkt->getAddr() >= pioAddr); + assert(pkt->getAddr() < pioAddr + pioSize); + + int offset = pkt->getAddr() - pioAddr; + pkt->allocate(); + + DPRINTF(GPUDisp, " read register %#x size=%d\n", offset, pkt->getSize()); + + if (offset < 8) { + assert(!offset); + assert(pkt->getSize() == 8); + + uint64_t retval = dispatchActive; + pkt->set(retval); + } else { + offset -= 8; + assert(offset + pkt->getSize() < sizeof(HsaQueueEntry)); + char *curTaskPtr = (char*)&curTask; + + memcpy(pkt->getPtr<const void*>(), curTaskPtr + offset, pkt->getSize()); + } + + pkt->makeAtomicResponse(); + + return pioDelay; +} + +Tick +GpuDispatcher::write(PacketPtr pkt) +{ + assert(pkt->getAddr() >= pioAddr); + assert(pkt->getAddr() < pioAddr + pioSize); + + int offset = pkt->getAddr() - pioAddr; + +#if TRACING_ON + uint64_t data_val = 0; + + switch (pkt->getSize()) { + case 1: + data_val = pkt->get<uint8_t>(); + break; + case 2: + data_val = pkt->get<uint16_t>(); + break; + case 4: + data_val = pkt->get<uint32_t>(); + break; + case 8: + data_val = pkt->get<uint64_t>(); + break; + default: + DPRINTF(GPUDisp, "bad size %d\n", pkt->getSize()); + } + + DPRINTF(GPUDisp, "write register %#x value %#x size=%d\n", offset, data_val, + pkt->getSize()); +#endif + if (!offset) { + static int nextId = 0; + + // The depends field of the qstruct, which was previously unused, is + // used to communicate with simulated application. 
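+        // To illustrate the register protocol implemented by read() and
+        // write() (sketch only; disp_base is a hypothetical pointer to the
+        // mmap'd PIO window that starts at pioAddr):
+        //
+        //   // fill in the HsaQueueEntry behind the 8-byte launch register
+        //   memcpy((char *)disp_base + 8, &task, sizeof(HsaQueueEntry));
+        //   // any write to offset 0 launches the kernel described above
+        //   *(volatile uint64_t *)disp_base = 1;
+        //   // a read at offset 0 returns dispatchActive
+        //   uint64_t busy = *(volatile uint64_t *)disp_base;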
+ if (curTask.depends) { + HostState hs; + shader->ReadMem((uint64_t)(curTask.depends), &hs, + sizeof(HostState), 0); + + // update event start time (in nano-seconds) + uint64_t start = curTick() / 1000; + + shader->WriteMem((uint64_t)(&((_cl_event*)hs.event)->start), + &start, sizeof(uint64_t), 0); + } + + // launch kernel + ++num_kernelLaunched; + + NDRange *ndr = &(ndRangeMap[nextId]); + // copy dispatch info + ndr->q = curTask; + + // update the numDispTask polled by the runtime + accessUserVar(cpu, (uint64_t)(curTask.numDispLeft), 0, 1); + + ndr->numWgTotal = 1; + + for (int i = 0; i < 3; ++i) { + ndr->wgId[i] = 0; + ndr->numWg[i] = divCeil(curTask.gdSize[i], curTask.wgSize[i]); + ndr->numWgTotal *= ndr->numWg[i]; + } + + ndr->numWgCompleted = 0; + ndr->globalWgId = 0; + ndr->wg_disp_rem = true; + ndr->execDone = false; + ndr->addrToNotify = (volatile bool*)curTask.addrToNotify; + ndr->numDispLeft = (volatile uint32_t*)curTask.numDispLeft; + ndr->dispatchId = nextId; + ndr->curTid = pkt->req->threadId(); + DPRINTF(GPUDisp, "launching kernel %d\n",nextId); + execIds.push(nextId); + ++nextId; + + dispatchActive = true; + + if (!tickEvent.scheduled()) { + schedule(&tickEvent, curTick() + shader->ticks(1)); + } + } else { + // populate current task struct + // first 64 bits are launch reg + offset -= 8; + assert(offset < sizeof(HsaQueueEntry)); + char *curTaskPtr = (char*)&curTask; + memcpy(curTaskPtr + offset, pkt->getPtr<const void*>(), pkt->getSize()); + } + + pkt->makeAtomicResponse(); + + return pioDelay; +} + + +BaseMasterPort& +GpuDispatcher::getMasterPort(const std::string &if_name, PortID idx) +{ + if (if_name == "translation_port") { + return *tlbPort; + } + + return DmaDevice::getMasterPort(if_name, idx); +} + +void +GpuDispatcher::exec() +{ + int fail_count = 0; + + // There are potentially multiple outstanding kernel launches. + // It is possible that the workgroups in a different kernel + // can fit on the GPU even if another kernel's workgroups cannot + DPRINTF(GPUDisp, "Launching %d Kernels\n", execIds.size()); + + while (execIds.size() > fail_count) { + int execId = execIds.front(); + + while (ndRangeMap[execId].wg_disp_rem) { + //update the thread context + shader->updateThreadContext(ndRangeMap[execId].curTid); + + // attempt to dispatch_workgroup + if (!shader->dispatch_workgroups(&ndRangeMap[execId])) { + // if we failed try the next kernel, + // it may have smaller workgroups. 
+ // put it on the queue to rety latter + DPRINTF(GPUDisp, "kernel %d failed to launch\n", execId); + execIds.push(execId); + ++fail_count; + break; + } + } + // let's try the next kernel_id + execIds.pop(); + } + + DPRINTF(GPUDisp, "Returning %d Kernels\n", doneIds.size()); + + if (doneIds.size() && cpu) { + shader->hostWakeUp(cpu); + } + + while (doneIds.size()) { + // wakeup the CPU if any Kernels completed this cycle + DPRINTF(GPUDisp, "WorkGroup %d completed\n", doneIds.front()); + doneIds.pop(); + } +} + +void +GpuDispatcher::notifyWgCompl(Wavefront *w) +{ + int kern_id = w->kern_id; + DPRINTF(GPUDisp, "notify WgCompl %d\n",kern_id); + assert(ndRangeMap[kern_id].dispatchId == kern_id); + ndRangeMap[kern_id].numWgCompleted++; + + if (ndRangeMap[kern_id].numWgCompleted == ndRangeMap[kern_id].numWgTotal) { + ndRangeMap[kern_id].execDone = true; + doneIds.push(kern_id); + + if (ndRangeMap[kern_id].addrToNotify) { + accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].addrToNotify), 1, + 0); + } + + accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].numDispLeft), 0, -1); + + // update event end time (in nano-seconds) + if (ndRangeMap[kern_id].q.depends) { + HostState *host_state = (HostState*)ndRangeMap[kern_id].q.depends; + uint64_t event; + shader->ReadMem((uint64_t)(&host_state->event), &event, + sizeof(uint64_t), 0); + + uint64_t end = curTick() / 1000; + + shader->WriteMem((uint64_t)(&((_cl_event*)event)->end), &end, + sizeof(uint64_t), 0); + } + } + + if (!tickEvent.scheduled()) { + schedule(&tickEvent, curTick() + shader->ticks(1)); + } +} + +void +GpuDispatcher::scheduleDispatch() +{ + if (!tickEvent.scheduled()) + schedule(&tickEvent, curTick() + shader->ticks(1)); +} + +void +GpuDispatcher::accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off) +{ + if (cpu) { + if (off) { + shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::ReadReq, + true); + val += off; + } + + shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::WriteReq, true); + } else { + panic("Cannot find host"); + } +} + +GpuDispatcher::TickEvent::TickEvent(GpuDispatcher *_dispatcher) + : Event(CPU_Tick_Pri), dispatcher(_dispatcher) +{ +} + +void +GpuDispatcher::TickEvent::process() +{ + dispatcher->exec(); +} + +const char* +GpuDispatcher::TickEvent::description() const +{ + return "GPU Dispatcher tick"; +} + +// helper functions for driver to retrieve GPU attributes +int +GpuDispatcher::getNumCUs() +{ + return shader->cuList.size(); +} + +void +GpuDispatcher::setFuncargsSize(int funcargs_size) +{ + shader->funcargs_size = funcargs_size; +} diff --git a/src/gpu-compute/dispatcher.hh b/src/gpu-compute/dispatcher.hh new file mode 100644 index 000000000..76f932655 --- /dev/null +++ b/src/gpu-compute/dispatcher.hh @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Brad Beckmann, Marc Orr + */ + +#ifndef __GPU_DISPATCHER_HH__ +#define __GPU_DISPATCHER_HH__ + +#include <queue> +#include <vector> + +#include "base/statistics.hh" +#include "dev/dma_device.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/ndrange.hh" +#include "gpu-compute/qstruct.hh" +#include "mem/port.hh" +#include "params/GpuDispatcher.hh" + +class BaseCPU; +class Shader; + +class GpuDispatcher : public DmaDevice +{ + public: + typedef GpuDispatcherParams Params; + + class TickEvent : public Event + { + private: + GpuDispatcher *dispatcher; + + public: + TickEvent(GpuDispatcher *); + void process(); + const char *description() const; + }; + + MasterID masterId() { return _masterId; } + + protected: + MasterID _masterId; + + // Base and length of PIO register space + Addr pioAddr; + Addr pioSize; + Tick pioDelay; + + HsaQueueEntry curTask; + + std::unordered_map<int, NDRange> ndRangeMap; + NDRange ndRange; + + // list of kernel_ids to launch + std::queue<int> execIds; + // list of kernel_ids that have finished + std::queue<int> doneIds; + + uint64_t dispatchCount; + // is there a kernel in execution? + bool dispatchActive; + + BaseCPU *cpu; + Shader *shader; + ClDriver *driver; + TickEvent tickEvent; + + static GpuDispatcher *instance; + + // sycall emulation mode can have only 1 application running(?) + // else we have to do some pid based tagging + // unused + typedef std::unordered_map<uint64_t, uint64_t> TranslationBuffer; + TranslationBuffer tlb; + + public: + /*statistics*/ + Stats::Scalar num_kernelLaunched; + GpuDispatcher(const Params *p); + + ~GpuDispatcher() { } + + void exec(); + virtual void serialize(CheckpointOut &cp) const; + virtual void unserialize(CheckpointIn &cp); + void notifyWgCompl(Wavefront *w); + void scheduleDispatch(); + void accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off); + + // using singleton so that glue code can pass pointer locations + // to the dispatcher. 
when there are multiple dispatchers, we can + // call something like getInstance(index) + static void + setInstance(GpuDispatcher *_instance) + { + instance = _instance; + } + + static GpuDispatcher* getInstance() { return instance; } + + class TLBPort : public MasterPort + { + public: + + TLBPort(const std::string &_name, GpuDispatcher *_dispatcher) + : MasterPort(_name, _dispatcher), dispatcher(_dispatcher) { } + + protected: + GpuDispatcher *dispatcher; + + virtual bool recvTimingResp(PacketPtr pkt) { return true; } + virtual Tick recvAtomic(PacketPtr pkt) { return 0; } + virtual void recvFunctional(PacketPtr pkt) { } + virtual void recvRangeChange() { } + virtual void recvReqRetry() { } + + }; + + TLBPort *tlbPort; + + virtual BaseMasterPort& getMasterPort(const std::string &if_name, + PortID idx); + + AddrRangeList getAddrRanges() const; + Tick read(PacketPtr pkt); + Tick write(PacketPtr pkt); + + // helper functions to retrieve/set GPU attributes + int getNumCUs(); + void setFuncargsSize(int funcargs_size); +}; + +#endif // __GPU_DISPATCHER_HH__ diff --git a/src/gpu-compute/exec_stage.cc b/src/gpu-compute/exec_stage.cc new file mode 100644 index 000000000..c2b95f85e --- /dev/null +++ b/src/gpu-compute/exec_stage.cc @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: John Kalamatianos, Sooraj Puthoor + */ + +#include "gpu-compute/exec_stage.hh" + +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/wavefront.hh" + +ExecStage::ExecStage(const ComputeUnitParams *p) : numSIMDs(p->num_SIMDs), + numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes), + vectorAluInstAvail(nullptr), glbMemInstAvail(nullptr), + shrMemInstAvail(nullptr), lastTimeInstExecuted(false), + thisTimeInstExecuted(false), instrExecuted (false), + executionResourcesUsed(0) +{ + numTransActiveIdle = 0; + idle_dur = 0; +} + +void +ExecStage::init(ComputeUnit *cu) +{ + computeUnit = cu; + _name = computeUnit->name() + ".ExecStage"; + dispatchList = &computeUnit->dispatchList; + vectorAluInstAvail = &(computeUnit->vectorAluInstAvail); + glbMemInstAvail= &(computeUnit->glbMemInstAvail); + shrMemInstAvail= &(computeUnit->shrMemInstAvail); + idle_dur = 0; +} + +void +ExecStage::collectStatistics(enum STAT_STATUS stage, int unitId) { + if (stage == IdleExec) { + // count cycles of no vector ALU instruction executed + // even if one was the oldest in a WV of that vector SIMD unit + if (computeUnit->isVecAlu(unitId) && vectorAluInstAvail->at(unitId)) { + numCyclesWithNoInstrTypeIssued[unitId]++; + } + + // count cycles of no global memory (vector) instruction executed + // even if one was the oldest in a WV of that vector SIMD unit + if (computeUnit->isGlbMem(unitId) && *glbMemInstAvail > 0) { + numCyclesWithNoInstrTypeIssued[unitId]++; + (*glbMemInstAvail)--; + } + + // count cycles of no shared memory (vector) instruction executed + // even if one was the oldest in a WV of that vector SIMD unit + if (computeUnit->isShrMem(unitId) && *shrMemInstAvail > 0) { + numCyclesWithNoInstrTypeIssued[unitId]++; + (*shrMemInstAvail)--; + } + } else if (stage == BusyExec) { + // count the number of cycles an instruction to a specific unit + // was issued + numCyclesWithInstrTypeIssued[unitId]++; + thisTimeInstExecuted = true; + instrExecuted = true; + ++executionResourcesUsed; + } else if (stage == PostExec) { + // count the number of transitions from active to idle + if (lastTimeInstExecuted && !thisTimeInstExecuted) { + ++numTransActiveIdle; + } + + if (!lastTimeInstExecuted && thisTimeInstExecuted) { + idleDur.sample(idle_dur); + idle_dur = 0; + } else if (!thisTimeInstExecuted) { + idle_dur++; + } + + lastTimeInstExecuted = thisTimeInstExecuted; + // track the number of cycles we either issued one vector instruction + // or issued no instructions at all + if (instrExecuted) { + numCyclesWithInstrIssued++; + } else { + numCyclesWithNoIssue++; + } + + spc.sample(executionResourcesUsed); + } +} + +void +ExecStage::initStatistics() +{ + instrExecuted = false; + executionResourcesUsed = 0; + thisTimeInstExecuted = false; +} + +void +ExecStage::exec() +{ + initStatistics(); + + for (int unitId = 0; unitId < (numSIMDs + numMemUnits); ++unitId) { + // if dispatch list for this execution resource is empty, + // skip this execution resource this cycle + if (dispatchList->at(unitId).second == EMPTY) { + collectStatistics(IdleExec, unitId); + continue; + } + + collectStatistics(BusyExec, unitId); + // execute an instruction for the WF + dispatchList->at(unitId).first->exec(); + // clear the dispatch list entry + dispatchList->at(unitId).second = EMPTY; + dispatchList->at(unitId).first = (Wavefront*)nullptr; + } + + collectStatistics(PostExec, 0); +} + +void +ExecStage::regStats() +{ + numTransActiveIdle + .name(name() + ".num_transitions_active_to_idle") + .desc("number of CU 
transitions from active to idle") + ; + + numCyclesWithNoIssue + .name(name() + ".num_cycles_with_no_issue") + .desc("number of cycles the CU issues nothing") + ; + + numCyclesWithInstrIssued + .name(name() + ".num_cycles_with_instr_issued") + .desc("number of cycles the CU issued at least one instruction") + ; + + spc + .init(0, numSIMDs + numMemUnits, 1) + .name(name() + ".spc") + .desc("Execution units active per cycle (Exec unit=SIMD,MemPipe)") + ; + + idleDur + .init(0,75,5) + .name(name() + ".idle_duration_in_cycles") + .desc("duration of idle periods in cycles") + ; + + numCyclesWithInstrTypeIssued + .init(numSIMDs + numMemUnits) + .name(name() + ".num_cycles_with_instrtype_issue") + .desc("Number of cycles at least one instruction of specific type " + "issued") + ; + + numCyclesWithNoInstrTypeIssued + .init(numSIMDs + numMemUnits) + .name(name() + ".num_cycles_with_instr_type_no_issue") + .desc("Number of cycles no instruction of specific type issued") + ; + + for (int i = 0; i < numSIMDs; ++i) { + numCyclesWithInstrTypeIssued.subname(i, csprintf("ALU%d",i)); + numCyclesWithNoInstrTypeIssued.subname(i, csprintf("ALU%d",i)); + } + + numCyclesWithInstrTypeIssued.subname(numSIMDs, csprintf("GM")); + numCyclesWithNoInstrTypeIssued.subname(numSIMDs, csprintf("GM")); + numCyclesWithInstrTypeIssued.subname(numSIMDs + 1, csprintf("LM")); + numCyclesWithNoInstrTypeIssued.subname(numSIMDs + 1, csprintf("LM")); +} diff --git a/src/gpu-compute/exec_stage.hh b/src/gpu-compute/exec_stage.hh new file mode 100644 index 000000000..2de74366b --- /dev/null +++ b/src/gpu-compute/exec_stage.hh @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: John Kalamatianos, Sooraj Puthoor + */ + +#ifndef __EXEC_STAGE_HH__ +#define __EXEC_STAGE_HH__ + +#include <string> +#include <utility> +#include <vector> + +#include "sim/stats.hh" + +class ComputeUnit; +class Wavefront; +struct ComputeUnitParams; + +enum STAT_STATUS +{ + IdleExec, + BusyExec, + PostExec +}; + +enum DISPATCH_STATUS +{ + EMPTY = 0, + FILLED +}; + +// Execution stage. +// Each execution resource executes the +// wave which is in its dispatch list. +// The schedule stage is responsible for +// adding a wave into each execution resource's +// dispatch list. + +class ExecStage +{ + public: + ExecStage(const ComputeUnitParams* params); + ~ExecStage() { } + void init(ComputeUnit *cu); + void exec(); + + std::string name() { return _name; } + void regStats(); + // number of idle cycles + Stats::Scalar numCyclesWithNoIssue; + // number of busy cycles + Stats::Scalar numCyclesWithInstrIssued; + // number of cycles (per execution unit) during which at least one + // instruction was issued to that unit + Stats::Vector numCyclesWithInstrTypeIssued; + // number of idle cycles (per execution unit) during which the unit issued + // no instruction targeting that unit, even though there is at least one + // Wavefront with such an instruction as the oldest + Stats::Vector numCyclesWithNoInstrTypeIssued; + // SIMDs active per cycle + Stats::Distribution spc; + + private: + void collectStatistics(enum STAT_STATUS stage, int unitId); + void initStatistics(); + ComputeUnit *computeUnit; + uint32_t numSIMDs; + + // Number of memory execution resources; + // both global and local memory execution resources in CU + uint32_t numMemUnits; + + // List of waves which will be dispatched to + // each execution resource. A FILLED implies + // dispatch list is non-empty and + // execution unit has something to execute + // this cycle. Currently, the dispatch list of + // an execution resource can hold only one wave because + // an execution resource can execute only one wave in a cycle. + // dispatchList is used to communicate between schedule + // and exec stage + std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> *dispatchList; + // flag per vector SIMD unit that is set when there is at least one + // WV that has a vector ALU instruction as the oldest in its + // Instruction Buffer + std::vector<bool> *vectorAluInstAvail; + int *glbMemInstAvail; + int *shrMemInstAvail; + bool lastTimeInstExecuted; + bool thisTimeInstExecuted; + bool instrExecuted; + Stats::Scalar numTransActiveIdle; + Stats::Distribution idleDur; + uint32_t executionResourcesUsed; + uint64_t idle_dur; + std::string _name; +}; + +#endif // __EXEC_STAGE_HH__ diff --git a/src/gpu-compute/fetch_stage.cc b/src/gpu-compute/fetch_stage.cc new file mode 100644 index 000000000..1f5e6ded3 --- /dev/null +++ b/src/gpu-compute/fetch_stage.cc @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez, Sooraj Puthoor + */ + +#include "gpu-compute/fetch_stage.hh" + +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/wavefront.hh" + +FetchStage::FetchStage(const ComputeUnitParams* p) : numSIMDs(p->num_SIMDs), + computeUnit(nullptr) +{ + for (int j = 0; j < numSIMDs; ++j) { + FetchUnit newFetchUnit(p); + fetchUnit.push_back(newFetchUnit); + } +} + +FetchStage::~FetchStage() +{ + fetchUnit.clear(); +} + +void +FetchStage::init(ComputeUnit *cu) +{ + computeUnit = cu; + _name = computeUnit->name() + ".FetchStage"; + + for (int j = 0; j < numSIMDs; ++j) { + fetchUnit[j].bindWaveList(&computeUnit->wfList[j]); + fetchUnit[j].init(computeUnit); + } +} + +void +FetchStage::exec() +{ + for (int j = 0; j < numSIMDs; ++j) { + fetchUnit[j].exec(); + } +} + +void +FetchStage::processFetchReturn(PacketPtr pkt) +{ + ComputeUnit::SQCPort::SenderState *sender_state = + safe_cast<ComputeUnit::SQCPort::SenderState*>(pkt->senderState); + + Wavefront *wavefront = sender_state->wavefront; + + const unsigned num_instructions = pkt->req->getSize() / + sizeof(TheGpuISA::RawMachInst); + + instFetchInstReturned.sample(num_instructions); + uint32_t simdId = wavefront->simdId; + fetchUnit[simdId].processFetchReturn(pkt); +} + +void +FetchStage::fetch(PacketPtr pkt, Wavefront *wavefront) +{ + fetchUnit[wavefront->simdId].fetch(pkt, wavefront); +} + +void +FetchStage::regStats() +{ + instFetchInstReturned + .init(1, 32, 1) + .name(name() + ".inst_fetch_instr_returned") + .desc("For each instruction fetch request recieved record how many " + "instructions you got from it") + ; +} diff --git a/src/gpu-compute/fetch_stage.hh b/src/gpu-compute/fetch_stage.hh new file mode 100644 index 000000000..ce7faa8ac --- /dev/null +++ b/src/gpu-compute/fetch_stage.hh @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez, Sooraj Puthoor + */ + +#ifndef __FETCH_STAGE_HH__ +#define __FETCH_STAGE_HH__ + +#include <string> +#include <vector> + +#include "gpu-compute/fetch_unit.hh" + +// Instruction fetch stage. +// All dispatched wavefronts for all SIMDS are analyzed for the +// need to fetch instructions. From the fetch eligible waves, +// one wave is selected from each SIMD and fetch is initiated +// for the selected waves. + +class ComputeUnit; +class Wavefront; + +class FetchStage +{ + public: + FetchStage(const ComputeUnitParams* params); + ~FetchStage(); + void init(ComputeUnit *cu); + void exec(); + void processFetchReturn(PacketPtr pkt); + void fetch(PacketPtr pkt, Wavefront *wave); + + // Stats related variables and methods + std::string name() { return _name; } + void regStats(); + Stats::Distribution instFetchInstReturned; + + private: + uint32_t numSIMDs; + ComputeUnit *computeUnit; + + // List of fetch units. A fetch unit is + // instantiated per SIMD + std::vector<FetchUnit> fetchUnit; + std::string _name; +}; + +#endif // __FETCH_STAGE_HH__ diff --git a/src/gpu-compute/fetch_unit.cc b/src/gpu-compute/fetch_unit.cc new file mode 100644 index 000000000..1f0a7d78e --- /dev/null +++ b/src/gpu-compute/fetch_unit.cc @@ -0,0 +1,293 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Brad Beckmann, Sooraj Puthoor
+ */
+
+#include "gpu-compute/fetch_unit.hh"
+
+#include "debug/GPUFetch.hh"
+#include "debug/GPUPort.hh"
+#include "debug/GPUTLB.hh"
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/gpu_static_inst.hh"
+#include "gpu-compute/shader.hh"
+#include "gpu-compute/wavefront.hh"
+#include "mem/ruby/system/RubySystem.hh"
+
+uint32_t FetchUnit::globalFetchUnitID;
+
+FetchUnit::FetchUnit(const ComputeUnitParams* params) :
+    timingSim(true),
+    computeUnit(nullptr),
+    fetchScheduler(params),
+    waveList(nullptr)
+{
+}
+
+FetchUnit::~FetchUnit()
+{
+    fetchQueue.clear();
+    fetchStatusQueue.clear();
+}
+
+void
+FetchUnit::init(ComputeUnit *cu)
+{
+    computeUnit = cu;
+    timingSim = computeUnit->shader->timingSim;
+    fetchQueue.clear();
+    fetchStatusQueue.resize(computeUnit->shader->n_wf);
+
+    for (int j = 0; j < computeUnit->shader->n_wf; ++j) {
+        fetchStatusQueue[j] = std::make_pair(waveList->at(j), false);
+    }
+
+    fetchScheduler.bindList(&fetchQueue);
+}
+
+void
+FetchUnit::exec()
+{
+    // re-evaluate waves which are marked as not ready for fetch
+    for (int j = 0; j < computeUnit->shader->n_wf; ++j) {
+        // The following code assumes 64-bit operation and that all insts
+        // are represented by 64-bit pointers to inst objects.
+        Wavefront *curWave = fetchStatusQueue[j].first;
+        assert(curWave);
+
+        // The wavefront has to be active, its instruction buffer may hold
+        // at most four instructions, and it must not contain any branches,
+        // to prevent speculative instruction fetches
+        if (!fetchStatusQueue[j].second) {
+            if (curWave->status == Wavefront::S_RUNNING &&
+                curWave->instructionBuffer.size() <= 4 &&
+                !curWave->instructionBufferHasBranch() &&
+                !curWave->pendingFetch) {
+                fetchQueue.push_back(curWave);
+                fetchStatusQueue[j].second = true;
+            }
+        }
+    }
+
+    // Fetch only if there is some wave ready to be fetched;
+    // an empty fetchQueue will cause the scheduler to panic
+    if (fetchQueue.size()) {
+        Wavefront *waveToBeFetched = fetchScheduler.chooseWave();
+        waveToBeFetched->pendingFetch = true;
+        fetchStatusQueue[waveToBeFetched->wfSlotId].second = false;
+        initiateFetch(waveToBeFetched);
+    }
+}
+
+void
+FetchUnit::initiateFetch(Wavefront *wavefront)
+{
+    // calculate the virtual address to fetch from the SQC
+    Addr vaddr = wavefront->pc() + wavefront->instructionBuffer.size();
+    vaddr = wavefront->base_ptr + vaddr * sizeof(GPUStaticInst*);
+
+    DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Initiating fetch translation: %#x\n",
+            computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, vaddr);
+
+    // Since this is an instruction prefetch, if the access is split then
+    // just finish out the current line.
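+    // For a concrete illustration (assuming Ruby's default 64-byte block):
+    // with vaddr = 0x1030 the code below computes
+    // split_addr = roundDown(0x1030 + 63, 64) = 0x1040 > vaddr, so only the
+    // remaining size = 0x10 bytes of the current line are requested; for an
+    // aligned vaddr = 0x1000, split_addr equals vaddr and the full 64-byte
+    // block is fetched.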
+ unsigned block_size = RubySystem::getBlockSizeBytes(); + // check for split accesses + Addr split_addr = roundDown(vaddr + block_size - 1, block_size); + unsigned size = block_size; + + if (split_addr > vaddr) { + // misaligned access, just grab the rest of the line + size = split_addr - vaddr; + } + + // set up virtual request + Request *req = new Request(0, vaddr, size, Request::INST_FETCH, + computeUnit->masterId(), 0, 0, 0); + + PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + // This fetchBlock is kind of faux right now - because the translations so + // far don't actually return Data + uint64_t fetchBlock; + pkt->dataStatic(&fetchBlock); + + if (timingSim) { + // SenderState needed on Return + pkt->senderState = new ComputeUnit::ITLBPort::SenderState(wavefront); + + // Sender State needed by TLB hierarchy + pkt->senderState = + new TheISA::GpuTLB::TranslationState(BaseTLB::Execute, + computeUnit->shader->gpuTc, + false, pkt->senderState); + + if (computeUnit->sqcTLBPort->isStalled()) { + assert(computeUnit->sqcTLBPort->retries.size() > 0); + + DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n", + vaddr); + + computeUnit->sqcTLBPort->retries.push_back(pkt); + } else if (!computeUnit->sqcTLBPort->sendTimingReq(pkt)) { + // Stall the data port; + // No more packet is issued till + // ruby indicates resources are freed by + // a recvReqRetry() call back on this port. + computeUnit->sqcTLBPort->stallPort(); + + DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n", + vaddr); + + computeUnit->sqcTLBPort->retries.push_back(pkt); + } else { + DPRINTF(GPUTLB, "sent FETCH translation request for %#x\n", vaddr); + } + } else { + pkt->senderState = + new TheISA::GpuTLB::TranslationState(BaseTLB::Execute, + computeUnit->shader->gpuTc); + + computeUnit->sqcTLBPort->sendFunctional(pkt); + + TheISA::GpuTLB::TranslationState *sender_state = + safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState); + + delete sender_state->tlbEntry; + delete sender_state; + // fetch the instructions from the SQC when we operate in + // functional mode only + fetch(pkt, wavefront); + } +} + +void +FetchUnit::fetch(PacketPtr pkt, Wavefront *wavefront) +{ + assert(pkt->req->hasPaddr()); + assert(pkt->req->hasSize()); + + DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch Access: %#x\n", + computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, + pkt->req->getPaddr()); + + // this is necessary because the GPU TLB receives packets instead of + // requests. when the translation is complete, all relevent fields in the + // request will be populated, but not in the packet. here we create the + // new packet so we can set the size, addr, and proper flags. + PacketPtr oldPkt = pkt; + pkt = new Packet(oldPkt->req, oldPkt->cmd); + delete oldPkt; + + TheGpuISA::RawMachInst *data = + new TheGpuISA::RawMachInst[pkt->req->getSize() / + sizeof(TheGpuISA::RawMachInst)]; + + pkt->dataDynamic<TheGpuISA::RawMachInst>(data); + + // New SenderState for the memory access + pkt->senderState = new ComputeUnit::SQCPort::SenderState(wavefront); + + if (timingSim) { + // translation is done. Send the appropriate timing memory request. 
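+        //
+        // If the SQC port cannot accept the packet right now, the packet is
+        // parked on the port's retries list (below); it is presumably
+        // re-sent once the memory system calls back with a recvReqRetry()
+        // on that port.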
+ + if (!computeUnit->sqcPort->sendTimingReq(pkt)) { + computeUnit->sqcPort->retries.push_back(std::make_pair(pkt, + wavefront)); + + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Fetch addr %#x failed!\n", + computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, + pkt->req->getPaddr()); + } else { + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Fetch addr %#x sent!\n", + computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, + pkt->req->getPaddr()); + } + } else { + computeUnit->sqcPort->sendFunctional(pkt); + processFetchReturn(pkt); + } +} + +void +FetchUnit::processFetchReturn(PacketPtr pkt) +{ + ComputeUnit::SQCPort::SenderState *sender_state = + safe_cast<ComputeUnit::SQCPort::SenderState*>(pkt->senderState); + + Wavefront *wavefront = sender_state->wavefront; + + DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch addr %#x returned " + "%d bytes, %d instructions!\n", computeUnit->cu_id, + wavefront->simdId, wavefront->wfSlotId, pkt->req->getPaddr(), + pkt->req->getSize(), pkt->req->getSize() / + sizeof(TheGpuISA::RawMachInst)); + + if (wavefront->dropFetch) { + assert(wavefront->instructionBuffer.empty()); + wavefront->dropFetch = false; + } else { + TheGpuISA::RawMachInst *inst_index_ptr = + (TheGpuISA::RawMachInst*)pkt->getPtr<uint8_t>(); + + assert(wavefront->instructionBuffer.size() <= 4); + + for (int i = 0; i < pkt->req->getSize() / + sizeof(TheGpuISA::RawMachInst); ++i) { + GPUStaticInst *inst_ptr = decoder.decode(inst_index_ptr[i]); + + assert(inst_ptr); + DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: added %s\n", + computeUnit->cu_id, wavefront->simdId, + wavefront->wfSlotId, inst_ptr->disassemble()); + + GPUDynInstPtr gpuDynInst = + std::make_shared<GPUDynInst>(computeUnit, wavefront, inst_ptr, + computeUnit->getAndIncSeqNum()); + + wavefront->instructionBuffer.push_back(gpuDynInst); + } + } + + wavefront->pendingFetch = false; + + delete pkt->senderState; + delete pkt->req; + delete pkt; +} + +void +FetchUnit::bindWaveList(std::vector<Wavefront*> *wave_list) +{ + waveList = wave_list; +} diff --git a/src/gpu-compute/fetch_unit.hh b/src/gpu-compute/fetch_unit.hh new file mode 100644 index 000000000..c7c6afb3c --- /dev/null +++ b/src/gpu-compute/fetch_unit.hh @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Brad Beckmann, Sooraj Puthoor + */ + +#ifndef __FETCH_UNIT_HH__ +#define __FETCH_UNIT_HH__ + +#include <string> +#include <utility> +#include <vector> + +#include "arch/gpu_decoder.hh" +#include "base/statistics.hh" +#include "config/the_gpu_isa.hh" +#include "gpu-compute/scheduler.hh" +#include "mem/packet.hh" + +class ComputeUnit; +class Wavefront; + +class FetchUnit +{ + public: + FetchUnit(const ComputeUnitParams* params); + ~FetchUnit(); + void init(ComputeUnit *cu); + void exec(); + void bindWaveList(std::vector<Wavefront*> *list); + void initiateFetch(Wavefront *wavefront); + void fetch(PacketPtr pkt, Wavefront *wavefront); + void processFetchReturn(PacketPtr pkt); + static uint32_t globalFetchUnitID; + + private: + bool timingSim; + ComputeUnit *computeUnit; + TheGpuISA::Decoder decoder; + + // Fetch scheduler; Selects one wave from + // the fetch queue for instruction fetching. + // The selection is made according to + // a scheduling policy + Scheduler fetchScheduler; + + // Stores the list of waves that are + // ready to be fetched this cycle + std::vector<Wavefront*> fetchQueue; + + // Stores the fetch status of all waves dispatched to this SIMD. + // TRUE implies the wave is ready to fetch and is already + // moved to fetchQueue + std::vector<std::pair<Wavefront*, bool>> fetchStatusQueue; + + // Pointer to list of waves dispatched on to this SIMD unit + std::vector<Wavefront*> *waveList; +}; + +#endif // __FETCH_UNIT_HH__ diff --git a/src/gpu-compute/global_memory_pipeline.cc b/src/gpu-compute/global_memory_pipeline.cc new file mode 100644 index 000000000..913327412 --- /dev/null +++ b/src/gpu-compute/global_memory_pipeline.cc @@ -0,0 +1,242 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos, Sooraj Puthoor + */ + +#include "gpu-compute/global_memory_pipeline.hh" + +#include "debug/GPUMem.hh" +#include "debug/GPUReg.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/vector_register_file.hh" +#include "gpu-compute/wavefront.hh" + +GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams* p) : + computeUnit(nullptr), gmQueueSize(p->global_mem_queue_size), + inflightStores(0), inflightLoads(0) +{ +} + +void +GlobalMemPipeline::init(ComputeUnit *cu) +{ + computeUnit = cu; + globalMemSize = computeUnit->shader->globalMemSize; + _name = computeUnit->name() + ".GlobalMemPipeline"; +} + +void +GlobalMemPipeline::exec() +{ + // apply any returned global memory operations + GPUDynInstPtr m = !gmReturnedLoads.empty() ? gmReturnedLoads.front() : + !gmReturnedStores.empty() ? gmReturnedStores.front() : nullptr; + + bool accessVrf = true; + // check the VRF to see if the operands of a load (or load component + // of an atomic) are accessible + if ((m) && (m->m_op==Enums::MO_LD || MO_A(m->m_op))) { + Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId]; + + accessVrf = + w->computeUnit->vrf[m->simdId]-> + vrfOperandAccessReady(m->seqNum(), w, m, + VrfAccessType::WRITE); + } + + if ((!gmReturnedStores.empty() || !gmReturnedLoads.empty()) && + m->latency.rdy() && computeUnit->glbMemToVrfBus.rdy() && + accessVrf && m->statusBitVector == VectorMask(0) && + (computeUnit->shader->coissue_return || + computeUnit->wfWait.at(m->pipeId).rdy())) { + + if (m->v_type == VT_32 && m->m_type == Enums::M_U8) + doGmReturn<uint32_t, uint8_t>(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_U16) + doGmReturn<uint32_t, uint16_t>(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_U32) + doGmReturn<uint32_t, uint32_t>(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_S8) + doGmReturn<int32_t, int8_t>(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_S16) + doGmReturn<int32_t, int16_t>(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_S32) + doGmReturn<int32_t, int32_t>(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_F16) + doGmReturn<float, Float16>(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_F32) + doGmReturn<float, float>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_U8) + doGmReturn<uint64_t, uint8_t>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_U16) + doGmReturn<uint64_t, uint16_t>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_U32) + doGmReturn<uint64_t, uint32_t>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_U64) + doGmReturn<uint64_t, uint64_t>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_S8) + doGmReturn<int64_t, int8_t>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_S16) + doGmReturn<int64_t, int16_t>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_S32) + doGmReturn<int64_t, 
int32_t>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_S64) + doGmReturn<int64_t, int64_t>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_F16) + doGmReturn<double, Float16>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_F32) + doGmReturn<double, float>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_F64) + doGmReturn<double, double>(m); + } + + // If pipeline has executed a global memory instruction + // execute global memory packets and issue global + // memory packets to DTLB + if (!gmIssuedRequests.empty()) { + GPUDynInstPtr mp = gmIssuedRequests.front(); + if (mp->m_op == Enums::MO_LD || + (mp->m_op >= Enums::MO_AAND && mp->m_op <= Enums::MO_AMIN) || + (mp->m_op >= Enums::MO_ANRAND && mp->m_op <= Enums::MO_ANRMIN)) { + + if (inflightLoads >= gmQueueSize) { + return; + } else { + ++inflightLoads; + } + } else { + if (inflightStores >= gmQueueSize) { + return; + } else { + ++inflightStores; + } + } + + mp->initiateAcc(mp); + gmIssuedRequests.pop(); + + DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = %s\n", + computeUnit->cu_id, mp->simdId, mp->wfSlotId, + Enums::MemOpTypeStrings[mp->m_op]); + } +} + +template<typename c0, typename c1> +void +GlobalMemPipeline::doGmReturn(GPUDynInstPtr m) +{ + Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId]; + + // Return data to registers + if (m->m_op == Enums::MO_LD || MO_A(m->m_op) || MO_ANR(m->m_op)) { + gmReturnedLoads.pop(); + assert(inflightLoads > 0); + --inflightLoads; + + if (m->m_op == Enums::MO_LD || MO_A(m->m_op)) { + std::vector<uint32_t> regVec; + // iterate over number of destination register operands since + // this is a load or atomic operation + for (int k = 0; k < m->n_reg; ++k) { + assert((sizeof(c1) * m->n_reg) <= MAX_WIDTH_FOR_MEM_INST); + int dst = m->dst_reg + k; + + if (m->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST) + dst = m->dst_reg_vec[k]; + // virtual->physical VGPR mapping + int physVgpr = w->remap(dst, sizeof(c0), 1); + // save the physical VGPR index + regVec.push_back(physVgpr); + c1 *p1 = &((c1*)m->d_data)[k * VSZ]; + + for (int i = 0; i < VSZ; ++i) { + if (m->exec_mask[i]) { + DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: " + "$%s%d <- %d global ld done (src = wavefront " + "ld inst)\n", w->computeUnit->cu_id, w->simdId, + w->wfSlotId, i, sizeof(c0) == 4 ? "s" : "d", + dst, *p1); + // write the value into the physical VGPR. This is a + // purely functional operation. No timing is modeled. + w->computeUnit->vrf[w->simdId]->write<c0>(physVgpr, + *p1, i); + } + ++p1; + } + } + + // Schedule the write operation of the load data on the VRF. + // This simply models the timing aspect of the VRF write operation. + // It does not modify the physical VGPR. 
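+            //
+            // The value returned by the VRF's exec() call below is added to
+            // loadVrfBankConflictCycles, i.e. the number of cycles this
+            // writeback is delayed by VRF bank conflicts (see regStats()).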
+ loadVrfBankConflictCycles += + w->computeUnit->vrf[w->simdId]->exec(m->seqNum(), + w, regVec, sizeof(c0), + m->time); + } + } else { + gmReturnedStores.pop(); + assert(inflightStores > 0); + --inflightStores; + } + + // Decrement outstanding register count + computeUnit->shader->ScheduleAdd(&w->outstanding_reqs, m->time, -1); + + if (m->m_op == Enums::MO_ST || MO_A(m->m_op) || MO_ANR(m->m_op) || + MO_H(m->m_op)) { + computeUnit->shader->ScheduleAdd(&w->outstanding_reqs_wr_gm, m->time, + -1); + } + + if (m->m_op == Enums::MO_LD || MO_A(m->m_op) || MO_ANR(m->m_op)) { + computeUnit->shader->ScheduleAdd(&w->outstanding_reqs_rd_gm, m->time, + -1); + } + + // Mark write bus busy for appropriate amount of time + computeUnit->glbMemToVrfBus.set(m->time); + if (!computeUnit->shader->coissue_return) + w->computeUnit->wfWait.at(m->pipeId).set(m->time); +} + +void +GlobalMemPipeline::regStats() +{ + loadVrfBankConflictCycles + .name(name() + ".load_vrf_bank_conflict_cycles") + .desc("total number of cycles GM data are delayed before updating " + "the VRF") + ; +} diff --git a/src/gpu-compute/global_memory_pipeline.hh b/src/gpu-compute/global_memory_pipeline.hh new file mode 100644 index 000000000..ed49f6f6b --- /dev/null +++ b/src/gpu-compute/global_memory_pipeline.hh @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos, Sooraj Puthoor + */ + +#ifndef __GLOBAL_MEMORY_PIPELINE_HH__ +#define __GLOBAL_MEMORY_PIPELINE_HH__ + +#include <queue> +#include <string> + +#include "gpu-compute/misc.hh" +#include "params/ComputeUnit.hh" +#include "sim/stats.hh" + +/* + * @file global_memory_pipeline.hh + * + * The global memory pipeline issues newly created global memory packets + * from the pipeline to DTLB. The exec() method of the memory packet issues + * the packet to the DTLB if there is space available in the return fifo. 
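+ * (The return FIFOs are gmReturnedLoads and gmReturnedStores; requests that
+ * are yet to be issued wait in gmIssuedRequests; all three are declared
+ * below.)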
+ * This stage also retires previously issued loads and stores that have + * returned from the memory sub-system. + */ + +class ComputeUnit; + +class GlobalMemPipeline +{ + public: + GlobalMemPipeline(const ComputeUnitParams *params); + void init(ComputeUnit *cu); + void exec(); + + template<typename c0, typename c1> void doGmReturn(GPUDynInstPtr m); + + std::queue<GPUDynInstPtr> &getGMReqFIFO() { return gmIssuedRequests; } + std::queue<GPUDynInstPtr> &getGMStRespFIFO() { return gmReturnedStores; } + std::queue<GPUDynInstPtr> &getGMLdRespFIFO() { return gmReturnedLoads; } + + bool + isGMLdRespFIFOWrRdy() const + { + return gmReturnedLoads.size() < gmQueueSize; + } + + bool + isGMStRespFIFOWrRdy() const + { + return gmReturnedStores.size() < gmQueueSize; + } + + bool + isGMReqFIFOWrRdy(uint32_t pendReqs=0) const + { + return (gmIssuedRequests.size() + pendReqs) < gmQueueSize; + } + + const std::string &name() const { return _name; } + void regStats(); + + private: + ComputeUnit *computeUnit; + std::string _name; + int gmQueueSize; + + // number of cycles of delaying the update of a VGPR that is the + // target of a load instruction (or the load component of an atomic) + // The delay is due to VRF bank conflicts + Stats::Scalar loadVrfBankConflictCycles; + // Counters to track the inflight loads and stores + // so that we can provide the proper backpressure + // on the number of inflight memory operations. + int inflightStores; + int inflightLoads; + + // The size of global memory. + int globalMemSize; + + // Global Memory Request FIFO: all global memory requests + // are issued to this FIFO from the memory pipelines + std::queue<GPUDynInstPtr> gmIssuedRequests; + + // Globa Store Response FIFO: all responses of global memory + // stores are sent to this FIFO from TCP + std::queue<GPUDynInstPtr> gmReturnedStores; + + // Global Load Response FIFO: all responses of global memory + // loads are sent to this FIFO from TCP + std::queue<GPUDynInstPtr> gmReturnedLoads; +}; + +#endif // __GLOBAL_MEMORY_PIPELINE_HH__ diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc new file mode 100644 index 000000000..83e348dbe --- /dev/null +++ b/src/gpu-compute/gpu_dyn_inst.cc @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#include "gpu-compute/gpu_dyn_inst.hh" + +#include "debug/GPUMem.hh" +#include "gpu-compute/gpu_static_inst.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/wavefront.hh" + +GPUDynInst::GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, + GPUStaticInst *_staticInst, uint64_t instSeqNum) + : GPUExecContext(_cu, _wf), m_op(Enums::MO_UNDEF), + memoryOrder(Enums::MEMORY_ORDER_NONE), useContinuation(false), + statusBitVector(0), staticInst(_staticInst), _seqNum(instSeqNum) +{ + tlbHitLevel.assign(VSZ, -1); +} + +void +GPUDynInst::execute() +{ + GPUDynInstPtr gpuDynInst = std::make_shared<GPUDynInst>(cu, wf, staticInst, + _seqNum); + staticInst->execute(gpuDynInst); +} + +int +GPUDynInst::numSrcRegOperands() +{ + return staticInst->numSrcRegOperands(); +} + +int +GPUDynInst::numDstRegOperands() +{ + return staticInst->numDstRegOperands(); +} + +int +GPUDynInst::getNumOperands() +{ + return staticInst->getNumOperands(); +} + +bool +GPUDynInst::isVectorRegister(int operandIdx) +{ + return staticInst->isVectorRegister(operandIdx); +} + +bool +GPUDynInst::isScalarRegister(int operandIdx) +{ + return staticInst->isVectorRegister(operandIdx); +} + +int +GPUDynInst::getRegisterIndex(int operandIdx) +{ + return staticInst->getRegisterIndex(operandIdx); +} + +int +GPUDynInst::getOperandSize(int operandIdx) +{ + return staticInst->getOperandSize(operandIdx); +} + +bool +GPUDynInst::isDstOperand(int operandIdx) +{ + return staticInst->isDstOperand(operandIdx); +} + +bool +GPUDynInst::isSrcOperand(int operandIdx) +{ + return staticInst->isSrcOperand(operandIdx); +} + +bool +GPUDynInst::isArgLoad() +{ + return staticInst->isArgLoad(); +} + +const std::string& +GPUDynInst::disassemble() const +{ + return staticInst->disassemble(); +} + +uint64_t +GPUDynInst::seqNum() const +{ + return _seqNum; +} + +Enums::OpType +GPUDynInst::opType() +{ + return staticInst->o_type; +} + +Enums::StorageClassType +GPUDynInst::executedAs() +{ + return staticInst->executed_as; +} + +// Process a memory instruction and (if necessary) submit timing request +void +GPUDynInst::initiateAcc(GPUDynInstPtr gpuDynInst) +{ + DPRINTF(GPUMem, "CU%d: WF[%d][%d]: mempacket status bitvector=%#x\n", + cu->cu_id, simdId, wfSlotId, exec_mask); + + staticInst->initiateAcc(gpuDynInst); + time = 0; +} + +bool +GPUDynInst::scalarOp() const +{ + return staticInst->scalarOp(); +} + +void +GPUDynInst::updateStats() +{ + if (staticInst->isLocalMem()) { + // access to LDS (shared) memory + cu->dynamicLMemInstrCnt++; + } else { + // access to global memory + + // update PageDivergence histogram + int number_pages_touched = cu->pagesTouched.size(); + assert(number_pages_touched); + cu->pageDivergenceDist.sample(number_pages_touched); + + std::pair<ComputeUnit::pageDataStruct::iterator, bool> ret; + + for (auto it : cu->pagesTouched) { + // see if this page has been touched before. if not, this also + // inserts the page into the table. 
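+            //
+            // pageAccesses.insert() returns an (iterator, inserted) pair;
+            // when 'inserted' is false the page was already present, and
+            // the existing entry's counts are bumped below instead.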
+ ret = cu->pageAccesses + .insert(ComputeUnit::pageDataStruct::value_type(it.first, + std::make_pair(1, it.second))); + + // if yes, then update the stats + if (!ret.second) { + ret.first->second.first++; + ret.first->second.second += it.second; + } + } + + cu->pagesTouched.clear(); + + // total number of memory instructions (dynamic) + // Atomics are counted as a single memory instruction. + // this is # memory instructions per wavefronts, not per workitem + cu->dynamicGMemInstrCnt++; + } +} diff --git a/src/gpu-compute/gpu_dyn_inst.hh b/src/gpu-compute/gpu_dyn_inst.hh new file mode 100644 index 000000000..e44d8f80d --- /dev/null +++ b/src/gpu-compute/gpu_dyn_inst.hh @@ -0,0 +1,464 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Anthony Gutierrez + */ + +#ifndef __GPU_DYN_INST_HH__ +#define __GPU_DYN_INST_HH__ + +#include <cstdint> +#include <string> + +#include "enums/GenericMemoryOrder.hh" +#include "enums/GenericMemoryScope.hh" +#include "enums/MemOpType.hh" +#include "enums/MemType.hh" +#include "enums/OpType.hh" +#include "enums/StorageClassType.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_exec_context.hh" + +class GPUStaticInst; + +template<typename T> +class AtomicOpAnd : public TypedAtomicOpFunctor<T> +{ + public: + T a; + + AtomicOpAnd(T _a) : a(_a) { } + void execute(T *b) { *b &= a; } +}; + +template<typename T> +class AtomicOpOr : public TypedAtomicOpFunctor<T> +{ + public: + T a; + AtomicOpOr(T _a) : a(_a) { } + void execute(T *b) { *b |= a; } +}; + +template<typename T> +class AtomicOpXor : public TypedAtomicOpFunctor<T> +{ + public: + T a; + AtomicOpXor(T _a) : a(_a) {} + void execute(T *b) { *b ^= a; } +}; + +template<typename T> +class AtomicOpCAS : public TypedAtomicOpFunctor<T> +{ + public: + T c; + T s; + + ComputeUnit *computeUnit; + + AtomicOpCAS(T _c, T _s, ComputeUnit *compute_unit) + : c(_c), s(_s), computeUnit(compute_unit) { } + + void + execute(T *b) + { + computeUnit->numCASOps++; + + if (*b == c) { + *b = s; + } else { + computeUnit->numFailedCASOps++; + } + + if (computeUnit->xact_cas_mode) { + computeUnit->xactCasLoadMap.clear(); + } + } +}; + +template<typename T> +class AtomicOpExch : public TypedAtomicOpFunctor<T> +{ + public: + T a; + AtomicOpExch(T _a) : a(_a) { } + void execute(T *b) { *b = a; } +}; + +template<typename T> +class AtomicOpAdd : public TypedAtomicOpFunctor<T> +{ + public: + T a; + AtomicOpAdd(T _a) : a(_a) { } + void execute(T *b) { *b += a; } +}; + +template<typename T> +class AtomicOpSub : public TypedAtomicOpFunctor<T> +{ + public: + T a; + AtomicOpSub(T _a) : a(_a) { } + void execute(T *b) { *b -= a; } +}; + +template<typename T> +class AtomicOpInc : public TypedAtomicOpFunctor<T> +{ + public: + AtomicOpInc() { } + void execute(T *b) { *b += 1; } +}; + +template<typename T> +class AtomicOpDec : public TypedAtomicOpFunctor<T> +{ + public: + AtomicOpDec() {} + void execute(T *b) { *b -= 1; } +}; + +template<typename T> +class AtomicOpMax : public TypedAtomicOpFunctor<T> +{ + public: + T a; + AtomicOpMax(T _a) : a(_a) { } + + void + execute(T *b) + { + if (a > *b) + *b = a; + } +}; + +template<typename T> +class AtomicOpMin : public TypedAtomicOpFunctor<T> +{ + public: + T a; + AtomicOpMin(T _a) : a(_a) {} + + void + execute(T *b) + { + if (a < *b) + *b = a; + } +}; + +#define MO_A(a) ((a)>=Enums::MO_AAND && (a)<=Enums::MO_AMIN) +#define MO_ANR(a) ((a)>=Enums::MO_ANRAND && (a)<=Enums::MO_ANRMIN) +#define MO_H(a) ((a)>=Enums::MO_HAND && (a)<=Enums::MO_HMIN) + +typedef enum +{ + VT_32, + VT_64, +} vgpr_type; + +typedef enum +{ + SEG_PRIVATE, + SEG_SPILL, + SEG_GLOBAL, + SEG_SHARED, + SEG_READONLY, + SEG_FLAT +} seg_type; + +class GPUDynInst : public GPUExecContext +{ + public: + GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *_staticInst, + uint64_t instSeqNum); + + void execute(); + int numSrcRegOperands(); + int numDstRegOperands(); + int getNumOperands(); + bool isVectorRegister(int operandIdx); + bool isScalarRegister(int operandIdx); + int getRegisterIndex(int operandIdx); + int getOperandSize(int operandIdx); + bool isDstOperand(int operandIdx); + bool isSrcOperand(int operandIdx); + bool isArgLoad(); + + const std::string &disassemble() const; + + uint64_t seqNum() const; + + Enums::OpType opType(); + 
Enums::StorageClassType executedAs(); + + // The address of the memory operation + Addr addr[VSZ]; + Addr pAddr; + + // The data to get written + uint8_t d_data[VSZ * 16]; + // Additional data (for atomics) + uint8_t a_data[VSZ * 8]; + // Additional data (for atomics) + uint8_t x_data[VSZ * 8]; + // The execution mask + VectorMask exec_mask; + + // The memory type (M_U32, M_S32, ...) + Enums::MemType m_type; + // The memory operation (MO_LD, MO_ST, ...) + Enums::MemOpType m_op; + Enums::GenericMemoryOrder memoryOrder; + + // Scope of the request + Enums::GenericMemoryScope scope; + // The memory segment (SEG_SHARED, SEG_GLOBAL, ...) + seg_type s_type; + // The equivalency class + int equiv; + // The return VGPR type (VT_32 or VT_64) + vgpr_type v_type; + // Number of VGPR's accessed (1, 2, or 4) + int n_reg; + // The return VGPR index + int dst_reg; + // There can be max 4 dest regs> + int dst_reg_vec[4]; + // SIMD where the WF of the memory instruction has been mapped to + int simdId; + // unique id of the WF where the memory instruction belongs to + int wfDynId; + // The kernel id of the requesting wf + int kern_id; + // The CU id of the requesting wf + int cu_id; + // HW slot id where the WF is mapped to inside a SIMD unit + int wfSlotId; + // execution pipeline id where the memory instruction has been scheduled + int pipeId; + // The execution time of this operation + Tick time; + // The latency of this operation + WaitClass latency; + // A list of bank conflicts for the 4 cycles. + uint32_t bc[4]; + + // A pointer to ROM + uint8_t *rom; + // The size of the READONLY segment + int sz_rom; + + // Initiate the specified memory operation, by creating a + // memory request and sending it off to the memory system. + void initiateAcc(GPUDynInstPtr gpuDynInst); + + void updateStats(); + + GPUStaticInst* staticInstruction() { return staticInst; } + + // Is the instruction a scalar or vector op? + bool scalarOp() const; + + /* + * Loads/stores/atomics may have acquire/release semantics associated + * withthem. Some protocols want to see the acquire/release as separate + * requests from the load/store/atomic. We implement that separation + * using continuations (i.e., a function pointer with an object associated + * with it). When, for example, the front-end generates a store with + * release semantics, we will first issue a normal store and set the + * continuation in the GPUDynInst to a function that generate a + * release request. That continuation will be called when the normal + * store completes (in ComputeUnit::DataPort::recvTimingResponse). The + * continuation will be called in the context of the same GPUDynInst + * that generated the initial store. 
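+     *
+     * Roughly, and purely as an illustration:
+     *
+     *   gpuDynInst->useContinuation = true;
+     *   gpuDynInst->execContinuation =
+     *       [](GPUStaticInst *si, GPUDynInstPtr inst) {
+     *           // build and issue the release request here
+     *       };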
+ */ + std::function<void(GPUStaticInst*, GPUDynInstPtr)> execContinuation; + + // when true, call execContinuation when response arrives + bool useContinuation; + + template<typename c0> AtomicOpFunctor* + makeAtomicOpFunctor(c0 *reg0, c0 *reg1, Enums::MemOpType op) + { + using namespace Enums; + + switch(op) { + case MO_AAND: + case MO_ANRAND: + return new AtomicOpAnd<c0>(*reg0); + case MO_AOR: + case MO_ANROR: + return new AtomicOpOr<c0>(*reg0); + case MO_AXOR: + case MO_ANRXOR: + return new AtomicOpXor<c0>(*reg0); + case MO_ACAS: + case MO_ANRCAS: + return new AtomicOpCAS<c0>(*reg0, *reg1, cu); + case MO_AEXCH: + case MO_ANREXCH: + return new AtomicOpExch<c0>(*reg0); + case MO_AADD: + case MO_ANRADD: + return new AtomicOpAdd<c0>(*reg0); + case MO_ASUB: + case MO_ANRSUB: + return new AtomicOpSub<c0>(*reg0); + case MO_AINC: + case MO_ANRINC: + return new AtomicOpInc<c0>(); + case MO_ADEC: + case MO_ANRDEC: + return new AtomicOpDec<c0>(); + case MO_AMAX: + case MO_ANRMAX: + return new AtomicOpMax<c0>(*reg0); + case MO_AMIN: + case MO_ANRMIN: + return new AtomicOpMin<c0>(*reg0); + default: + panic("Unrecognized atomic operation"); + } + } + + void + setRequestFlags(Request *req, bool setMemOrder=true) + { + // currently these are the easy scopes to deduce + switch (s_type) { + case SEG_PRIVATE: + req->setMemSpaceConfigFlags(Request::PRIVATE_SEGMENT); + break; + case SEG_SPILL: + req->setMemSpaceConfigFlags(Request::SPILL_SEGMENT); + break; + case SEG_GLOBAL: + req->setMemSpaceConfigFlags(Request::GLOBAL_SEGMENT); + break; + case SEG_READONLY: + req->setMemSpaceConfigFlags(Request::READONLY_SEGMENT); + break; + case SEG_SHARED: + req->setMemSpaceConfigFlags(Request::GROUP_SEGMENT); + break; + case SEG_FLAT: + // TODO: translate to correct scope + assert(false); + default: + panic("Bad segment type"); + break; + } + + switch (scope) { + case Enums::MEMORY_SCOPE_NONE: + case Enums::MEMORY_SCOPE_WORKITEM: + break; + case Enums::MEMORY_SCOPE_WAVEFRONT: + req->setMemSpaceConfigFlags(Request::SCOPE_VALID | + Request::WAVEFRONT_SCOPE); + break; + case Enums::MEMORY_SCOPE_WORKGROUP: + req->setMemSpaceConfigFlags(Request::SCOPE_VALID | + Request::WORKGROUP_SCOPE); + break; + case Enums::MEMORY_SCOPE_DEVICE: + req->setMemSpaceConfigFlags(Request::SCOPE_VALID | + Request::DEVICE_SCOPE); + break; + case Enums::MEMORY_SCOPE_SYSTEM: + req->setMemSpaceConfigFlags(Request::SCOPE_VALID | + Request::SYSTEM_SCOPE); + break; + default: + panic("Bad scope type"); + break; + } + + if (setMemOrder) { + // set acquire and release flags + switch (memoryOrder){ + case Enums::MEMORY_ORDER_SC_ACQUIRE: + req->setFlags(Request::ACQUIRE); + break; + case Enums::MEMORY_ORDER_SC_RELEASE: + req->setFlags(Request::RELEASE); + break; + case Enums::MEMORY_ORDER_SC_ACQUIRE_RELEASE: + req->setFlags(Request::ACQUIRE | Request::RELEASE); + break; + default: + break; + } + } + + // set atomic type + // currently, the instruction genenerator only produces atomic return + // but a magic instruction can produce atomic no return + if (m_op == Enums::MO_AADD || m_op == Enums::MO_ASUB || + m_op == Enums::MO_AAND || m_op == Enums::MO_AOR || + m_op == Enums::MO_AXOR || m_op == Enums::MO_AMAX || + m_op == Enums::MO_AMIN || m_op == Enums::MO_AINC || + m_op == Enums::MO_ADEC || m_op == Enums::MO_AEXCH || + m_op == Enums::MO_ACAS) { + req->setFlags(Request::ATOMIC_RETURN_OP); + } else if (m_op == Enums::MO_ANRADD || m_op == Enums::MO_ANRSUB || + m_op == Enums::MO_ANRAND || m_op == Enums::MO_ANROR || + m_op == Enums::MO_ANRXOR || m_op == 
Enums::MO_ANRMAX || + m_op == Enums::MO_ANRMIN || m_op == Enums::MO_ANRINC || + m_op == Enums::MO_ANRDEC || m_op == Enums::MO_ANREXCH || + m_op == Enums::MO_ANRCAS) { + req->setFlags(Request::ATOMIC_NO_RETURN_OP); + } + } + + // Map returned packets and the addresses they satisfy with which lane they + // were requested from + typedef std::unordered_map<Addr, std::vector<int>> StatusVector; + StatusVector memStatusVector; + + // Track the status of memory requests per lane, a bit per lane + VectorMask statusBitVector; + // for ld_v# or st_v# + std::vector<int> statusVector; + std::vector<int> tlbHitLevel; + + private: + GPUStaticInst *staticInst; + uint64_t _seqNum; +}; + +#endif // __GPU_DYN_INST_HH__ diff --git a/src/gpu-compute/gpu_exec_context.cc b/src/gpu-compute/gpu_exec_context.cc new file mode 100644 index 000000000..4af69c41e --- /dev/null +++ b/src/gpu-compute/gpu_exec_context.cc @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#include "gpu-compute/gpu_exec_context.hh" + +GPUExecContext::GPUExecContext(ComputeUnit *_cu, Wavefront *_wf) + : cu(_cu), wf(_wf) +{ +} + +ComputeUnit* +GPUExecContext::computeUnit() +{ + return cu; +} + +Wavefront* +GPUExecContext::wavefront() +{ + return wf; +} diff --git a/src/gpu-compute/gpu_exec_context.hh b/src/gpu-compute/gpu_exec_context.hh new file mode 100644 index 000000000..a3deb9b8f --- /dev/null +++ b/src/gpu-compute/gpu_exec_context.hh @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#ifndef __GPU_EXEC_CONTEXT_HH__ +#define __GPU_EXEC_CONTEXT_HH__ + +class ComputeUnit; +class Wavefront; + +class GPUExecContext +{ + public: + GPUExecContext(ComputeUnit *_cu, Wavefront *_wf); + Wavefront* wavefront(); + ComputeUnit* computeUnit(); + + protected: + ComputeUnit *cu; + Wavefront *wf; +}; + +#endif // __GPU_EXEC_CONTEXT_HH__ diff --git a/src/gpu-compute/gpu_static_inst.cc b/src/gpu-compute/gpu_static_inst.cc new file mode 100644 index 000000000..bcb8a5f3d --- /dev/null +++ b/src/gpu-compute/gpu_static_inst.cc @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Anthony Gutierrez + */ + +#include "gpu-compute/gpu_static_inst.hh" + +GPUStaticInst::GPUStaticInst(const std::string &opcode) + : o_type(Enums::OT_ALU), executed_as(Enums::SC_NONE), opcode(opcode), + _instNum(0), _scalarOp(false) +{ +} diff --git a/src/gpu-compute/gpu_static_inst.hh b/src/gpu-compute/gpu_static_inst.hh new file mode 100644 index 000000000..c1de28427 --- /dev/null +++ b/src/gpu-compute/gpu_static_inst.hh @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#ifndef __GPU_STATIC_INST_HH__ +#define __GPU_STATIC_INST_HH__ + +/* + * @file gpu_static_inst.hh + * + * Defines the base class representing static instructions for the GPU. The + * instructions are "static" because they contain no dynamic instruction + * information. GPUStaticInst corresponds to the StaticInst class for the CPU + * models. 
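+ *
+ * Concrete instruction classes implement the pure virtual operand-query
+ * interface declared below; memory instructions are also expected to
+ * override initiateAcc(), whose base implementation is a fatal error.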
+ */ + +#include <cstdint> +#include <string> + +#include "enums/OpType.hh" +#include "enums/StorageClassType.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/misc.hh" + +class BaseOperand; +class BaseRegOperand; +class Wavefront; + +class GPUStaticInst +{ + public: + GPUStaticInst(const std::string &opcode); + + void instNum(int num) { _instNum = num; } + + int instNum() { return _instNum; } + + void ipdInstNum(int num) { _ipdInstNum = num; } + + int ipdInstNum() const { return _ipdInstNum; } + + virtual void execute(GPUDynInstPtr gpuDynInst) = 0; + virtual void generateDisassembly() = 0; + virtual const std::string &disassemble() = 0; + virtual int getNumOperands() = 0; + virtual bool isCondRegister(int operandIndex) = 0; + virtual bool isScalarRegister(int operandIndex) = 0; + virtual bool isVectorRegister(int operandIndex) = 0; + virtual bool isSrcOperand(int operandIndex) = 0; + virtual bool isDstOperand(int operandIndex) = 0; + virtual int getOperandSize(int operandIndex) = 0; + virtual int getRegisterIndex(int operandIndex) = 0; + virtual int numDstRegOperands() = 0; + virtual int numSrcRegOperands() = 0; + + /* + * Most instructions (including all HSAIL instructions) + * are vector ops, so _scalarOp will be false by default. + * Derived instruction objects that are scalar ops must + * set _scalarOp to true in their constructors. + */ + bool scalarOp() const { return _scalarOp; } + + virtual bool isLocalMem() const + { + fatal("calling isLocalMem() on non-memory instruction.\n"); + + return false; + } + + bool isArgLoad() { return false; } + virtual uint32_t instSize() = 0; + + // only used for memory instructions + virtual void + initiateAcc(GPUDynInstPtr gpuDynInst) + { + fatal("calling initiateAcc() on a non-memory instruction.\n"); + } + + virtual uint32_t getTargetPc() { return 0; } + + /** + * Query whether the instruction is an unconditional jump i.e., the jump + * is always executed because there is no condition to be evaluated. + * + * If the instruction is not of branch type, the result is always false. + * + * @return True if the instruction is an unconditional jump. + */ + virtual bool unconditionalJumpInstruction() { return false; } + + static uint64_t dynamic_id_count; + + Enums::OpType o_type; + // For flat memory accesses + Enums::StorageClassType executed_as; + + protected: + virtual void + execLdAcq(GPUDynInstPtr gpuDynInst) + { + fatal("calling execLdAcq() on a non-load instruction.\n"); + } + + virtual void + execSt(GPUDynInstPtr gpuDynInst) + { + fatal("calling execLdAcq() on a non-load instruction.\n"); + } + + virtual void + execAtomic(GPUDynInstPtr gpuDynInst) + { + fatal("calling execAtomic() on a non-atomic instruction.\n"); + } + + virtual void + execAtomicAcq(GPUDynInstPtr gpuDynInst) + { + fatal("calling execAtomicAcq() on a non-atomic instruction.\n"); + } + + const std::string opcode; + std::string disassembly; + int _instNum; + /** + * Identifier of the immediate post-dominator instruction. + */ + int _ipdInstNum; + + bool _scalarOp; +}; + +#endif // __GPU_STATIC_INST_HH__ diff --git a/src/gpu-compute/gpu_tlb.cc b/src/gpu-compute/gpu_tlb.cc new file mode 100644 index 000000000..de005fd04 --- /dev/null +++ b/src/gpu-compute/gpu_tlb.cc @@ -0,0 +1,1801 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Lisa Hsu + */ + +#include "gpu-compute/gpu_tlb.hh" + +#include <cmath> +#include <cstring> + +#include "arch/x86/faults.hh" +#include "arch/x86/insts/microldstop.hh" +#include "arch/x86/pagetable.hh" +#include "arch/x86/pagetable_walker.hh" +#include "arch/x86/regs/misc.hh" +#include "arch/x86/x86_traits.hh" +#include "base/bitfield.hh" +#include "base/output.hh" +#include "base/trace.hh" +#include "cpu/base.hh" +#include "cpu/thread_context.hh" +#include "debug/GPUPrefetch.hh" +#include "debug/GPUTLB.hh" +#include "mem/packet_access.hh" +#include "mem/page_table.hh" +#include "mem/request.hh" +#include "sim/process.hh" + +namespace X86ISA +{ + + GpuTLB::GpuTLB(const Params *p) + : MemObject(p), configAddress(0), size(p->size), + cleanupEvent(this, false, Event::Maximum_Pri), exitEvent(this) + { + assoc = p->assoc; + assert(assoc <= size); + numSets = size/assoc; + allocationPolicy = p->allocationPolicy; + hasMemSidePort = false; + accessDistance = p->accessDistance; + clock = p->clk_domain->clockPeriod(); + + tlb = new GpuTlbEntry[size]; + std::memset(tlb, 0, sizeof(GpuTlbEntry) * size); + + freeList.resize(numSets); + entryList.resize(numSets); + + for (int set = 0; set < numSets; ++set) { + for (int way = 0; way < assoc; ++way) { + int x = set*assoc + way; + freeList[set].push_back(&tlb[x]); + } + } + + FA = (size == assoc); + + /** + * @warning: the set-associative version assumes you have a + * fixed page size of 4KB. 
+ * If the page size is greather than 4KB (as defined in the + * TheISA::PageBytes), then there are various issues w/ the current + * implementation (you'd have the same 8KB page being replicated in + * different sets etc) + */ + setMask = numSets - 1; + + #if 0 + // GpuTLB doesn't yet support full system + walker = p->walker; + walker->setTLB(this); + #endif + + maxCoalescedReqs = p->maxOutstandingReqs; + + // Do not allow maxCoalescedReqs to be more than the TLB associativity + if (maxCoalescedReqs > assoc) { + maxCoalescedReqs = assoc; + cprintf("Forcing maxCoalescedReqs to %d (TLB assoc.) \n", assoc); + } + + outstandingReqs = 0; + hitLatency = p->hitLatency; + missLatency1 = p->missLatency1; + missLatency2 = p->missLatency2; + + // create the slave ports based on the number of connected ports + for (size_t i = 0; i < p->port_slave_connection_count; ++i) { + cpuSidePort.push_back(new CpuSidePort(csprintf("%s-port%d", + name(), i), this, i)); + } + + // create the master ports based on the number of connected ports + for (size_t i = 0; i < p->port_master_connection_count; ++i) { + memSidePort.push_back(new MemSidePort(csprintf("%s-port%d", + name(), i), this, i)); + } + } + + // fixme: this is never called? + GpuTLB::~GpuTLB() + { + // make sure all the hash-maps are empty + assert(translationReturnEvent.empty()); + + // delete the TLB + delete[] tlb; + } + + BaseSlavePort& + GpuTLB::getSlavePort(const std::string &if_name, PortID idx) + { + if (if_name == "slave") { + if (idx >= static_cast<PortID>(cpuSidePort.size())) { + panic("TLBCoalescer::getSlavePort: unknown index %d\n", idx); + } + + return *cpuSidePort[idx]; + } else { + panic("TLBCoalescer::getSlavePort: unknown port %s\n", if_name); + } + } + + BaseMasterPort& + GpuTLB::getMasterPort(const std::string &if_name, PortID idx) + { + if (if_name == "master") { + if (idx >= static_cast<PortID>(memSidePort.size())) { + panic("TLBCoalescer::getMasterPort: unknown index %d\n", idx); + } + + hasMemSidePort = true; + + return *memSidePort[idx]; + } else { + panic("TLBCoalescer::getMasterPort: unknown port %s\n", if_name); + } + } + + GpuTlbEntry* + GpuTLB::insert(Addr vpn, GpuTlbEntry &entry) + { + GpuTlbEntry *newEntry = nullptr; + + /** + * vpn holds the virtual page address + * The least significant bits are simply masked + */ + int set = (vpn >> TheISA::PageShift) & setMask; + + if (!freeList[set].empty()) { + newEntry = freeList[set].front(); + freeList[set].pop_front(); + } else { + newEntry = entryList[set].back(); + entryList[set].pop_back(); + } + + *newEntry = entry; + newEntry->vaddr = vpn; + entryList[set].push_front(newEntry); + + return newEntry; + } + + GpuTLB::EntryList::iterator + GpuTLB::lookupIt(Addr va, bool update_lru) + { + int set = (va >> TheISA::PageShift) & setMask; + + if (FA) { + assert(!set); + } + + auto entry = entryList[set].begin(); + for (; entry != entryList[set].end(); ++entry) { + int page_size = (*entry)->size(); + + if ((*entry)->vaddr <= va && (*entry)->vaddr + page_size > va) { + DPRINTF(GPUTLB, "Matched vaddr %#x to entry starting at %#x " + "with size %#x.\n", va, (*entry)->vaddr, page_size); + + if (update_lru) { + entryList[set].push_front(*entry); + entryList[set].erase(entry); + entry = entryList[set].begin(); + } + + break; + } + } + + return entry; + } + + GpuTlbEntry* + GpuTLB::lookup(Addr va, bool update_lru) + { + int set = (va >> TheISA::PageShift) & setMask; + + auto entry = lookupIt(va, update_lru); + + if (entry == entryList[set].end()) + return nullptr; + else + return *entry; + 
} + + void + GpuTLB::invalidateAll() + { + DPRINTF(GPUTLB, "Invalidating all entries.\n"); + + for (int i = 0; i < numSets; ++i) { + while (!entryList[i].empty()) { + GpuTlbEntry *entry = entryList[i].front(); + entryList[i].pop_front(); + freeList[i].push_back(entry); + } + } + } + + void + GpuTLB::setConfigAddress(uint32_t addr) + { + configAddress = addr; + } + + void + GpuTLB::invalidateNonGlobal() + { + DPRINTF(GPUTLB, "Invalidating all non global entries.\n"); + + for (int i = 0; i < numSets; ++i) { + for (auto entryIt = entryList[i].begin(); + entryIt != entryList[i].end();) { + if (!(*entryIt)->global) { + freeList[i].push_back(*entryIt); + entryList[i].erase(entryIt++); + } else { + ++entryIt; + } + } + } + } + + void + GpuTLB::demapPage(Addr va, uint64_t asn) + { + + int set = (va >> TheISA::PageShift) & setMask; + auto entry = lookupIt(va, false); + + if (entry != entryList[set].end()) { + freeList[set].push_back(*entry); + entryList[set].erase(entry); + } + } + + Fault + GpuTLB::translateInt(RequestPtr req, ThreadContext *tc) + { + DPRINTF(GPUTLB, "Addresses references internal memory.\n"); + Addr vaddr = req->getVaddr(); + Addr prefix = (vaddr >> 3) & IntAddrPrefixMask; + + if (prefix == IntAddrPrefixCPUID) { + panic("CPUID memory space not yet implemented!\n"); + } else if (prefix == IntAddrPrefixMSR) { + vaddr = vaddr >> 3; + req->setFlags(Request::MMAPPED_IPR); + Addr regNum = 0; + + switch (vaddr & ~IntAddrPrefixMask) { + case 0x10: + regNum = MISCREG_TSC; + break; + case 0x1B: + regNum = MISCREG_APIC_BASE; + break; + case 0xFE: + regNum = MISCREG_MTRRCAP; + break; + case 0x174: + regNum = MISCREG_SYSENTER_CS; + break; + case 0x175: + regNum = MISCREG_SYSENTER_ESP; + break; + case 0x176: + regNum = MISCREG_SYSENTER_EIP; + break; + case 0x179: + regNum = MISCREG_MCG_CAP; + break; + case 0x17A: + regNum = MISCREG_MCG_STATUS; + break; + case 0x17B: + regNum = MISCREG_MCG_CTL; + break; + case 0x1D9: + regNum = MISCREG_DEBUG_CTL_MSR; + break; + case 0x1DB: + regNum = MISCREG_LAST_BRANCH_FROM_IP; + break; + case 0x1DC: + regNum = MISCREG_LAST_BRANCH_TO_IP; + break; + case 0x1DD: + regNum = MISCREG_LAST_EXCEPTION_FROM_IP; + break; + case 0x1DE: + regNum = MISCREG_LAST_EXCEPTION_TO_IP; + break; + case 0x200: + regNum = MISCREG_MTRR_PHYS_BASE_0; + break; + case 0x201: + regNum = MISCREG_MTRR_PHYS_MASK_0; + break; + case 0x202: + regNum = MISCREG_MTRR_PHYS_BASE_1; + break; + case 0x203: + regNum = MISCREG_MTRR_PHYS_MASK_1; + break; + case 0x204: + regNum = MISCREG_MTRR_PHYS_BASE_2; + break; + case 0x205: + regNum = MISCREG_MTRR_PHYS_MASK_2; + break; + case 0x206: + regNum = MISCREG_MTRR_PHYS_BASE_3; + break; + case 0x207: + regNum = MISCREG_MTRR_PHYS_MASK_3; + break; + case 0x208: + regNum = MISCREG_MTRR_PHYS_BASE_4; + break; + case 0x209: + regNum = MISCREG_MTRR_PHYS_MASK_4; + break; + case 0x20A: + regNum = MISCREG_MTRR_PHYS_BASE_5; + break; + case 0x20B: + regNum = MISCREG_MTRR_PHYS_MASK_5; + break; + case 0x20C: + regNum = MISCREG_MTRR_PHYS_BASE_6; + break; + case 0x20D: + regNum = MISCREG_MTRR_PHYS_MASK_6; + break; + case 0x20E: + regNum = MISCREG_MTRR_PHYS_BASE_7; + break; + case 0x20F: + regNum = MISCREG_MTRR_PHYS_MASK_7; + break; + case 0x250: + regNum = MISCREG_MTRR_FIX_64K_00000; + break; + case 0x258: + regNum = MISCREG_MTRR_FIX_16K_80000; + break; + case 0x259: + regNum = MISCREG_MTRR_FIX_16K_A0000; + break; + case 0x268: + regNum = MISCREG_MTRR_FIX_4K_C0000; + break; + case 0x269: + regNum = MISCREG_MTRR_FIX_4K_C8000; + break; + case 0x26A: + regNum = 
MISCREG_MTRR_FIX_4K_D0000; + break; + case 0x26B: + regNum = MISCREG_MTRR_FIX_4K_D8000; + break; + case 0x26C: + regNum = MISCREG_MTRR_FIX_4K_E0000; + break; + case 0x26D: + regNum = MISCREG_MTRR_FIX_4K_E8000; + break; + case 0x26E: + regNum = MISCREG_MTRR_FIX_4K_F0000; + break; + case 0x26F: + regNum = MISCREG_MTRR_FIX_4K_F8000; + break; + case 0x277: + regNum = MISCREG_PAT; + break; + case 0x2FF: + regNum = MISCREG_DEF_TYPE; + break; + case 0x400: + regNum = MISCREG_MC0_CTL; + break; + case 0x404: + regNum = MISCREG_MC1_CTL; + break; + case 0x408: + regNum = MISCREG_MC2_CTL; + break; + case 0x40C: + regNum = MISCREG_MC3_CTL; + break; + case 0x410: + regNum = MISCREG_MC4_CTL; + break; + case 0x414: + regNum = MISCREG_MC5_CTL; + break; + case 0x418: + regNum = MISCREG_MC6_CTL; + break; + case 0x41C: + regNum = MISCREG_MC7_CTL; + break; + case 0x401: + regNum = MISCREG_MC0_STATUS; + break; + case 0x405: + regNum = MISCREG_MC1_STATUS; + break; + case 0x409: + regNum = MISCREG_MC2_STATUS; + break; + case 0x40D: + regNum = MISCREG_MC3_STATUS; + break; + case 0x411: + regNum = MISCREG_MC4_STATUS; + break; + case 0x415: + regNum = MISCREG_MC5_STATUS; + break; + case 0x419: + regNum = MISCREG_MC6_STATUS; + break; + case 0x41D: + regNum = MISCREG_MC7_STATUS; + break; + case 0x402: + regNum = MISCREG_MC0_ADDR; + break; + case 0x406: + regNum = MISCREG_MC1_ADDR; + break; + case 0x40A: + regNum = MISCREG_MC2_ADDR; + break; + case 0x40E: + regNum = MISCREG_MC3_ADDR; + break; + case 0x412: + regNum = MISCREG_MC4_ADDR; + break; + case 0x416: + regNum = MISCREG_MC5_ADDR; + break; + case 0x41A: + regNum = MISCREG_MC6_ADDR; + break; + case 0x41E: + regNum = MISCREG_MC7_ADDR; + break; + case 0x403: + regNum = MISCREG_MC0_MISC; + break; + case 0x407: + regNum = MISCREG_MC1_MISC; + break; + case 0x40B: + regNum = MISCREG_MC2_MISC; + break; + case 0x40F: + regNum = MISCREG_MC3_MISC; + break; + case 0x413: + regNum = MISCREG_MC4_MISC; + break; + case 0x417: + regNum = MISCREG_MC5_MISC; + break; + case 0x41B: + regNum = MISCREG_MC6_MISC; + break; + case 0x41F: + regNum = MISCREG_MC7_MISC; + break; + case 0xC0000080: + regNum = MISCREG_EFER; + break; + case 0xC0000081: + regNum = MISCREG_STAR; + break; + case 0xC0000082: + regNum = MISCREG_LSTAR; + break; + case 0xC0000083: + regNum = MISCREG_CSTAR; + break; + case 0xC0000084: + regNum = MISCREG_SF_MASK; + break; + case 0xC0000100: + regNum = MISCREG_FS_BASE; + break; + case 0xC0000101: + regNum = MISCREG_GS_BASE; + break; + case 0xC0000102: + regNum = MISCREG_KERNEL_GS_BASE; + break; + case 0xC0000103: + regNum = MISCREG_TSC_AUX; + break; + case 0xC0010000: + regNum = MISCREG_PERF_EVT_SEL0; + break; + case 0xC0010001: + regNum = MISCREG_PERF_EVT_SEL1; + break; + case 0xC0010002: + regNum = MISCREG_PERF_EVT_SEL2; + break; + case 0xC0010003: + regNum = MISCREG_PERF_EVT_SEL3; + break; + case 0xC0010004: + regNum = MISCREG_PERF_EVT_CTR0; + break; + case 0xC0010005: + regNum = MISCREG_PERF_EVT_CTR1; + break; + case 0xC0010006: + regNum = MISCREG_PERF_EVT_CTR2; + break; + case 0xC0010007: + regNum = MISCREG_PERF_EVT_CTR3; + break; + case 0xC0010010: + regNum = MISCREG_SYSCFG; + break; + case 0xC0010016: + regNum = MISCREG_IORR_BASE0; + break; + case 0xC0010017: + regNum = MISCREG_IORR_BASE1; + break; + case 0xC0010018: + regNum = MISCREG_IORR_MASK0; + break; + case 0xC0010019: + regNum = MISCREG_IORR_MASK1; + break; + case 0xC001001A: + regNum = MISCREG_TOP_MEM; + break; + case 0xC001001D: + regNum = MISCREG_TOP_MEM2; + break; + case 0xC0010114: + regNum = 
MISCREG_VM_CR; + break; + case 0xC0010115: + regNum = MISCREG_IGNNE; + break; + case 0xC0010116: + regNum = MISCREG_SMM_CTL; + break; + case 0xC0010117: + regNum = MISCREG_VM_HSAVE_PA; + break; + default: + return std::make_shared<GeneralProtection>(0); + } + //The index is multiplied by the size of a MiscReg so that + //any memory dependence calculations will not see these as + //overlapping. + req->setPaddr(regNum * sizeof(MiscReg)); + return NoFault; + } else if (prefix == IntAddrPrefixIO) { + // TODO If CPL > IOPL or in virtual mode, check the I/O permission + // bitmap in the TSS. + + Addr IOPort = vaddr & ~IntAddrPrefixMask; + // Make sure the address fits in the expected 16 bit IO address + // space. + assert(!(IOPort & ~0xFFFF)); + + if (IOPort == 0xCF8 && req->getSize() == 4) { + req->setFlags(Request::MMAPPED_IPR); + req->setPaddr(MISCREG_PCI_CONFIG_ADDRESS * sizeof(MiscReg)); + } else if ((IOPort & ~mask(2)) == 0xCFC) { + req->setFlags(Request::UNCACHEABLE); + + Addr configAddress = + tc->readMiscRegNoEffect(MISCREG_PCI_CONFIG_ADDRESS); + + if (bits(configAddress, 31, 31)) { + req->setPaddr(PhysAddrPrefixPciConfig | + mbits(configAddress, 30, 2) | + (IOPort & mask(2))); + } else { + req->setPaddr(PhysAddrPrefixIO | IOPort); + } + } else { + req->setFlags(Request::UNCACHEABLE); + req->setPaddr(PhysAddrPrefixIO | IOPort); + } + return NoFault; + } else { + panic("Access to unrecognized internal address space %#x.\n", + prefix); + } + } + + /** + * TLB_lookup will only perform a TLB lookup returning true on a TLB hit + * and false on a TLB miss. + * Many of the checks about different modes have been converted to + * assertions, since these parts of the code are not really used. + * On a hit it will update the LRU stack. + */ + bool + GpuTLB::tlbLookup(RequestPtr req, ThreadContext *tc, bool update_stats) + { + bool tlb_hit = false; + #ifndef NDEBUG + uint32_t flags = req->getFlags(); + int seg = flags & SegmentFlagMask; + #endif + + assert(seg != SEGMENT_REG_MS); + Addr vaddr = req->getVaddr(); + DPRINTF(GPUTLB, "TLB Lookup for vaddr %#x.\n", vaddr); + HandyM5Reg m5Reg = tc->readMiscRegNoEffect(MISCREG_M5_REG); + + if (m5Reg.prot) { + DPRINTF(GPUTLB, "In protected mode.\n"); + // make sure we are in 64-bit mode + assert(m5Reg.mode == LongMode); + + // If paging is enabled, do the translation. + if (m5Reg.paging) { + DPRINTF(GPUTLB, "Paging enabled.\n"); + //update LRU stack on a hit + GpuTlbEntry *entry = lookup(vaddr, true); + + if (entry) + tlb_hit = true; + + if (!update_stats) { + // functional tlb access for memory initialization + // i.e., memory seeding or instr. seeding -> don't update + // TLB and stats + return tlb_hit; + } + + localNumTLBAccesses++; + + if (!entry) { + localNumTLBMisses++; + } else { + localNumTLBHits++; + } + } + } + + return tlb_hit; + } + + Fault + GpuTLB::translate(RequestPtr req, ThreadContext *tc, + Translation *translation, Mode mode, + bool &delayedResponse, bool timing, int &latency) + { + uint32_t flags = req->getFlags(); + int seg = flags & SegmentFlagMask; + bool storeCheck = flags & (StoreCheck << FlagShift); + + // If this is true, we're dealing with a request + // to a non-memory address space. + if (seg == SEGMENT_REG_MS) { + return translateInt(req, tc); + } + + delayedResponse = false; + Addr vaddr = req->getVaddr(); + DPRINTF(GPUTLB, "Translating vaddr %#x.\n", vaddr); + + HandyM5Reg m5Reg = tc->readMiscRegNoEffect(MISCREG_M5_REG); + + // If protected mode has been enabled... 
+ if (m5Reg.prot) { + DPRINTF(GPUTLB, "In protected mode.\n"); + // If we're not in 64-bit mode, do protection/limit checks + if (m5Reg.mode != LongMode) { + DPRINTF(GPUTLB, "Not in long mode. Checking segment " + "protection.\n"); + + // Check for a null segment selector. + if (!(seg == SEGMENT_REG_TSG || seg == SYS_SEGMENT_REG_IDTR || + seg == SEGMENT_REG_HS || seg == SEGMENT_REG_LS) + && !tc->readMiscRegNoEffect(MISCREG_SEG_SEL(seg))) { + return std::make_shared<GeneralProtection>(0); + } + + bool expandDown = false; + SegAttr attr = tc->readMiscRegNoEffect(MISCREG_SEG_ATTR(seg)); + + if (seg >= SEGMENT_REG_ES && seg <= SEGMENT_REG_HS) { + if (!attr.writable && (mode == BaseTLB::Write || + storeCheck)) + return std::make_shared<GeneralProtection>(0); + + if (!attr.readable && mode == BaseTLB::Read) + return std::make_shared<GeneralProtection>(0); + + expandDown = attr.expandDown; + + } + + Addr base = tc->readMiscRegNoEffect(MISCREG_SEG_BASE(seg)); + Addr limit = tc->readMiscRegNoEffect(MISCREG_SEG_LIMIT(seg)); + // This assumes we're not in 64 bit mode. If we were, the + // default address size is 64 bits, overridable to 32. + int size = 32; + bool sizeOverride = (flags & (AddrSizeFlagBit << FlagShift)); + SegAttr csAttr = tc->readMiscRegNoEffect(MISCREG_CS_ATTR); + + if ((csAttr.defaultSize && sizeOverride) || + (!csAttr.defaultSize && !sizeOverride)) { + size = 16; + } + + Addr offset = bits(vaddr - base, size - 1, 0); + Addr endOffset = offset + req->getSize() - 1; + + if (expandDown) { + DPRINTF(GPUTLB, "Checking an expand down segment.\n"); + warn_once("Expand down segments are untested.\n"); + + if (offset <= limit || endOffset <= limit) + return std::make_shared<GeneralProtection>(0); + } else { + if (offset > limit || endOffset > limit) + return std::make_shared<GeneralProtection>(0); + } + } + + // If paging is enabled, do the translation. + if (m5Reg.paging) { + DPRINTF(GPUTLB, "Paging enabled.\n"); + // The vaddr already has the segment base applied. + GpuTlbEntry *entry = lookup(vaddr); + localNumTLBAccesses++; + + if (!entry) { + localNumTLBMisses++; + if (timing) { + latency = missLatency1; + } + + if (FullSystem) { + fatal("GpuTLB doesn't support full-system mode\n"); + } else { + DPRINTF(GPUTLB, "Handling a TLB miss for address %#x " + "at pc %#x.\n", vaddr, tc->instAddr()); + + Process *p = tc->getProcessPtr(); + GpuTlbEntry newEntry; + bool success = p->pTable->lookup(vaddr, newEntry); + + if (!success && mode != BaseTLB::Execute) { + // penalize a "page fault" more + if (timing) { + latency += missLatency2; + } + + if (p->fixupStackFault(vaddr)) + success = p->pTable->lookup(vaddr, newEntry); + } + + if (!success) { + return std::make_shared<PageFault>(vaddr, true, + mode, true, + false); + } else { + newEntry.valid = success; + Addr alignedVaddr = p->pTable->pageAlign(vaddr); + + DPRINTF(GPUTLB, "Mapping %#x to %#x\n", + alignedVaddr, newEntry.pageStart()); + + entry = insert(alignedVaddr, newEntry); + } + + DPRINTF(GPUTLB, "Miss was serviced.\n"); + } + } else { + localNumTLBHits++; + + if (timing) { + latency = hitLatency; + } + } + + // Do paging protection checks. + bool inUser = (m5Reg.cpl == 3 && + !(flags & (CPL0FlagBit << FlagShift))); + + CR0 cr0 = tc->readMiscRegNoEffect(MISCREG_CR0); + bool badWrite = (!entry->writable && (inUser || cr0.wp)); + + if ((inUser && !entry->user) || (mode == BaseTLB::Write && + badWrite)) { + // The page must have been present to get into the TLB in + // the first place. 
We'll assume the reserved bits are + // fine even though we're not checking them. + return std::make_shared<PageFault>(vaddr, true, mode, + inUser, false); + } + + if (storeCheck && badWrite) { + // This would fault if this were a write, so return a page + // fault that reflects that happening. + return std::make_shared<PageFault>(vaddr, true, + BaseTLB::Write, + inUser, false); + } + + + DPRINTF(GPUTLB, "Entry found with paddr %#x, doing protection " + "checks.\n", entry->paddr); + + int page_size = entry->size(); + Addr paddr = entry->paddr | (vaddr & (page_size - 1)); + DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, paddr); + req->setPaddr(paddr); + + if (entry->uncacheable) + req->setFlags(Request::UNCACHEABLE); + } else { + //Use the address which already has segmentation applied. + DPRINTF(GPUTLB, "Paging disabled.\n"); + DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, vaddr); + req->setPaddr(vaddr); + } + } else { + // Real mode + DPRINTF(GPUTLB, "In real mode.\n"); + DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, vaddr); + req->setPaddr(vaddr); + } + + // Check for an access to the local APIC + if (FullSystem) { + LocalApicBase localApicBase = + tc->readMiscRegNoEffect(MISCREG_APIC_BASE); + + Addr baseAddr = localApicBase.base * PageBytes; + Addr paddr = req->getPaddr(); + + if (baseAddr <= paddr && baseAddr + PageBytes > paddr) { + // Force the access to be uncacheable. + req->setFlags(Request::UNCACHEABLE); + req->setPaddr(x86LocalAPICAddress(tc->contextId(), + paddr - baseAddr)); + } + } + + return NoFault; + }; + + Fault + GpuTLB::translateAtomic(RequestPtr req, ThreadContext *tc, Mode mode, + int &latency) + { + bool delayedResponse; + + return GpuTLB::translate(req, tc, nullptr, mode, delayedResponse, false, + latency); + } + + void + GpuTLB::translateTiming(RequestPtr req, ThreadContext *tc, + Translation *translation, Mode mode, int &latency) + { + bool delayedResponse; + assert(translation); + + Fault fault = GpuTLB::translate(req, tc, translation, mode, + delayedResponse, true, latency); + + if (!delayedResponse) + translation->finish(fault, req, tc, mode); + } + + Walker* + GpuTLB::getWalker() + { + return walker; + } + + + void + GpuTLB::serialize(CheckpointOut &cp) const + { + } + + void + GpuTLB::unserialize(CheckpointIn &cp) + { + } + + void + GpuTLB::regStats() + { + localNumTLBAccesses + .name(name() + ".local_TLB_accesses") + .desc("Number of TLB accesses") + ; + + localNumTLBHits + .name(name() + ".local_TLB_hits") + .desc("Number of TLB hits") + ; + + localNumTLBMisses + .name(name() + ".local_TLB_misses") + .desc("Number of TLB misses") + ; + + localTLBMissRate + .name(name() + ".local_TLB_miss_rate") + .desc("TLB miss rate") + ; + + accessCycles + .name(name() + ".access_cycles") + .desc("Cycles spent accessing this TLB level") + ; + + pageTableCycles + .name(name() + ".page_table_cycles") + .desc("Cycles spent accessing the page table") + ; + + localTLBMissRate = 100 * localNumTLBMisses / localNumTLBAccesses; + + numUniquePages + .name(name() + ".unique_pages") + .desc("Number of unique pages touched") + ; + + localCycles + .name(name() + ".local_cycles") + .desc("Number of cycles spent in queue for all incoming reqs") + ; + + localLatency + .name(name() + ".local_latency") + .desc("Avg. 
latency over incoming coalesced reqs") + ; + + localLatency = localCycles / localNumTLBAccesses; + + globalNumTLBAccesses + .name(name() + ".global_TLB_accesses") + .desc("Number of TLB accesses") + ; + + globalNumTLBHits + .name(name() + ".global_TLB_hits") + .desc("Number of TLB hits") + ; + + globalNumTLBMisses + .name(name() + ".global_TLB_misses") + .desc("Number of TLB misses") + ; + + globalTLBMissRate + .name(name() + ".global_TLB_miss_rate") + .desc("TLB miss rate") + ; + + globalTLBMissRate = 100 * globalNumTLBMisses / globalNumTLBAccesses; + + avgReuseDistance + .name(name() + ".avg_reuse_distance") + .desc("avg. reuse distance over all pages (in ticks)") + ; + + } + + /** + * Do the TLB lookup for this coalesced request and schedule + * another event <TLB access latency> cycles later. + */ + + void + GpuTLB::issueTLBLookup(PacketPtr pkt) + { + assert(pkt); + assert(pkt->senderState); + + Addr virt_page_addr = roundDown(pkt->req->getVaddr(), + TheISA::PageBytes); + + TranslationState *sender_state = + safe_cast<TranslationState*>(pkt->senderState); + + bool update_stats = !sender_state->prefetch; + ThreadContext * tmp_tc = sender_state->tc; + + DPRINTF(GPUTLB, "Translation req. for virt. page addr %#x\n", + virt_page_addr); + + int req_cnt = sender_state->reqCnt.back(); + + if (update_stats) { + accessCycles -= (curTick() * req_cnt); + localCycles -= curTick(); + updatePageFootprint(virt_page_addr); + globalNumTLBAccesses += req_cnt; + } + + tlbOutcome lookup_outcome = TLB_MISS; + RequestPtr tmp_req = pkt->req; + + // Access the TLB and figure out if it's a hit or a miss. + bool success = tlbLookup(tmp_req, tmp_tc, update_stats); + + if (success) { + lookup_outcome = TLB_HIT; + // Put the entry in SenderState + GpuTlbEntry *entry = lookup(tmp_req->getVaddr(), false); + assert(entry); + + sender_state->tlbEntry = + new GpuTlbEntry(0, entry->vaddr, entry->paddr, entry->valid); + + if (update_stats) { + // the reqCnt has an entry per level, so its size tells us + // which level we are in + sender_state->hitLevel = sender_state->reqCnt.size(); + globalNumTLBHits += req_cnt; + } + } else { + if (update_stats) + globalNumTLBMisses += req_cnt; + } + + /* + * We now know the TLB lookup outcome (if it's a hit or a miss), as well + * as the TLB access latency. + * + * We create and schedule a new TLBEvent which will help us take the + * appropriate actions (e.g., update TLB on a hit, send request to lower + * level TLB on a miss, or start a page walk if this was the last-level + * TLB) + */ + TLBEvent *tlb_event = + new TLBEvent(this, virt_page_addr, lookup_outcome, pkt); + + if (translationReturnEvent.count(virt_page_addr)) { + panic("Virtual Page Address %#x already has a return event\n", + virt_page_addr); + } + + translationReturnEvent[virt_page_addr] = tlb_event; + assert(tlb_event); + + DPRINTF(GPUTLB, "schedule translationReturnEvent @ curTick %d\n", + curTick() + this->ticks(hitLatency)); + + schedule(tlb_event, curTick() + this->ticks(hitLatency)); + } + + GpuTLB::TLBEvent::TLBEvent(GpuTLB* _tlb, Addr _addr, tlbOutcome tlb_outcome, + PacketPtr _pkt) + : Event(CPU_Tick_Pri), tlb(_tlb), virtPageAddr(_addr), + outcome(tlb_outcome), pkt(_pkt) + { + } + + /** + * Do Paging protection checks. If we encounter a page fault, then + * an assertion is fired. 
+ */ + void + GpuTLB::pagingProtectionChecks(ThreadContext *tc, PacketPtr pkt, + GpuTlbEntry * tlb_entry, Mode mode) + { + HandyM5Reg m5Reg = tc->readMiscRegNoEffect(MISCREG_M5_REG); + uint32_t flags = pkt->req->getFlags(); + bool storeCheck = flags & (StoreCheck << FlagShift); + + // Do paging protection checks. + bool inUser = (m5Reg.cpl == 3 && !(flags & (CPL0FlagBit << FlagShift))); + CR0 cr0 = tc->readMiscRegNoEffect(MISCREG_CR0); + + bool badWrite = (!tlb_entry->writable && (inUser || cr0.wp)); + + if ((inUser && !tlb_entry->user) || + (mode == BaseTLB::Write && badWrite)) { + // The page must have been present to get into the TLB in + // the first place. We'll assume the reserved bits are + // fine even though we're not checking them. + assert(false); + } + + if (storeCheck && badWrite) { + // This would fault if this were a write, so return a page + // fault that reflects that happening. + assert(false); + } + } + + /** + * handleTranslationReturn is called on a TLB hit, + * when a TLB miss returns or when a page fault returns. + * The latter calls handelHit with TLB miss as tlbOutcome. + */ + void + GpuTLB::handleTranslationReturn(Addr virt_page_addr, tlbOutcome tlb_outcome, + PacketPtr pkt) + { + + assert(pkt); + Addr vaddr = pkt->req->getVaddr(); + + TranslationState *sender_state = + safe_cast<TranslationState*>(pkt->senderState); + + ThreadContext *tc = sender_state->tc; + Mode mode = sender_state->tlbMode; + + GpuTlbEntry *local_entry, *new_entry; + + if (tlb_outcome == TLB_HIT) { + DPRINTF(GPUTLB, "Translation Done - TLB Hit for addr %#x\n", vaddr); + local_entry = sender_state->tlbEntry; + } else { + DPRINTF(GPUTLB, "Translation Done - TLB Miss for addr %#x\n", + vaddr); + + // We are returning either from a page walk or from a hit at a lower + // TLB level. The senderState should be "carrying" a pointer to the + // correct TLBEntry. + new_entry = sender_state->tlbEntry; + assert(new_entry); + local_entry = new_entry; + + if (allocationPolicy) { + DPRINTF(GPUTLB, "allocating entry w/ addr %#x\n", + virt_page_addr); + + local_entry = insert(virt_page_addr, *new_entry); + } + + assert(local_entry); + } + + /** + * At this point the packet carries an up-to-date tlbEntry pointer + * in its senderState. + * Next step is to do the paging protection checks. + */ + DPRINTF(GPUTLB, "Entry found with vaddr %#x, doing protection checks " + "while paddr was %#x.\n", local_entry->vaddr, + local_entry->paddr); + + pagingProtectionChecks(tc, pkt, local_entry, mode); + int page_size = local_entry->size(); + Addr paddr = local_entry->paddr | (vaddr & (page_size - 1)); + DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, paddr); + + // Since this packet will be sent through the cpu side slave port, + // it must be converted to a response pkt if it is not one already + if (pkt->isRequest()) { + pkt->makeTimingResponse(); + } + + pkt->req->setPaddr(paddr); + + if (local_entry->uncacheable) { + pkt->req->setFlags(Request::UNCACHEABLE); + } + + //send packet back to coalescer + cpuSidePort[0]->sendTimingResp(pkt); + //schedule cleanup event + cleanupQueue.push(virt_page_addr); + + // schedule this only once per cycle. 
+ // The check is required because we might have multiple translations + // returning the same cycle + // this is a maximum priority event and must be on the same cycle + // as the cleanup event in TLBCoalescer to avoid a race with + // IssueProbeEvent caused by TLBCoalescer::MemSidePort::recvReqRetry + if (!cleanupEvent.scheduled()) + schedule(cleanupEvent, curTick()); + } + + /** + * Here we take the appropriate actions based on the result of the + * TLB lookup. + */ + void + GpuTLB::translationReturn(Addr virtPageAddr, tlbOutcome outcome, + PacketPtr pkt) + { + DPRINTF(GPUTLB, "Triggered TLBEvent for addr %#x\n", virtPageAddr); + + assert(translationReturnEvent[virtPageAddr]); + assert(pkt); + + TranslationState *tmp_sender_state = + safe_cast<TranslationState*>(pkt->senderState); + + int req_cnt = tmp_sender_state->reqCnt.back(); + bool update_stats = !tmp_sender_state->prefetch; + + + if (outcome == TLB_HIT) { + handleTranslationReturn(virtPageAddr, TLB_HIT, pkt); + + if (update_stats) { + accessCycles += (req_cnt * curTick()); + localCycles += curTick(); + } + + } else if (outcome == TLB_MISS) { + + DPRINTF(GPUTLB, "This is a TLB miss\n"); + if (update_stats) { + accessCycles += (req_cnt*curTick()); + localCycles += curTick(); + } + + if (hasMemSidePort) { + // the one cyle added here represent the delay from when we get + // the reply back till when we propagate it to the coalescer + // above. + if (update_stats) { + accessCycles += (req_cnt * 1); + localCycles += 1; + } + + /** + * There is a TLB below. Send the coalesced request. + * We actually send the very first packet of all the + * pending packets for this virtual page address. + */ + if (!memSidePort[0]->sendTimingReq(pkt)) { + DPRINTF(GPUTLB, "Failed sending translation request to " + "lower level TLB for addr %#x\n", virtPageAddr); + + memSidePort[0]->retries.push_back(pkt); + } else { + DPRINTF(GPUTLB, "Sent translation request to lower level " + "TLB for addr %#x\n", virtPageAddr); + } + } else { + //this is the last level TLB. Start a page walk + DPRINTF(GPUTLB, "Last level TLB - start a page walk for " + "addr %#x\n", virtPageAddr); + + if (update_stats) + pageTableCycles -= (req_cnt*curTick()); + + TLBEvent *tlb_event = translationReturnEvent[virtPageAddr]; + assert(tlb_event); + tlb_event->updateOutcome(PAGE_WALK); + schedule(tlb_event, curTick() + ticks(missLatency2)); + } + } else if (outcome == PAGE_WALK) { + if (update_stats) + pageTableCycles += (req_cnt*curTick()); + + // Need to access the page table and update the TLB + DPRINTF(GPUTLB, "Doing a page walk for address %#x\n", + virtPageAddr); + + TranslationState *sender_state = + safe_cast<TranslationState*>(pkt->senderState); + + Process *p = sender_state->tc->getProcessPtr(); + TlbEntry newEntry; + Addr vaddr = pkt->req->getVaddr(); + #ifndef NDEBUG + Addr alignedVaddr = p->pTable->pageAlign(vaddr); + assert(alignedVaddr == virtPageAddr); + #endif + bool success; + success = p->pTable->lookup(vaddr, newEntry); + if (!success && sender_state->tlbMode != BaseTLB::Execute) { + if (p->fixupStackFault(vaddr)) { + success = p->pTable->lookup(vaddr, newEntry); + } + } + + DPRINTF(GPUTLB, "Mapping %#x to %#x\n", alignedVaddr, + newEntry.pageStart()); + + sender_state->tlbEntry = + new GpuTlbEntry(0, newEntry.vaddr, newEntry.paddr, success); + + handleTranslationReturn(virtPageAddr, TLB_MISS, pkt); + } else if (outcome == MISS_RETURN) { + /** we add an extra cycle in the return path of the translation + * requests in between the various TLB levels. 
+ */ + handleTranslationReturn(virtPageAddr, TLB_MISS, pkt); + } else { + assert(false); + } + } + + void + GpuTLB::TLBEvent::process() + { + tlb->translationReturn(virtPageAddr, outcome, pkt); + } + + const char* + GpuTLB::TLBEvent::description() const + { + return "trigger translationDoneEvent"; + } + + void + GpuTLB::TLBEvent::updateOutcome(tlbOutcome _outcome) + { + outcome = _outcome; + } + + Addr + GpuTLB::TLBEvent::getTLBEventVaddr() + { + return virtPageAddr; + } + + /* + * recvTiming receives a coalesced timing request from a TLBCoalescer + * and it calls issueTLBLookup() + * It only rejects the packet if we have exceeded the max + * outstanding number of requests for the TLB + */ + bool + GpuTLB::CpuSidePort::recvTimingReq(PacketPtr pkt) + { + if (tlb->outstandingReqs < tlb->maxCoalescedReqs) { + tlb->issueTLBLookup(pkt); + // update number of outstanding translation requests + tlb->outstandingReqs++; + return true; + } else { + DPRINTF(GPUTLB, "Reached maxCoalescedReqs number %d\n", + tlb->outstandingReqs); + return false; + } + } + + /** + * handleFuncTranslationReturn is called on a TLB hit, + * when a TLB miss returns or when a page fault returns. + * It updates LRU, inserts the TLB entry on a miss + * depending on the allocation policy and does the required + * protection checks. It does NOT create a new packet to + * update the packet's addr; this is done in hsail-gpu code. + */ + void + GpuTLB::handleFuncTranslationReturn(PacketPtr pkt, tlbOutcome tlb_outcome) + { + TranslationState *sender_state = + safe_cast<TranslationState*>(pkt->senderState); + + ThreadContext *tc = sender_state->tc; + Mode mode = sender_state->tlbMode; + Addr vaddr = pkt->req->getVaddr(); + + GpuTlbEntry *local_entry, *new_entry; + + if (tlb_outcome == TLB_HIT) { + DPRINTF(GPUTLB, "Functional Translation Done - TLB hit for addr " + "%#x\n", vaddr); + + local_entry = sender_state->tlbEntry; + } else { + DPRINTF(GPUTLB, "Functional Translation Done - TLB miss for addr " + "%#x\n", vaddr); + + // We are returning either from a page walk or from a hit at a lower + // TLB level. The senderState should be "carrying" a pointer to the + // correct TLBEntry. + new_entry = sender_state->tlbEntry; + assert(new_entry); + local_entry = new_entry; + + if (allocationPolicy) { + Addr virt_page_addr = roundDown(vaddr, TheISA::PageBytes); + + DPRINTF(GPUTLB, "allocating entry w/ addr %#x\n", + virt_page_addr); + + local_entry = insert(virt_page_addr, *new_entry); + } + + assert(local_entry); + } + + DPRINTF(GPUTLB, "Entry found with vaddr %#x, doing protection checks " + "while paddr was %#x.\n", local_entry->vaddr, + local_entry->paddr); + + // Do paging checks if it's a normal functional access. If it's for a + // prefetch, then sometimes you can try to prefetch something that won't + // pass protection. We don't actually want to fault becuase there is no + // demand access to deem this a violation. Just put it in the TLB and + // it will fault if indeed a future demand access touches it in + // violation. + if (!sender_state->prefetch && sender_state->tlbEntry->valid) + pagingProtectionChecks(tc, pkt, local_entry, mode); + + int page_size = local_entry->size(); + Addr paddr = local_entry->paddr | (vaddr & (page_size - 1)); + DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, paddr); + + pkt->req->setPaddr(paddr); + + if (local_entry->uncacheable) + pkt->req->setFlags(Request::UNCACHEABLE); + } + + // This is used for atomic translations. Need to + // make it all happen during the same cycle. 
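[Editor's aside, not part of the patch] translationReturn() above is effectively a small state machine over tlbOutcome: a hit replies immediately, a miss is either forwarded to the TLB level below (when a memory-side port exists) or converted into a PAGE_WALK on the last level, and MISS_RETURN models the extra cycle on the way back up the hierarchy. The sketch below restates that dispatch with hypothetical names (Outcome, SimpleTlbLevel); it illustrates the control flow only and is not the GpuTLB code itself.

// Control-flow sketch only; names are hypothetical.
#include <cstdio>

enum class Outcome { Hit, Miss, PageWalk, MissReturn };

struct SimpleTlbLevel
{
    bool hasLowerLevel;     // analogous to hasMemSidePort

    void onTranslationReturn(Outcome outcome)
    {
        switch (outcome) {
          case Outcome::Hit:
            std::printf("hit: finish translation, reply upward\n");
            break;
          case Outcome::Miss:
            if (hasLowerLevel)
                std::printf("miss: forward request to the next TLB level\n");
            else
                std::printf("miss: last level, schedule a page walk\n");
            break;
          case Outcome::PageWalk:
            std::printf("walk the page table, then handle as a miss return\n");
            break;
          case Outcome::MissReturn:
            std::printf("reply from below: fill this level, respond upward\n");
            break;
        }
    }
};

int main()
{
    SimpleTlbLevel l1{true};
    l1.onTranslationReturn(Outcome::Miss);      // forwards downward
    SimpleTlbLevel lastLevel{false};
    lastLevel.onTranslationReturn(Outcome::Miss);   // schedules the page walk
}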
+ void + GpuTLB::CpuSidePort::recvFunctional(PacketPtr pkt) + { + TranslationState *sender_state = + safe_cast<TranslationState*>(pkt->senderState); + + ThreadContext *tc = sender_state->tc; + bool update_stats = !sender_state->prefetch; + + Addr virt_page_addr = roundDown(pkt->req->getVaddr(), + TheISA::PageBytes); + + if (update_stats) + tlb->updatePageFootprint(virt_page_addr); + + // do the TLB lookup without updating the stats + bool success = tlb->tlbLookup(pkt->req, tc, update_stats); + tlbOutcome tlb_outcome = success ? TLB_HIT : TLB_MISS; + + // functional mode means no coalescing + // global metrics are the same as the local metrics + if (update_stats) { + tlb->globalNumTLBAccesses++; + + if (success) { + sender_state->hitLevel = sender_state->reqCnt.size(); + tlb->globalNumTLBHits++; + } + } + + if (!success) { + if (update_stats) + tlb->globalNumTLBMisses++; + if (tlb->hasMemSidePort) { + // there is a TLB below -> propagate down the TLB hierarchy + tlb->memSidePort[0]->sendFunctional(pkt); + // If no valid translation from a prefetch, then just return + if (sender_state->prefetch && !pkt->req->hasPaddr()) + return; + } else { + // Need to access the page table and update the TLB + DPRINTF(GPUTLB, "Doing a page walk for address %#x\n", + virt_page_addr); + + Process *p = tc->getProcessPtr(); + TlbEntry newEntry; + + Addr vaddr = pkt->req->getVaddr(); + #ifndef NDEBUG + Addr alignedVaddr = p->pTable->pageAlign(vaddr); + assert(alignedVaddr == virt_page_addr); + #endif + + bool success = p->pTable->lookup(vaddr, newEntry); + if (!success && sender_state->tlbMode != BaseTLB::Execute) { + if (p->fixupStackFault(vaddr)) + success = p->pTable->lookup(vaddr, newEntry); + } + + if (!sender_state->prefetch) { + // no PageFaults are permitted after + // the second page table lookup + assert(success); + + DPRINTF(GPUTLB, "Mapping %#x to %#x\n", alignedVaddr, + newEntry.pageStart()); + + sender_state->tlbEntry = new GpuTlbEntry(0, newEntry.vaddr, + newEntry.paddr, + success); + } else { + // If this was a prefetch, then do the normal thing if it + // was a successful translation. Otherwise, send an empty + // TLB entry back so that it can be figured out as empty and + // handled accordingly. + if (success) { + DPRINTF(GPUTLB, "Mapping %#x to %#x\n", alignedVaddr, + newEntry.pageStart()); + + sender_state->tlbEntry = new GpuTlbEntry(0, + newEntry.vaddr, + newEntry.paddr, + success); + } else { + DPRINTF(GPUPrefetch, "Prefetch failed %#x\n", + alignedVaddr); + + sender_state->tlbEntry = new GpuTlbEntry(); + + return; + } + } + } + } else { + DPRINTF(GPUPrefetch, "Functional Hit for vaddr %#x\n", + tlb->lookup(pkt->req->getVaddr())); + + GpuTlbEntry *entry = tlb->lookup(pkt->req->getVaddr(), + update_stats); + + assert(entry); + + sender_state->tlbEntry = + new GpuTlbEntry(0, entry->vaddr, entry->paddr, entry->valid); + } + // This is the function that would populate pkt->req with the paddr of + // the translation. But if no translation happens (i.e Prefetch fails) + // then the early returns in the above code wiill keep this function + // from executing. + tlb->handleFuncTranslationReturn(pkt, tlb_outcome); + } + + void + GpuTLB::CpuSidePort::recvReqRetry() + { + // The CPUSidePort never sends anything but replies. No retries + // expected. + assert(false); + } + + AddrRangeList + GpuTLB::CpuSidePort::getAddrRanges() const + { + // currently not checked by the master + AddrRangeList ranges; + + return ranges; + } + + /** + * MemSidePort receives the packet back. 
+ * We need to call the handleTranslationReturn + * and propagate up the hierarchy. + */ + bool + GpuTLB::MemSidePort::recvTimingResp(PacketPtr pkt) + { + Addr virt_page_addr = roundDown(pkt->req->getVaddr(), + TheISA::PageBytes); + + DPRINTF(GPUTLB, "MemSidePort recvTiming for virt_page_addr %#x\n", + virt_page_addr); + + TLBEvent *tlb_event = tlb->translationReturnEvent[virt_page_addr]; + assert(tlb_event); + assert(virt_page_addr == tlb_event->getTLBEventVaddr()); + + tlb_event->updateOutcome(MISS_RETURN); + tlb->schedule(tlb_event, curTick()+tlb->ticks(1)); + + return true; + } + + void + GpuTLB::MemSidePort::recvReqRetry() + { + // No retries should reach the TLB. The retries + // should only reach the TLBCoalescer. + assert(false); + } + + void + GpuTLB::cleanup() + { + while (!cleanupQueue.empty()) { + Addr cleanup_addr = cleanupQueue.front(); + cleanupQueue.pop(); + + // delete TLBEvent + TLBEvent * old_tlb_event = translationReturnEvent[cleanup_addr]; + delete old_tlb_event; + translationReturnEvent.erase(cleanup_addr); + + // update number of outstanding requests + outstandingReqs--; + } + + /** the higher level coalescer should retry if it has + * any pending requests. + */ + for (int i = 0; i < cpuSidePort.size(); ++i) { + cpuSidePort[i]->sendRetryReq(); + } + } + + void + GpuTLB::updatePageFootprint(Addr virt_page_addr) + { + + std::pair<AccessPatternTable::iterator, bool> ret; + + AccessInfo tmp_access_info; + tmp_access_info.lastTimeAccessed = 0; + tmp_access_info.accessesPerPage = 0; + tmp_access_info.totalReuseDistance = 0; + tmp_access_info.sumDistance = 0; + tmp_access_info.meanDistance = 0; + + ret = TLBFootprint.insert(AccessPatternTable::value_type(virt_page_addr, + tmp_access_info)); + + bool first_page_access = ret.second; + + if (first_page_access) { + numUniquePages++; + } else { + int accessed_before; + accessed_before = curTick() - ret.first->second.lastTimeAccessed; + ret.first->second.totalReuseDistance += accessed_before; + } + + ret.first->second.accessesPerPage++; + ret.first->second.lastTimeAccessed = curTick(); + + if (accessDistance) { + ret.first->second.localTLBAccesses + .push_back(localNumTLBAccesses.value()); + } + } + + void + GpuTLB::exitCallback() + { + std::ostream *page_stat_file = nullptr; + + if (accessDistance) { + + // print per page statistics to a separate file (.csv format) + // simout is the gem5 output directory (default is m5out or the one + // specified with -d + page_stat_file = simout.create(name().c_str()); + + // print header + *page_stat_file << "page,max_access_distance,mean_access_distance, " + << "stddev_distance" << std::endl; + } + + // update avg. 
reuse distance footprint + AccessPatternTable::iterator iter, iter_begin, iter_end; + unsigned int sum_avg_reuse_distance_per_page = 0; + + // iterate through all pages seen by this TLB + for (iter = TLBFootprint.begin(); iter != TLBFootprint.end(); iter++) { + sum_avg_reuse_distance_per_page += iter->second.totalReuseDistance / + iter->second.accessesPerPage; + + if (accessDistance) { + unsigned int tmp = iter->second.localTLBAccesses[0]; + unsigned int prev = tmp; + + for (int i = 0; i < iter->second.localTLBAccesses.size(); ++i) { + if (i) { + tmp = prev + 1; + } + + prev = iter->second.localTLBAccesses[i]; + // update the localTLBAccesses value + // with the actual differece + iter->second.localTLBAccesses[i] -= tmp; + // compute the sum of AccessDistance per page + // used later for mean + iter->second.sumDistance += + iter->second.localTLBAccesses[i]; + } + + iter->second.meanDistance = + iter->second.sumDistance / iter->second.accessesPerPage; + + // compute std_dev and max (we need a second round because we + // need to know the mean value + unsigned int max_distance = 0; + unsigned int stddev_distance = 0; + + for (int i = 0; i < iter->second.localTLBAccesses.size(); ++i) { + unsigned int tmp_access_distance = + iter->second.localTLBAccesses[i]; + + if (tmp_access_distance > max_distance) { + max_distance = tmp_access_distance; + } + + unsigned int diff = + tmp_access_distance - iter->second.meanDistance; + stddev_distance += pow(diff, 2); + + } + + stddev_distance = + sqrt(stddev_distance/iter->second.accessesPerPage); + + if (page_stat_file) { + *page_stat_file << std::hex << iter->first << ","; + *page_stat_file << std::dec << max_distance << ","; + *page_stat_file << std::dec << iter->second.meanDistance + << ","; + *page_stat_file << std::dec << stddev_distance; + *page_stat_file << std::endl; + } + + // erase the localTLBAccesses array + iter->second.localTLBAccesses.clear(); + } + } + + if (!TLBFootprint.empty()) { + avgReuseDistance = + sum_avg_reuse_distance_per_page / TLBFootprint.size(); + } + + //clear the TLBFootprint map + TLBFootprint.clear(); + } +} // namespace X86ISA + +X86ISA::GpuTLB* +X86GPUTLBParams::create() +{ + return new X86ISA::GpuTLB(this); +} + diff --git a/src/gpu-compute/gpu_tlb.hh b/src/gpu-compute/gpu_tlb.hh new file mode 100644 index 000000000..3549c598b --- /dev/null +++ b/src/gpu-compute/gpu_tlb.hh @@ -0,0 +1,465 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Lisa Hsu + */ + +#ifndef __GPU_TLB_HH__ +#define __GPU_TLB_HH__ + +#include <fstream> +#include <list> +#include <queue> +#include <string> +#include <vector> + +#include "arch/generic/tlb.hh" +#include "arch/x86/pagetable.hh" +#include "arch/x86/pagetable_walker.hh" +#include "arch/x86/regs/segment.hh" +#include "base/callback.hh" +#include "base/misc.hh" +#include "base/statistics.hh" +#include "gpu-compute/compute_unit.hh" +#include "mem/mem_object.hh" +#include "mem/port.hh" +#include "mem/request.hh" +#include "params/X86GPUTLB.hh" +#include "sim/sim_object.hh" + +class BaseTLB; +class Packet; +class ThreadContext; + +namespace X86ISA +{ + class GpuTlbEntry : public TlbEntry + { + public: + GpuTlbEntry(Addr asn, Addr _vaddr, Addr _paddr, bool _valid) + : TlbEntry(asn, _vaddr, _paddr, false, false), valid(_valid) { } + + GpuTlbEntry() : TlbEntry() { } + + bool valid; + }; + + class GpuTLB : public MemObject + { + protected: + friend class Walker; + + typedef std::list<GpuTlbEntry*> EntryList; + + uint32_t configAddress; + + // TLB clock: will inherit clock from shader's clock period in terms + // of nuber of ticks of curTime (aka global simulation clock) + // The assignment of TLB clock from shader clock is done in the python + // config files. + int clock; + + public: + // clock related functions ; maps to-and-from Simulation ticks and + // object clocks. + Tick frequency() const { return SimClock::Frequency / clock; } + + Tick + ticks(int numCycles) const + { + return (Tick)clock * numCycles; + } + + Tick curCycle() const { return curTick() / clock; } + Tick tickToCycles(Tick val) const { return val / clock;} + + typedef X86GPUTLBParams Params; + GpuTLB(const Params *p); + ~GpuTLB(); + + typedef enum BaseTLB::Mode Mode; + + class Translation + { + public: + virtual ~Translation() { } + + /** + * Signal that the translation has been delayed due to a hw page + * table walk. + */ + virtual void markDelayed() = 0; + + /** + * The memory for this object may be dynamically allocated, and it + * may be responsible for cleaning itslef up which will happen in + * this function. Once it's called the object is no longer valid. + */ + virtual void finish(Fault fault, RequestPtr req, ThreadContext *tc, + Mode mode) = 0; + }; + + void dumpAll(); + GpuTlbEntry *lookup(Addr va, bool update_lru=true); + void setConfigAddress(uint32_t addr); + + protected: + EntryList::iterator lookupIt(Addr va, bool update_lru=true); + Walker *walker; + + public: + Walker *getWalker(); + void invalidateAll(); + void invalidateNonGlobal(); + void demapPage(Addr va, uint64_t asn); + + protected: + int size; + int assoc; + int numSets; + + /** + * true if this is a fully-associative TLB + */ + bool FA; + Addr setMask; + + /** + * Allocation Policy: true if we always allocate on a hit, false + * otherwise. Default is true. 
+ */ + bool allocationPolicy; + + /** + * if true, then this is not the last level TLB + */ + bool hasMemSidePort; + + /** + * Print out accessDistance stats. One stat file + * per TLB. + */ + bool accessDistance; + + GpuTlbEntry *tlb; + + /* + * It's a per-set list. As long as we have not reached + * the full capacity of the given set, grab an entry from + * the freeList. + */ + std::vector<EntryList> freeList; + + /** + * An entryList per set is the equivalent of an LRU stack; + * it's used to guide replacement decisions. The head of the list + * contains the MRU TLB entry of the given set. If the freeList + * for this set is empty, the last element of the list + * is evicted (i.e., dropped on the floor). + */ + std::vector<EntryList> entryList; + + Fault translateInt(RequestPtr req, ThreadContext *tc); + + Fault translate(RequestPtr req, ThreadContext *tc, + Translation *translation, Mode mode, bool &delayedResponse, + bool timing, int &latency); + + public: + // latencies for a TLB hit, miss and page fault + int hitLatency; + int missLatency1; + int missLatency2; + + // local_stats are as seen from the TLB + // without taking into account coalescing + Stats::Scalar localNumTLBAccesses; + Stats::Scalar localNumTLBHits; + Stats::Scalar localNumTLBMisses; + Stats::Formula localTLBMissRate; + + // global_stats are as seen from the + // CU's perspective taking into account + // all coalesced requests. + Stats::Scalar globalNumTLBAccesses; + Stats::Scalar globalNumTLBHits; + Stats::Scalar globalNumTLBMisses; + Stats::Formula globalTLBMissRate; + + // from the CU perspective (global) + Stats::Scalar accessCycles; + // from the CU perspective (global) + Stats::Scalar pageTableCycles; + Stats::Scalar numUniquePages; + // from the perspective of this TLB + Stats::Scalar localCycles; + // from the perspective of this TLB + Stats::Formula localLatency; + // I take the avg. per page and then + // the avg. over all pages. 
+ Stats::Scalar avgReuseDistance; + + void regStats(); + void updatePageFootprint(Addr virt_page_addr); + void printAccessPattern(); + + + Fault translateAtomic(RequestPtr req, ThreadContext *tc, Mode mode, + int &latency); + + void translateTiming(RequestPtr req, ThreadContext *tc, + Translation *translation, Mode mode, + int &latency); + + Tick doMmuRegRead(ThreadContext *tc, Packet *pkt); + Tick doMmuRegWrite(ThreadContext *tc, Packet *pkt); + + GpuTlbEntry *insert(Addr vpn, GpuTlbEntry &entry); + + // Checkpointing + virtual void serialize(CheckpointOut& cp) const; + virtual void unserialize(CheckpointIn& cp); + void issueTranslation(); + enum tlbOutcome {TLB_HIT, TLB_MISS, PAGE_WALK, MISS_RETURN}; + bool tlbLookup(RequestPtr req, ThreadContext *tc, bool update_stats); + + void handleTranslationReturn(Addr addr, tlbOutcome outcome, + PacketPtr pkt); + + void handleFuncTranslationReturn(PacketPtr pkt, tlbOutcome outcome); + + void pagingProtectionChecks(ThreadContext *tc, PacketPtr pkt, + GpuTlbEntry *tlb_entry, Mode mode); + + void updatePhysAddresses(Addr virt_page_addr, GpuTlbEntry *tlb_entry, + Addr phys_page_addr); + + void issueTLBLookup(PacketPtr pkt); + + // CpuSidePort is the TLB Port closer to the CPU/CU side + class CpuSidePort : public SlavePort + { + public: + CpuSidePort(const std::string &_name, GpuTLB * gpu_TLB, + PortID _index) + : SlavePort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { } + + protected: + GpuTLB *tlb; + int index; + + virtual bool recvTimingReq(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt) { return 0; } + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRangeChange() { } + virtual void recvReqRetry(); + virtual void recvRespRetry() { assert(false); } + virtual AddrRangeList getAddrRanges() const; + }; + + /** + * MemSidePort is the TLB Port closer to the memory side + * If this is a last level TLB then this port will not be connected. + * + * Future action item: if we ever do real page walks, then this port + * should be connected to a RubyPort. + */ + class MemSidePort : public MasterPort + { + public: + MemSidePort(const std::string &_name, GpuTLB * gpu_TLB, + PortID _index) + : MasterPort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { } + + std::deque<PacketPtr> retries; + + protected: + GpuTLB *tlb; + int index; + + virtual bool recvTimingResp(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt) { return 0; } + virtual void recvFunctional(PacketPtr pkt) { } + virtual void recvRangeChange() { } + virtual void recvReqRetry(); + }; + + // TLB ports on the cpu Side + std::vector<CpuSidePort*> cpuSidePort; + // TLB ports on the memory side + std::vector<MemSidePort*> memSidePort; + + BaseMasterPort &getMasterPort(const std::string &if_name, + PortID idx=InvalidPortID); + + BaseSlavePort &getSlavePort(const std::string &if_name, + PortID idx=InvalidPortID); + + /** + * TLB TranslationState: this currently is a somewhat bastardization of + * the usage of SenderState, whereby the receiver of a packet is not + * usually supposed to need to look at the contents of the senderState, + * you're really only supposed to look at what you pushed on, pop it + * off, and send it back. + * + * However, since there is state that we want to pass to the TLBs using + * the send/recv Timing/Functional/etc. APIs, which don't allow for new + * arguments, we need a common TLB senderState to pass between TLBs, + * both "forwards" and "backwards." 
+ * + * So, basically, the rule is that any packet received by a TLB port + * (cpuside OR memside) must be safely castable to a TranslationState. + */ + + struct TranslationState : public Packet::SenderState + { + // TLB mode, read or write + Mode tlbMode; + // Thread context associated with this req + ThreadContext *tc; + + /* + * TLB entry to be populated and passed back and filled in + * previous TLBs. Equivalent to the data cache concept of + * "data return." + */ + GpuTlbEntry *tlbEntry; + // Is this a TLB prefetch request? + bool prefetch; + // When was the req for this translation issued + uint64_t issueTime; + // Remember where this came from + std::vector<SlavePort*>ports; + + // keep track of #uncoalesced reqs per packet per TLB level; + // reqCnt per level >= reqCnt higher level + std::vector<int> reqCnt; + // TLB level this packet hit in; 0 if it hit in the page table + int hitLevel; + Packet::SenderState *saved; + + TranslationState(Mode tlb_mode, ThreadContext *_tc, + bool _prefetch=false, + Packet::SenderState *_saved=nullptr) + : tlbMode(tlb_mode), tc(_tc), tlbEntry(nullptr), + prefetch(_prefetch), issueTime(0), + hitLevel(0),saved(_saved) { } + }; + + // maximum number of permitted coalesced requests per cycle + int maxCoalescedReqs; + + // Current number of outstandings coalesced requests. + // Should be <= maxCoalescedReqs + int outstandingReqs; + + /** + * A TLBEvent is scheduled after the TLB lookup and helps us take the + * appropriate actions: + * (e.g., update TLB on a hit, + * send request to lower level TLB on a miss, + * or start a page walk if this was the last-level TLB). + */ + void translationReturn(Addr virtPageAddr, tlbOutcome outcome, + PacketPtr pkt); + + class TLBEvent : public Event + { + private: + GpuTLB *tlb; + Addr virtPageAddr; + /** + * outcome can be TLB_HIT, TLB_MISS, or PAGE_WALK + */ + tlbOutcome outcome; + PacketPtr pkt; + + public: + TLBEvent(GpuTLB *_tlb, Addr _addr, tlbOutcome outcome, + PacketPtr _pkt); + + void process(); + const char *description() const; + + // updateOutcome updates the tlbOutcome of a TLBEvent + void updateOutcome(tlbOutcome _outcome); + Addr getTLBEventVaddr(); + }; + + std::unordered_map<Addr, TLBEvent*> translationReturnEvent; + + // this FIFO queue keeps track of the virt. page addresses + // that are pending cleanup + std::queue<Addr> cleanupQueue; + + // the cleanupEvent is scheduled after a TLBEvent triggers in order to + // free memory and do the required clean-up + void cleanup(); + + EventWrapper<GpuTLB, &GpuTLB::cleanup> cleanupEvent; + + /** + * This hash map will use the virtual page address as a key + * and will keep track of total number of accesses per page + */ + + struct AccessInfo + { + unsigned int lastTimeAccessed; // last access to this page + unsigned int accessesPerPage; + // need to divide it by accessesPerPage at the end + unsigned int totalReuseDistance; + + /** + * The field below will help us compute the access distance, + * that is the number of (coalesced) TLB accesses that + * happened in between each access to this page + * + * localTLBAccesses[x] is the value of localTLBNumAccesses + * when the page <Addr> was accessed for the <x>th time + */ + std::vector<unsigned int> localTLBAccesses; + unsigned int sumDistance; + unsigned int meanDistance; + }; + + typedef std::unordered_map<Addr, AccessInfo> AccessPatternTable; + AccessPatternTable TLBFootprint; + + // Called at the end of simulation to dump page access stats. 
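[Editor's aside, not part of the patch] The TranslationState comment above describes (and deliberately bends) the usual gem5 SenderState idiom: state is pushed onto a packet on the way down a hierarchy and popped on the way back up, so each level can attach its own bookkeeping without changing the send/recv APIs. The sketch below shows only that push/pop pattern with a toy packet type; ToyPacket and TranslationInfo are simplified stand-ins, not the real Packet or TranslationState classes.

// Minimal sketch of the sender-state "piggyback" pattern; all types are toys.
#include <cassert>

struct SenderState
{
    SenderState *predecessor = nullptr;
    virtual ~SenderState() { }
};

struct ToyPacket
{
    SenderState *senderState = nullptr;

    void pushSenderState(SenderState *s)        // on the way down
    {
        s->predecessor = senderState;
        senderState = s;
    }

    SenderState* popSenderState()               // on the way back up
    {
        SenderState *s = senderState;
        senderState = s->predecessor;
        return s;
    }
};

struct TranslationInfo : SenderState
{
    bool prefetch;
    int hitLevel = 0;
    explicit TranslationInfo(bool pf) : prefetch(pf) { }
};

int main()
{
    ToyPacket pkt;
    pkt.pushSenderState(new TranslationInfo(false));    // requester attaches state

    // ... packet travels through the TLB hierarchy and comes back ...

    auto *info = static_cast<TranslationInfo*>(pkt.popSenderState());
    assert(!info->prefetch);
    delete info;
}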
+ void exitCallback(); + + EventWrapper<GpuTLB, &GpuTLB::exitCallback> exitEvent; + }; +} + +#endif // __GPU_TLB_HH__ diff --git a/src/gpu-compute/hsa_code.hh b/src/gpu-compute/hsa_code.hh new file mode 100644 index 000000000..9f358e23c --- /dev/null +++ b/src/gpu-compute/hsa_code.hh @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#ifndef __HSA_CODE_HH__ +#define __HSA_CODE_HH__ + +#include <string> +#include <vector> + +#include "arch/gpu_types.hh" +#include "config/the_gpu_isa.hh" + +class HsaKernelInfo; + +/* @class HsaCode + * base code object for the set of HSA kernels associated + * with a single application. this class provides the common + * methods for creating, accessing, and storing information + * about kernel and variable symbols, symbol name, memory + * segment sizes, and instruction count, etc. + */ + +class HsaCode +{ + public: + HsaCode(const std::string &name) : readonly_data(nullptr), funcarg_size(0), + _name(name) + { + } + + enum class MemorySegment { + NONE, + FLAT, + GLOBAL, + READONLY, + KERNARG, + GROUP, + PRIVATE, + SPILL, + ARG, + EXTSPACE0 + }; + + const std::string& name() const { return _name; } + int numInsts() const { return _insts.size(); } + std::vector<TheGpuISA::RawMachInst>* insts() { return &_insts; } + + void + setReadonlyData(uint8_t *_readonly_data) + { + readonly_data = _readonly_data; + } + + virtual int getSize(MemorySegment segment) const = 0; + virtual void generateHsaKernelInfo(HsaKernelInfo *hsaKernelInfo) const = 0; + + uint8_t *readonly_data; + int funcarg_size; + + protected: + // An array that stores instruction indices (0 through kernel size) + // for a kernel passed to code object constructor as an argument. 
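[Editor's aside, not part of the patch] HsaCode above is an abstract base class: a concrete code object has to report how many bytes each MemorySegment occupies via getSize() and fill in an HsaKernelInfo for the emulated driver. A plausible minimal implementation would simply keep the per-segment sizes in a map, as sketched below; SegmentSizes and its "unset segments occupy 0 bytes" convention are assumptions for illustration and do not correspond to any loader in this patch.

// Stand-alone illustration of per-segment size bookkeeping; SegmentSizes is hypothetical.
#include <cassert>
#include <map>

enum class MemorySegment { GLOBAL, READONLY, KERNARG, GROUP, PRIVATE, SPILL, ARG };

class SegmentSizes
{
    std::map<MemorySegment, int> sizes;     // bytes reserved per segment

  public:
    void set(MemorySegment seg, int bytes) { sizes[seg] = bytes; }

    // Assumed convention: segments never set occupy 0 bytes.
    int get(MemorySegment seg) const
    {
        auto it = sizes.find(seg);
        return it == sizes.end() ? 0 : it->second;
    }
};

int main()
{
    SegmentSizes s;
    s.set(MemorySegment::GROUP, 2048);      // e.g., a kernel's static LDS usage
    s.set(MemorySegment::SPILL, 256);

    assert(s.get(MemorySegment::GROUP) == 2048);
    assert(s.get(MemorySegment::PRIVATE) == 0);
}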
+ std::vector<TheGpuISA::RawMachInst> _insts; + + private: + const std::string _name; +}; + +#endif // __HSA_CODE_HH__ diff --git a/src/gpu-compute/hsa_kernel_info.hh b/src/gpu-compute/hsa_kernel_info.hh new file mode 100644 index 000000000..396913dac --- /dev/null +++ b/src/gpu-compute/hsa_kernel_info.hh @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#ifndef __HSA_KERNEL_INFO_HH__ +#define __HSA_KERNEL_INFO_HH__ + +// This file defines the public interface between the HSA emulated +// driver and application programs. + +#include <cstdint> + +static const int HSA_GET_SIZES = 0x4801; +static const int HSA_GET_KINFO = 0x4802; +static const int HSA_GET_STRINGS = 0x4803; +static const int HSA_GET_CODE = 0x4804; +static const int HSA_GET_READONLY_DATA = 0x4805; +static const int HSA_GET_CU_CNT = 0x4806; +static const int HSA_GET_VSZ = 0x4807; + +// Return value (via buffer ptr) for HSA_GET_SIZES +struct HsaDriverSizes +{ + uint32_t num_kernels; + uint32_t string_table_size; + uint32_t code_size; + uint32_t readonly_size; +}; + +// HSA_GET_KINFO returns an array of num_kernels of these structs +struct HsaKernelInfo +{ + // byte offset into string table + uint32_t name_offs; + // byte offset into code array + uint32_t code_offs; + uint32_t static_lds_size; + uint32_t private_mem_size; + uint32_t spill_mem_size; + // Number of s registers + uint32_t sRegCount; + // Number of d registers + uint32_t dRegCount; + // Number of c registers + uint32_t cRegCount; +}; + +#endif // __HSA_KERNEL_INFO_HH__ diff --git a/src/gpu-compute/hsa_object.cc b/src/gpu-compute/hsa_object.cc new file mode 100644 index 000000000..91dfb160e --- /dev/null +++ b/src/gpu-compute/hsa_object.cc @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#include "gpu-compute/hsa_object.hh" + +#include <fstream> + +#include "gpu-compute/brig_object.hh" + +HsaObject::HsaObject(const std::string &fname) + : readonlyData(nullptr), filename(fname) +{ +} + +HsaObject* +HsaObject::createHsaObject(const std::string &fname) +{ + HsaObject *hsaObj = nullptr; + uint8_t *file_data = nullptr; + int file_length = 0; + + std::ifstream code_file(fname, std::ifstream::ate | std::ifstream::in | + std::ifstream::binary); + + assert(code_file.is_open()); + assert(code_file.good()); + + file_length = code_file.tellg(); + code_file.seekg(0, code_file.beg); + file_data = new uint8_t[file_length]; + code_file.read((char*)file_data, file_length); + code_file.close(); + + for (const auto &tryFile : tryFileFuncs) { + if ((hsaObj = tryFile(fname, file_length, file_data))) { + return hsaObj; + } + } + + delete[] file_data; + fatal("Unknown HSA object type for file: %s.\n", fname); + + return nullptr; +} diff --git a/src/gpu-compute/hsa_object.hh b/src/gpu-compute/hsa_object.hh new file mode 100644 index 000000000..1f08f5d80 --- /dev/null +++ b/src/gpu-compute/hsa_object.hh @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
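createHsaObject loads the entire code object into memory before probing its format: opening the file with std::ifstream::ate positions the get pointer at the end so tellg() yields the file length, after which the stream is rewound and read into a freshly allocated buffer. The standalone sketch below isolates just that read idiom; the file name is a placeholder, not something shipped with the simulator.

// Read an entire binary file into a buffer, mirroring the
// ifstream::ate / tellg / seekg sequence used by createHsaObject.
#include <cstdint>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

static std::vector<uint8_t>
readWholeFile(const std::string &fname)
{
    std::ifstream f(fname, std::ifstream::ate | std::ifstream::binary);
    if (!f.is_open())
        return {};

    // With 'ate' the stream starts at the end, so tellg() is the size.
    std::streamsize length = f.tellg();
    f.seekg(0, f.beg);

    std::vector<uint8_t> data(static_cast<size_t>(length));
    f.read(reinterpret_cast<char *>(data.data()), length);
    return data;
}

int
main()
{
    // "kernels.brig" is a hypothetical file name used for illustration.
    auto bytes = readWholeFile("kernels.brig");
    std::cout << "read " << bytes.size() << " bytes\n";
    return 0;
}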
Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#ifndef __HSA_OBJECT_HH__ +#define __HSA_OBJECT_HH__ + +#include <functional> +#include <string> +#include <vector> + +class HsaCode; + +/* @class HsaObject + * base loader object for HSA kernels. this class provides + * the base method definitions for loading, storing, and + * accessing HSA kernel objects into the simulator. + */ + +class HsaObject +{ + public: + HsaObject(const std::string &fileName); + + static HsaObject* createHsaObject(const std::string &fname); + static std::vector<std::function<HsaObject*(const std::string&, int, + uint8_t*)>> tryFileFuncs; + + virtual HsaCode* getKernel(const std::string &name) const = 0; + virtual HsaCode* getKernel(int i) const = 0; + virtual HsaCode* getFunction(const std::string &name) const = 0; + virtual int numKernels() const = 0; + + const std::string& name() const { return filename; } + + uint8_t *readonlyData; + + + protected: + const std::string filename; +}; + +#endif // __HSA_OBJECT_HH__ diff --git a/src/gpu-compute/hsail_code.cc b/src/gpu-compute/hsail_code.cc new file mode 100644 index 000000000..b0ddf0161 --- /dev/null +++ b/src/gpu-compute/hsail_code.cc @@ -0,0 +1,453 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
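HsaObject::createHsaObject dispatches on file format by walking tryFileFuncs, a static vector of callables that each inspect the raw bytes and either return a concrete object (e.g. a BrigObject) or nullptr so the next probe can try. The sketch below reproduces that chain-of-probes pattern with stand-in types and a made-up magic value; it is not the real BRIG loader, only an illustration of how a probe registered in such a vector behaves.

// Chain-of-probes loader: each candidate format gets a chance to claim
// the buffer; the first non-null result wins. Types and the magic value
// are illustrative stand-ins, not gem5 classes.
#include <cstdint>
#include <cstring>
#include <functional>
#include <iostream>
#include <memory>
#include <string>
#include <vector>

struct Object { std::string format; };

using TryFunc =
    std::function<std::unique_ptr<Object>(const std::string &, int,
                                          const uint8_t *)>;

static std::unique_ptr<Object>
tryFakeBrig(const std::string &fname, int len, const uint8_t *data)
{
    // Claim the file only if it starts with a (hypothetical) magic string.
    static const char magic[] = "BRIG";
    if (len >= 4 && !std::memcmp(data, magic, 4))
        return std::unique_ptr<Object>(new Object{"brig"});
    return nullptr; // let the next probe look at it
}

int
main()
{
    std::vector<TryFunc> tryFileFuncs = { tryFakeBrig };

    const uint8_t buf[] = { 'B', 'R', 'I', 'G', 0, 1, 2, 3 };

    std::unique_ptr<Object> obj;
    for (const auto &probe : tryFileFuncs) {
        if ((obj = probe("kernels.brig", sizeof(buf), buf)))
            break;
    }

    std::cout << (obj ? "loaded format: " + obj->format
                      : std::string("unknown object type")) << "\n";
    return 0;
}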
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#include "gpu-compute/hsail_code.hh" + +#include "arch/gpu_types.hh" +#include "arch/hsail/Brig.h" +#include "arch/hsail/operand.hh" +#include "config/the_gpu_isa.hh" +#include "debug/BRIG.hh" +#include "debug/HSAILObject.hh" +#include "gpu-compute/brig_object.hh" +#include "gpu-compute/gpu_static_inst.hh" +#include "gpu-compute/kernel_cfg.hh" + +using namespace Brig; + +int getBrigDataTypeBytes(BrigType16_t t); + +HsailCode::HsailCode(const std::string &name_str) + : HsaCode(name_str), private_size(-1), readonly_size(-1) +{ +} + +void +HsailCode::init(const BrigDirectiveExecutable *code_dir, const BrigObject *obj, + StorageMap *objStorageMap) +{ + storageMap = objStorageMap; + + // set pointer so that decoding process can find this kernel context when + // needed + obj->currentCode = this; + + if (code_dir->base.kind != BRIG_KIND_DIRECTIVE_FUNCTION && + code_dir->base.kind != BRIG_KIND_DIRECTIVE_KERNEL) { + fatal("unexpected directive kind %d inside kernel/function init\n", + code_dir->base.kind); + } + + DPRINTF(HSAILObject, "Initializing code, first code block entry is: %d\n", + code_dir->firstCodeBlockEntry); + + // clear these static vars so we can properly track the max index + // for this kernel + SRegOperand::maxRegIdx = 0; + DRegOperand::maxRegIdx = 0; + CRegOperand::maxRegIdx = 0; + setPrivateSize(0); + + const BrigBase *entryPtr = brigNext((BrigBase*)code_dir); + const BrigBase *endPtr = + obj->getCodeSectionEntry(code_dir->nextModuleEntry); + + int inst_idx = 0; + std::vector<GPUStaticInst*> instructions; + int funcarg_size_scope = 0; + + // walk through instructions in code section and directives in + // directive section in parallel, processing directives that apply + // when we reach the relevant code point. 
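The traversal that follows relies on brigNext() stepping from one variable-length BRIG entry to the next. The standalone sketch below shows the idea, assuming the usual BRIG convention that every entry begins with its own 16-bit byte count followed by a kind discriminator; the entry contents are fabricated and this is not the gem5 decoder itself.

// Walking a section of variable-length entries, each prefixed with its
// byte count (the convention brigNext() depends on). Contents invented.
#include <cstdint>
#include <cstdio>
#include <vector>

struct EntryHeader
{
    uint16_t byteCount; // total size of this entry, including the header
    uint16_t kind;      // discriminator, e.g. directive vs. instruction
};

static const EntryHeader *
next(const EntryHeader *e)
{
    auto p = reinterpret_cast<const uint8_t *>(e);
    return reinterpret_cast<const EntryHeader *>(p + e->byteCount);
}

int
main()
{
    // A fake section: three entries of 8, 12 and 8 bytes.
    std::vector<uint8_t> section(28, 0);
    auto at = [&](size_t off) {
        return reinterpret_cast<EntryHeader *>(&section[off]);
    };
    *at(0)  = {8, 1};
    *at(8)  = {12, 2};
    *at(20) = {8, 1};

    const auto *entry =
        reinterpret_cast<const EntryHeader *>(section.data());
    const auto *end =
        reinterpret_cast<const EntryHeader *>(section.data() + section.size());

    while (entry < end) {
        std::printf("entry kind=%u size=%u\n",
                    static_cast<unsigned>(entry->kind),
                    static_cast<unsigned>(entry->byteCount));
        entry = next(entry);
    }
    return 0;
}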
+ while (entryPtr < endPtr) { + switch (entryPtr->kind) { + case BRIG_KIND_DIRECTIVE_VARIABLE: + { + const BrigDirectiveVariable *sym = + (const BrigDirectiveVariable*)entryPtr; + + DPRINTF(HSAILObject,"Initializing code, directive is " + "kind_variable, symbol is: %s\n", + obj->getString(sym->name)); + + StorageElement *se = storageMap->addSymbol(sym, obj); + + if (sym->segment == BRIG_SEGMENT_PRIVATE) { + setPrivateSize(se->size); + } else { // spill + funcarg_size_scope += se->size; + } + } + break; + + case BRIG_KIND_DIRECTIVE_LABEL: + { + const BrigDirectiveLabel *lbl = + (const BrigDirectiveLabel*)entryPtr; + + DPRINTF(HSAILObject,"Initializing code, directive is " + "kind_label, label is: %s \n", + obj->getString(lbl->name)); + + labelMap.addLabel(lbl, inst_idx, obj); + } + break; + + case BRIG_KIND_DIRECTIVE_PRAGMA: + { + DPRINTF(HSAILObject, "Initializing code, directive " + "is kind_pragma\n"); + } + break; + + case BRIG_KIND_DIRECTIVE_COMMENT: + { + DPRINTF(HSAILObject, "Initializing code, directive is " + "kind_comment\n"); + } + break; + + case BRIG_KIND_DIRECTIVE_ARG_BLOCK_START: + { + DPRINTF(HSAILObject, "Initializing code, directive is " + "kind_arg_block_start\n"); + + storageMap->resetOffset(BRIG_SEGMENT_ARG); + funcarg_size_scope = 0; + } + break; + + case BRIG_KIND_DIRECTIVE_ARG_BLOCK_END: + { + DPRINTF(HSAILObject, "Initializing code, directive is " + "kind_arg_block_end\n"); + + funcarg_size = funcarg_size < funcarg_size_scope ? + funcarg_size_scope : funcarg_size; + } + break; + + case BRIG_KIND_DIRECTIVE_END: + DPRINTF(HSAILObject, "Initializing code, dircetive is " + "kind_end\n"); + + break; + + default: + if (entryPtr->kind >= BRIG_KIND_INST_BEGIN && + entryPtr->kind <= BRIG_KIND_INST_END) { + + BrigInstBase *instPtr = (BrigInstBase*)entryPtr; + TheGpuISA::MachInst machInst = { instPtr, obj }; + GPUStaticInst *iptr = decoder.decode(machInst); + + if (iptr) { + DPRINTF(HSAILObject, "Initializing code, processing inst " + "#%d idx %d: OPCODE=%d\n", + inst_idx, _insts.size(), instPtr->opcode); + + TheGpuISA::RawMachInst inst_num = decoder.saveInst(iptr); + iptr->instNum(inst_idx); + _insts.push_back(inst_num); + instructions.push_back(iptr); + } + ++inst_idx; + } else if (entryPtr->kind >= BRIG_KIND_OPERAND_BEGIN && + entryPtr->kind < BRIG_KIND_OPERAND_END) { + warn("unexpected operand entry in code segment\n"); + } else { + // there are surely some more cases we will need to handle, + // but we'll deal with them as we find them. 
+ fatal("unexpected directive kind %d inside kernel scope\n", + entryPtr->kind); + } + } + + entryPtr = brigNext(entryPtr); + } + + // compute Control Flow Graph for current kernel + ControlFlowInfo::assignImmediatePostDominators(instructions); + + max_sreg = SRegOperand::maxRegIdx; + max_dreg = DRegOperand::maxRegIdx; + max_creg = CRegOperand::maxRegIdx; + + obj->currentCode = nullptr; +} + +HsailCode::HsailCode(const std::string &name_str, + const BrigDirectiveExecutable *code_dir, + const BrigObject *obj, StorageMap *objStorageMap) + : HsaCode(name_str), private_size(-1), readonly_size(-1) +{ + init(code_dir, obj, objStorageMap); +} + +void +LabelMap::addLabel(const Brig::BrigDirectiveLabel *lblDir, int inst_index, + const BrigObject *obj) +{ + std::string lbl_name = obj->getString(lblDir->name); + Label &lbl = map[lbl_name]; + + if (lbl.defined()) { + fatal("Attempt to redefine existing label %s\n", lbl_name); + } + + lbl.define(lbl_name, inst_index); + DPRINTF(HSAILObject, "label %s = %d\n", lbl_name, inst_index); +} + +Label* +LabelMap::refLabel(const Brig::BrigDirectiveLabel *lblDir, + const BrigObject *obj) +{ + std::string name = obj->getString(lblDir->name); + Label &lbl = map[name]; + lbl.checkName(name); + + return &lbl; +} + +int +getBrigDataTypeBytes(BrigType16_t t) +{ + switch (t) { + case BRIG_TYPE_S8: + case BRIG_TYPE_U8: + case BRIG_TYPE_B8: + return 1; + + case BRIG_TYPE_S16: + case BRIG_TYPE_U16: + case BRIG_TYPE_B16: + case BRIG_TYPE_F16: + return 2; + + case BRIG_TYPE_S32: + case BRIG_TYPE_U32: + case BRIG_TYPE_B32: + case BRIG_TYPE_F32: + return 4; + + case BRIG_TYPE_S64: + case BRIG_TYPE_U64: + case BRIG_TYPE_B64: + case BRIG_TYPE_F64: + return 8; + + case BRIG_TYPE_B1: + + default: + fatal("unhandled symbol data type %d", t); + return 0; + } +} + +StorageElement* +StorageSpace::addSymbol(const BrigDirectiveVariable *sym, + const BrigObject *obj) +{ + const char *sym_name = obj->getString(sym->name); + uint64_t size = 0; + uint64_t offset = 0; + + if (sym->type & BRIG_TYPE_ARRAY) { + size = getBrigDataTypeBytes(sym->type & ~BRIG_TYPE_ARRAY); + size *= (((uint64_t)sym->dim.hi) << 32 | (uint64_t)sym->dim.lo); + + offset = roundUp(nextOffset, getBrigDataTypeBytes(sym->type & + ~BRIG_TYPE_ARRAY)); + } else { + size = getBrigDataTypeBytes(sym->type); + offset = roundUp(nextOffset, getBrigDataTypeBytes(sym->type)); + } + + nextOffset = offset + size; + + DPRINTF(HSAILObject, "Adding %s SYMBOL %s size %d offset 0x%x, init: %d\n", + segmentNames[segment], sym_name, size, offset, sym->init); + + StorageElement* se = new StorageElement(sym_name, offset, size, sym); + elements.push_back(se); + elements_by_addr.insert(AddrRange(offset, offset + size - 1), se); + elements_by_brigptr[sym] = se; + + return se; +} + +StorageElement* +StorageSpace::findSymbol(std::string name) +{ + for (auto it : elements) { + if (it->name == name) { + return it; + } + } + + return nullptr; +} + +StorageElement* +StorageSpace::findSymbol(uint64_t addr) +{ + assert(elements_by_addr.size() > 0); + + auto se = elements_by_addr.find(addr); + + if (se == elements_by_addr.end()) { + return nullptr; + } else { + return se->second; + } +} + +StorageElement* +StorageSpace::findSymbol(const BrigDirectiveVariable *brigptr) +{ + assert(elements_by_brigptr.size() > 0); + + auto se = elements_by_brigptr.find(brigptr); + + if (se == elements_by_brigptr.end()) { + return nullptr; + } else { + return se->second; + } +} + +StorageMap::StorageMap(StorageMap *outerScope) + : outerScopeMap(outerScope) +{ + for (int i = 
0; i < NumSegments; ++i) + space[i] = new StorageSpace((BrigSegment)i); +} + +StorageElement* +StorageMap::addSymbol(const BrigDirectiveVariable *sym, const BrigObject *obj) +{ + BrigSegment8_t segment = sym->segment; + + assert(segment >= Brig::BRIG_SEGMENT_FLAT); + assert(segment < NumSegments); + + return space[segment]->addSymbol(sym, obj); +} + +int +StorageMap::getSize(Brig::BrigSegment segment) +{ + assert(segment > Brig::BRIG_SEGMENT_GLOBAL); + assert(segment < NumSegments); + + if (segment != Brig::BRIG_SEGMENT_GROUP && + segment != Brig::BRIG_SEGMENT_READONLY) { + return space[segment]->getSize(); + } else { + int ret = space[segment]->getSize(); + + if (outerScopeMap) { + ret += outerScopeMap->getSize(segment); + } + + return ret; + } +} + +void +StorageMap::resetOffset(Brig::BrigSegment segment) +{ + space[segment]->resetOffset(); +} + +StorageElement* +StorageMap::findSymbol(BrigSegment segment, std::string name) +{ + StorageElement *se = space[segment]->findSymbol(name); + + if (se) + return se; + + if (outerScopeMap) + return outerScopeMap->findSymbol(segment, name); + + return nullptr; +} + +StorageElement* +StorageMap::findSymbol(Brig::BrigSegment segment, uint64_t addr) +{ + StorageSpace *sp = space[segment]; + + if (!sp) { + // there is no memory in segment? + return nullptr; + } + + StorageElement *se = sp->findSymbol(addr); + + if (se) + return se; + + if (outerScopeMap) + return outerScopeMap->findSymbol(segment, addr); + + return nullptr; + +} + +StorageElement* +StorageMap::findSymbol(Brig::BrigSegment segment, + const BrigDirectiveVariable *brigptr) +{ + StorageSpace *sp = space[segment]; + + if (!sp) { + // there is no memory in segment? + return nullptr; + } + + StorageElement *se = sp->findSymbol(brigptr); + + if (se) + return se; + + if (outerScopeMap) + return outerScopeMap->findSymbol(segment, brigptr); + + return nullptr; + +} diff --git a/src/gpu-compute/hsail_code.hh b/src/gpu-compute/hsail_code.hh new file mode 100644 index 000000000..d9fbcc577 --- /dev/null +++ b/src/gpu-compute/hsail_code.hh @@ -0,0 +1,447 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
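StorageSpace::addSymbol lays symbols out with a simple bump allocator: the next free offset is rounded up to the element's natural size, array symbols multiply the element size by the 64-bit dimension reassembled from dim.hi and dim.lo, and nextOffset then advances past the new element. The standalone sketch below, with its own roundUp helper and invented symbols, reproduces that allocation arithmetic.

// Bump allocation with alignment, mirroring StorageSpace::addSymbol.
// The symbols and sizes below are invented for illustration.
#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

static uint64_t
roundUp(uint64_t val, uint64_t align)
{
    return (val + align - 1) / align * align;
}

struct Symbol
{
    std::string name;
    uint64_t elemBytes;  // size of one element (from the BRIG type)
    uint32_t dimHi;      // upper 32 bits of the array dimension
    uint32_t dimLo;      // lower 32 bits of the array dimension (0 = scalar)
};

int
main()
{
    std::vector<Symbol> syms = {
        {"flag",  1, 0, 0},   // scalar u8
        {"lut",   4, 0, 16},  // u32 lut[16]
        {"accum", 8, 0, 0},   // scalar f64
    };

    uint64_t nextOffset = 0;
    for (const auto &s : syms) {
        uint64_t dim = (uint64_t(s.dimHi) << 32) | s.dimLo;
        uint64_t size = s.elemBytes * (dim ? dim : 1);
        uint64_t offset = roundUp(nextOffset, s.elemBytes);
        nextOffset = offset + size;
        std::printf("%-6s offset=%2llu size=%2llu\n", s.name.c_str(),
                    (unsigned long long)offset, (unsigned long long)size);
    }
    std::printf("segment size = %llu bytes\n",
                (unsigned long long)nextOffset);
    return 0;
}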
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#ifndef __HSAIL_CODE_HH__ +#define __HSAIL_CODE_HH__ + +#include <cassert> +#include <list> +#include <map> +#include <string> +#include <vector> + +#include "arch/gpu_decoder.hh" +#include "arch/hsail/Brig.h" +#include "base/addr_range_map.hh" +#include "base/intmath.hh" +#include "config/the_gpu_isa.hh" +#include "gpu-compute/hsa_code.hh" +#include "gpu-compute/hsa_kernel_info.hh" +#include "gpu-compute/misc.hh" + +class BrigObject; +class GPUStaticInst; + +inline int +popcount(uint64_t src, int sz) +{ + int cnt = 0; + + for (int i = 0; i < sz; ++i) { + if (src & 1) + ++cnt; + src >>= 1; + } + + return cnt; +} + +inline int +firstbit(uint64_t src, int sz) +{ + int i; + + for (i = 0; i < sz; ++i) { + if (src & 1) + break; + src >>= 1; + } + + return i; +} + +inline int +lastbit(uint64_t src, int sz) +{ + int i0 = -1; + + for (int i = 0; i < sz; ++i) { + if (src & 1) + i0 = i; + src >>= 1; + } + + return i0; +} + +inline int +signbit(uint64_t src, int sz) +{ + int i0 = -1; + + if (src & (1 << (sz - 1))) { + for (int i = 0; i < sz - 1; ++i) { + if (!(src & 1)) + i0 = i; + src >>= 1; + } + } else { + for (int i = 0; i < sz - 1; ++i) { + if (src & 1) + i0 = i; + src >>= 1; + } + } + + return i0; +} + +inline uint64_t +bitrev(uint64_t src, int sz) +{ + uint64_t r = 0; + + for (int i = 0; i < sz; ++i) { + r <<= 1; + if (src & 1) + r |= 1; + src >>= 1; + } + + return r; +} + +inline uint64_t +mul_hi(uint32_t a, uint32_t b) +{ + return ((uint64_t)a * (uint64_t)b) >> 32; +} + +inline uint64_t +mul_hi(int32_t a, int32_t b) +{ + return ((int64_t)a * (int64_t)b) >> 32; +} + +inline uint64_t +mul_hi(uint64_t a, uint64_t b) +{ + return ((uint64_t)a * (uint64_t)b) >> 32; +} + +inline uint64_t +mul_hi(int64_t a, int64_t b) +{ + return ((int64_t)a * (int64_t)b) >> 32; +} + +inline uint64_t +mul_hi(double a, double b) +{ + return 0; +} + +class Label +{ + public: + std::string name; + int value; + + Label() : value(-1) + { + } + + bool defined() { return value != -1; } + + void + checkName(std::string &_name) + { + if (name.empty()) { + name = _name; + } else { + assert(name == _name); + } + } + + void + define(std::string &_name, int _value) + { + assert(!defined()); + assert(_value != -1); + value = _value; + checkName(_name); + } + + int + get() + { + assert(defined()); + return value; + } +}; + +class LabelMap +{ + std::map<std::string, Label> map; + + public: + LabelMap() { } + + void addLabel(const Brig::BrigDirectiveLabel *lbl, int inst_index, + const BrigObject *obj); + + Label *refLabel(const Brig::BrigDirectiveLabel *lbl, + const BrigObject *obj); +}; + +const int NumSegments = Brig::BRIG_SEGMENT_AMD_GCN; + +extern const char *segmentNames[]; + +class StorageElement +{ + public: + std::string name; + uint64_t offset; + + uint64_t size; + const Brig::BrigDirectiveVariable *brigSymbol; + StorageElement(const char *_name, uint64_t _offset, int _size, + const Brig::BrigDirectiveVariable *sym) + : name(_name), offset(_offset), 
size(_size), brigSymbol(sym) + { + } +}; + +class StorageSpace +{ + typedef std::map<const Brig::BrigDirectiveVariable*, StorageElement*> + DirVarToSE_map; + + std::list<StorageElement*> elements; + AddrRangeMap<StorageElement*> elements_by_addr; + DirVarToSE_map elements_by_brigptr; + + uint64_t nextOffset; + Brig::BrigSegment segment; + + public: + StorageSpace(Brig::BrigSegment _class) + : nextOffset(0), segment(_class) + { + } + + StorageElement *addSymbol(const Brig::BrigDirectiveVariable *sym, + const BrigObject *obj); + + StorageElement* findSymbol(std::string name); + StorageElement* findSymbol(uint64_t addr); + StorageElement* findSymbol(const Brig::BrigDirectiveVariable *brigptr); + + int getSize() { return nextOffset; } + void resetOffset() { nextOffset = 0; } +}; + +class StorageMap +{ + StorageMap *outerScopeMap; + StorageSpace *space[NumSegments]; + + public: + StorageMap(StorageMap *outerScope = nullptr); + + StorageElement *addSymbol(const Brig::BrigDirectiveVariable *sym, + const BrigObject *obj); + + StorageElement* findSymbol(Brig::BrigSegment segment, std::string name); + StorageElement* findSymbol(Brig::BrigSegment segment, uint64_t addr); + + StorageElement* findSymbol(Brig::BrigSegment segment, + const Brig::BrigDirectiveVariable *brigptr); + + // overloaded version to avoid casting + StorageElement* + findSymbol(Brig::BrigSegment8_t segment, std::string name) + { + return findSymbol((Brig::BrigSegment)segment, name); + } + + int getSize(Brig::BrigSegment segment); + void resetOffset(Brig::BrigSegment segment); +}; + +typedef enum +{ + BT_DEFAULT, + BT_B8, + BT_U8, + BT_U16, + BT_U32, + BT_U64, + BT_S8, + BT_S16, + BT_S32, + BT_S64, + BT_F16, + BT_F32, + BT_F64, + BT_NULL +} base_type_e; + +/* @class HsailCode + * the HsailCode class is used to store information + * about HSA kernels stored in the BRIG format. it holds + * all information about a kernel, function, or variable + * symbol and provides methods for accessing that + * information. + */ + +class HsailCode final : public HsaCode +{ + public: + TheGpuISA::Decoder decoder; + + StorageMap *storageMap; + LabelMap labelMap; + uint32_t kernarg_start; + uint32_t kernarg_end; + int32_t private_size; + + int32_t readonly_size; + + // We track the maximum register index used for each register + // class when we load the code so we can size the register files + // appropriately (i.e., one more than the max index). 
+ uint32_t max_creg; // maximum c-register index + uint32_t max_sreg; // maximum s-register index + uint32_t max_dreg; // maximum d-register index + + HsailCode(const std::string &name_str, + const Brig::BrigDirectiveExecutable *code_dir, + const BrigObject *obj, + StorageMap *objStorageMap); + + // this version is used to create a placeholder when + // we encounter a kernel-related directive before the + // kernel itself + HsailCode(const std::string &name_str); + + void init(const Brig::BrigDirectiveExecutable *code_dir, + const BrigObject *obj, StorageMap *objStorageMap); + + void + generateHsaKernelInfo(HsaKernelInfo *hsaKernelInfo) const + { + hsaKernelInfo->sRegCount = max_sreg + 1; + hsaKernelInfo->dRegCount = max_dreg + 1; + hsaKernelInfo->cRegCount = max_creg + 1; + + hsaKernelInfo->static_lds_size = getSize(Brig::BRIG_SEGMENT_GROUP); + + hsaKernelInfo->private_mem_size = + roundUp(getSize(Brig::BRIG_SEGMENT_PRIVATE), 8); + + hsaKernelInfo->spill_mem_size = + roundUp(getSize(Brig::BRIG_SEGMENT_SPILL), 8); + } + + int + getSize(MemorySegment segment) const + { + Brig::BrigSegment brigSeg; + + switch (segment) { + case MemorySegment::NONE: + brigSeg = Brig::BRIG_SEGMENT_NONE; + break; + case MemorySegment::FLAT: + brigSeg = Brig::BRIG_SEGMENT_FLAT; + break; + case MemorySegment::GLOBAL: + brigSeg = Brig::BRIG_SEGMENT_GLOBAL; + break; + case MemorySegment::READONLY: + brigSeg = Brig::BRIG_SEGMENT_READONLY; + break; + case MemorySegment::KERNARG: + brigSeg = Brig::BRIG_SEGMENT_KERNARG; + break; + case MemorySegment::GROUP: + brigSeg = Brig::BRIG_SEGMENT_GROUP; + break; + case MemorySegment::PRIVATE: + brigSeg = Brig::BRIG_SEGMENT_PRIVATE; + break; + case MemorySegment::SPILL: + brigSeg = Brig::BRIG_SEGMENT_SPILL; + break; + case MemorySegment::ARG: + brigSeg = Brig::BRIG_SEGMENT_ARG; + break; + case MemorySegment::EXTSPACE0: + brigSeg = Brig::BRIG_SEGMENT_AMD_GCN; + break; + default: + fatal("Unknown BrigSegment type.\n"); + } + + return getSize(brigSeg); + } + + private: + int + getSize(Brig::BrigSegment segment) const + { + if (segment == Brig::BRIG_SEGMENT_PRIVATE) { + // with the code generated by new HSA compiler the assertion + // does not hold anymore.. + //assert(private_size != -1); + return private_size; + } else { + return storageMap->getSize(segment); + } + } + + public: + StorageElement* + findSymbol(Brig::BrigSegment segment, uint64_t addr) + { + return storageMap->findSymbol(segment, addr); + } + + void + setPrivateSize(int32_t _private_size) + { + private_size = _private_size; + } + + Label* + refLabel(const Brig::BrigDirectiveLabel *lbl, const BrigObject *obj) + { + return labelMap.refLabel(lbl, obj); + } +}; + +#endif // __HSAIL_CODE_HH__ diff --git a/src/gpu-compute/kernel_cfg.cc b/src/gpu-compute/kernel_cfg.cc new file mode 100644 index 000000000..7e0e10912 --- /dev/null +++ b/src/gpu-compute/kernel_cfg.cc @@ -0,0 +1,296 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
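HsailCode::generateHsaKernelInfo is the bridge back to the HsaKernelInfo struct the emulated driver exposes to the host: register counts are the maximum register index seen during decode plus one, the static LDS size comes straight from the GROUP segment, and the private and spill segment sizes are rounded up to 8 bytes. Below is a minimal sketch of that derivation with invented input values; it only restates the arithmetic, not the class itself.

// Deriving driver-visible kernel info from decode-time maxima and
// per-segment sizes, mirroring HsailCode::generateHsaKernelInfo.
// The input numbers are invented.
#include <cstdint>
#include <cstdio>

static uint32_t
roundUp(uint32_t val, uint32_t align)
{
    return (val + align - 1) / align * align;
}

int
main()
{
    // Decode-time maxima: highest register index referenced per class.
    uint32_t max_sreg = 17, max_dreg = 5, max_creg = 2;
    // Segment sizes accumulated by the storage map (bytes).
    uint32_t group_size = 2048, private_size = 12, spill_size = 36;

    uint32_t sRegCount = max_sreg + 1; // file must hold indices 0..max
    uint32_t dRegCount = max_dreg + 1;
    uint32_t cRegCount = max_creg + 1;
    uint32_t static_lds_size = group_size;
    uint32_t private_mem_size = roundUp(private_size, 8); // 12 -> 16
    uint32_t spill_mem_size = roundUp(spill_size, 8);     // 36 -> 40

    std::printf("s=%u d=%u c=%u lds=%u priv=%u spill=%u\n",
                sRegCount, dRegCount, cRegCount,
                static_lds_size, private_mem_size, spill_mem_size);
    return 0;
}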
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#include "gpu-compute/kernel_cfg.hh" + +#include <algorithm> +#include <cassert> +#include <cstdio> +#include <cstring> +#include <iostream> +#include <iterator> +#include <map> +#include <string> + +#include "gpu-compute/gpu_static_inst.hh" + +void +ControlFlowInfo::assignImmediatePostDominators( + const std::vector<GPUStaticInst*>& instructions) +{ + ControlFlowInfo cfg(instructions); + cfg.findImmediatePostDominators(); +} + + +ControlFlowInfo::ControlFlowInfo(const std::vector<GPUStaticInst*>& insts) : + instructions(insts) +{ + createBasicBlocks(); + connectBasicBlocks(); +} + +BasicBlock* +ControlFlowInfo::basicBlock(int inst_num) const { + for (auto& block: basicBlocks) { + int first_block_id = block->firstInstruction->instNum(); + if (inst_num >= first_block_id && + inst_num < first_block_id + block->size) { + return block.get(); + } + } + return nullptr; +} + + +GPUStaticInst* +ControlFlowInfo::lastInstruction(const BasicBlock* block) const +{ + if (block->isExit()) { + return nullptr; + } + + return instructions.at(block->firstInstruction->instNum() + + block->size - 1); +} + +BasicBlock* +ControlFlowInfo::postDominator(const BasicBlock* block) const +{ + if (block->isExit()) { + return nullptr; + } + return basicBlock(lastInstruction(block)->ipdInstNum()); +} + +void +ControlFlowInfo::createBasicBlocks() +{ + assert(!instructions.empty()); + std::set<int> leaders; + // first instruction is a leader + leaders.insert(0); + for (int i = 1; i < instructions.size(); i++) { + GPUStaticInst* instruction = instructions[i]; + if (instruction->o_type == Enums::OT_BRANCH) { + const int target_pc = instruction->getTargetPc(); + leaders.insert(target_pc); + leaders.insert(i + 1); + } + } + + size_t block_size = 0; + for (int i = 0; i < instructions.size(); i++) { + if (leaders.find(i) != leaders.end()) { + uint32_t id = basicBlocks.size(); + if (id > 0) { + basicBlocks.back()->size = block_size; + } + block_size = 0; + basicBlocks.emplace_back(new BasicBlock(id, instructions[i])); + } + block_size++; + } + basicBlocks.back()->size = block_size; + // exit basic block + basicBlocks.emplace_back(new BasicBlock(basicBlocks.size(), nullptr)); +} + +void +ControlFlowInfo::connectBasicBlocks() +{ + BasicBlock* exit_bb = basicBlocks.back().get(); + for (auto& bb 
: basicBlocks) { + if (bb->isExit()) { + break; + } + GPUStaticInst* last = lastInstruction(bb.get()); + if (last->o_type == Enums::OT_RET) { + bb->successorIds.insert(exit_bb->id); + break; + } + if (last->o_type == Enums::OT_BRANCH) { + const uint32_t target_pc = last->getTargetPc(); + BasicBlock* target_bb = basicBlock(target_pc); + bb->successorIds.insert(target_bb->id); + } + + // Unconditional jump instructions have a unique successor + if (!last->unconditionalJumpInstruction()) { + BasicBlock* next_bb = basicBlock(last->instNum() + 1); + bb->successorIds.insert(next_bb->id); + } + } +} + + +// In-place set intersection +static void +intersect(std::set<uint32_t>& a, const std::set<uint32_t>& b) +{ + std::set<uint32_t>::iterator it = a.begin(); + while (it != a.end()) { + it = b.find(*it) != b.end() ? ++it : a.erase(it); + } +} + + +void +ControlFlowInfo::findPostDominators() +{ + // the only postdominator of the exit block is itself + basicBlocks.back()->postDominatorIds.insert(basicBlocks.back()->id); + //copy all basic blocks to all postdominator lists except for exit block + for (auto& block : basicBlocks) { + if (!block->isExit()) { + for (uint32_t i = 0; i < basicBlocks.size(); i++) { + block->postDominatorIds.insert(i); + } + } + } + + bool change = true; + while (change) { + change = false; + for (int h = basicBlocks.size() - 2; h >= 0; --h) { + size_t num_postdominators = + basicBlocks[h]->postDominatorIds.size(); + for (int s : basicBlocks[h]->successorIds) { + intersect(basicBlocks[h]->postDominatorIds, + basicBlocks[s]->postDominatorIds); + } + basicBlocks[h]->postDominatorIds.insert(h); + change |= (num_postdominators + != basicBlocks[h]->postDominatorIds.size()); + } + } +} + + +// In-place set difference +static void +setDifference(std::set<uint32_t>&a, + const std::set<uint32_t>& b, uint32_t exception) +{ + for (uint32_t b_elem : b) { + if (b_elem != exception) { + a.erase(b_elem); + } + } +} + +void +ControlFlowInfo::findImmediatePostDominators() +{ + assert(basicBlocks.size() > 1); // Entry and exit blocks must be present + + findPostDominators(); + + for (auto& basicBlock : basicBlocks) { + if (basicBlock->isExit()) { + continue; + } + std::set<uint32_t> candidates = basicBlock->postDominatorIds; + candidates.erase(basicBlock->id); + for (uint32_t postDominatorId : basicBlock->postDominatorIds) { + if (postDominatorId != basicBlock->id) { + setDifference(candidates, + basicBlocks[postDominatorId]->postDominatorIds, + postDominatorId); + } + } + assert(candidates.size() == 1); + GPUStaticInst* last_instruction = lastInstruction(basicBlock.get()); + BasicBlock* ipd_block = basicBlocks[*(candidates.begin())].get(); + if (!ipd_block->isExit()) { + GPUStaticInst* ipd_first_inst = ipd_block->firstInstruction; + last_instruction->ipdInstNum(ipd_first_inst->instNum()); + } else { + last_instruction->ipdInstNum(last_instruction->instNum() + 1); + } + } +} + +void +ControlFlowInfo::printPostDominators() const +{ + for (auto& block : basicBlocks) { + std::cout << "PD(" << block->id << ") = {"; + std::copy(block->postDominatorIds.begin(), + block->postDominatorIds.end(), + std::ostream_iterator<uint32_t>(std::cout, ", ")); + std::cout << "}" << std::endl; + } +} + +void +ControlFlowInfo::printImmediatePostDominators() const +{ + for (const auto& block : basicBlocks) { + if (block->isExit()) { + continue; + } + std::cout << "IPD(" << block->id << ") = "; + std::cout << postDominator(block.get())->id << ", "; + } + std::cout << std::endl; +} +void 
+ControlFlowInfo::printBasicBlocks() const +{ + for (GPUStaticInst* inst : instructions) { + int inst_num = inst->instNum(); + std::cout << inst_num << " [" << basicBlock(inst_num)->id + << "]: " << inst->disassemble(); + if (inst->o_type == Enums::OT_BRANCH) { + std::cout << ", PC = " << inst->getTargetPc(); + } + std::cout << std::endl; + } +} + +void +ControlFlowInfo::printBasicBlockDot() const +{ + printf("digraph {\n"); + for (const auto& basic_block : basicBlocks) { + printf("\t"); + for (uint32_t successorId : basic_block->successorIds) { + printf("%d -> %d; ", basic_block->id, successorId); + } + printf("\n"); + } + printf("}\n"); +} diff --git a/src/gpu-compute/kernel_cfg.hh b/src/gpu-compute/kernel_cfg.hh new file mode 100644 index 000000000..74ea861d8 --- /dev/null +++ b/src/gpu-compute/kernel_cfg.hh @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#ifndef __KERNEL_CFG_HH__ +#define __KERNEL_CFG_HH__ + +#include <cstddef> +#include <cstdint> +#include <memory> +#include <set> +#include <vector> + + +class GPUStaticInst; +class HsailCode; + +struct BasicBlock +{ + BasicBlock(uint32_t num, GPUStaticInst* begin) : + id(num), size(0), firstInstruction(begin) + { + } + + bool + isEntry() const + { + return !id; + } + + bool + isExit() const + { + return !size; + } + + /** + * Unique identifier for the block within a given kernel. + */ + const uint32_t id; + + /** + * Number of instructions contained in the block + */ + size_t size; + + /** + * Pointer to first instruction of the block. + */ + GPUStaticInst* firstInstruction; + + /** + * Identifiers of the blocks that follow (are reachable from) this block. + */ + std::set<uint32_t> successorIds; + + /** + * Identifiers of the blocks that will be visited from this block. 
+ */ + std::set<uint32_t> postDominatorIds; +}; + +class ControlFlowInfo +{ +public: + + /** + * Compute immediate post-dominator instruction for kernel instructions. + */ + static void assignImmediatePostDominators( + const std::vector<GPUStaticInst*>& instructions); + +private: + ControlFlowInfo(const std::vector<GPUStaticInst*>& instructions); + + GPUStaticInst* lastInstruction(const BasicBlock* block) const; + + BasicBlock* basicBlock(int inst_num) const; + + BasicBlock* postDominator(const BasicBlock* block) const; + + void createBasicBlocks(); + + void connectBasicBlocks(); + + void findPostDominators(); + + void findImmediatePostDominators(); + + void printBasicBlocks() const; + + void printBasicBlockDot() const; + + void printPostDominators() const; + + void printImmediatePostDominators() const; + + std::vector<std::unique_ptr<BasicBlock>> basicBlocks; + std::vector<GPUStaticInst*> instructions; +}; + +#endif // __KERNEL_CFG_HH__ diff --git a/src/gpu-compute/lds_state.cc b/src/gpu-compute/lds_state.cc new file mode 100644 index 000000000..91ee8009a --- /dev/null +++ b/src/gpu-compute/lds_state.cc @@ -0,0 +1,341 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
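ControlFlowInfo computes post-dominators with the classic backward dataflow iteration: the exit block post-dominates only itself, every other block starts with the full block set, and each pass replaces a block's set with the intersection of its successors' sets plus the block itself, repeating until nothing changes. The immediate post-dominator is then the closest candidate, the one that does not post-dominate any other candidate. The standalone sketch below runs that fixed point on a small hand-built diamond CFG (block ids and edges are invented) to show the sets it converges to.

// Iterative post-dominator computation on a toy CFG, mirroring
// ControlFlowInfo::findPostDominators. The graph is a diamond:
// 0 -> {1,2}, 1 -> 3, 2 -> 3, 3 -> 4 (exit).
#include <cstdio>
#include <set>
#include <vector>

int
main()
{
    const int n = 5;
    const int exit_id = 4;
    std::vector<std::set<int>> succ(n);
    succ[0] = {1, 2};
    succ[1] = {3};
    succ[2] = {3};
    succ[3] = {4};

    // Initialization: exit post-dominates itself; everyone else starts full.
    std::vector<std::set<int>> pdom(n);
    pdom[exit_id] = {exit_id};
    for (int b = 0; b < exit_id; ++b)
        for (int i = 0; i < n; ++i)
            pdom[b].insert(i);

    // Fixed point: pdom(b) = {b} U intersection of pdom(s) over successors.
    bool change = true;
    while (change) {
        change = false;
        for (int b = exit_id - 1; b >= 0; --b) {
            std::set<int> next = pdom[*succ[b].begin()];
            for (int s : succ[b]) {
                std::set<int> tmp;
                for (int x : next)
                    if (pdom[s].count(x))
                        tmp.insert(x);
                next = tmp;
            }
            next.insert(b);
            if (next != pdom[b]) {
                pdom[b] = next;
                change = true;
            }
        }
    }

    // Prints PD(0) = {0,3,4}, PD(1) = {1,3,4}, ..., PD(4) = {4};
    // block 3 is the immediate post-dominator of block 0.
    for (int b = 0; b < n; ++b) {
        std::printf("PD(%d) = {", b);
        for (int x : pdom[b])
            std::printf(" %d", x);
        std::printf(" }\n");
    }
    return 0;
}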
+ * + * Author: John Kalamatianos, Joe Gross + */ + +#include "gpu-compute/lds_state.hh" + +#include <array> +#include <cstdio> +#include <cstdlib> + +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/shader.hh" + +/** + * the default constructor that works with SWIG + */ +LdsState::LdsState(const Params *params) : + MemObject(params), + tickEvent(this), + cuPort(name() + ".port", this), + maximumSize(params->size), + range(params->range), + bankConflictPenalty(params->bankConflictPenalty), + banks(params->banks) +{ + fatal_if(params->banks <= 0, + "Number of LDS banks should be positive number"); + fatal_if((params->banks & (params->banks - 1)) != 0, + "Number of LDS banks should be a power of 2"); + fatal_if(params->size <= 0, + "cannot allocate an LDS with a size less than 1"); + fatal_if(params->size % 2, + "the LDS should be an even number"); +} + +/** + * Needed by the SWIG compiler + */ +LdsState * +LdsStateParams::create() +{ + return new LdsState(this); +} + +/** + * set the parent and name based on the parent + */ +void +LdsState::setParent(ComputeUnit *x_parent) +{ + // check that this gets assigned to the same thing each time + fatal_if(!x_parent, "x_parent should not be nullptr"); + fatal_if(x_parent == parent, + "should not be setting the parent twice"); + + parent = x_parent; + _name = x_parent->name() + ".LdsState"; +} + +/** + * derive the gpu mem packet from the packet and then count the bank conflicts + */ +unsigned +LdsState::countBankConflicts(PacketPtr packet, unsigned *bankAccesses) +{ + Packet::SenderState *baseSenderState = packet->senderState; + while (baseSenderState->predecessor) { + baseSenderState = baseSenderState->predecessor; + } + const ComputeUnit::LDSPort::SenderState *senderState = + dynamic_cast<ComputeUnit::LDSPort::SenderState *>(baseSenderState); + + fatal_if(!senderState, + "did not get the right sort of sender state"); + + GPUDynInstPtr gpuDynInst = senderState->getMemInst(); + + return countBankConflicts(gpuDynInst, bankAccesses); +} + +// Count the total number of bank conflicts for the local memory packet +unsigned +LdsState::countBankConflicts(GPUDynInstPtr gpuDynInst, + unsigned *numBankAccesses) +{ + int bank_conflicts = 0; + std::vector<int> bank; + // the number of LDS banks being touched by the memory instruction + int numBanks = std::min(parent->wfSize(), banks); + // if the wavefront size is larger than the number of LDS banks, we + // need to iterate over all work items to calculate the total + // number of bank conflicts + int groups = (parent->wfSize() > numBanks) ? 
+ (parent->wfSize() / numBanks) : 1; + for (int i = 0; i < groups; i++) { + // Address Array holding all the work item addresses of an instruction + std::vector<Addr> addr_array; + addr_array.resize(numBanks, 0); + bank.clear(); + bank.resize(banks, 0); + int max_bank = 0; + + // populate the address array for all active work items + for (int j = 0; j < numBanks; j++) { + if (gpuDynInst->exec_mask[(i*numBanks)+j]) { + addr_array[j] = gpuDynInst->addr[(i*numBanks)+j]; + } else { + addr_array[j] = std::numeric_limits<Addr>::max(); + } + } + + if (gpuDynInst->m_op == Enums::MO_LD || + gpuDynInst->m_op == Enums::MO_ST) { + // mask identical addresses + for (int j = 0; j < numBanks; ++j) { + for (int j0 = 0; j0 < j; j0++) { + if (addr_array[j] != std::numeric_limits<Addr>::max() + && addr_array[j] == addr_array[j0]) { + addr_array[j] = std::numeric_limits<Addr>::max(); + } + } + } + } + // calculate bank conflicts + for (int j = 0; j < numBanks; ++j) { + if (addr_array[j] != std::numeric_limits<Addr>::max()) { + int bankId = addr_array[j] % banks; + bank[bankId]++; + max_bank = std::max(max_bank, bank[bankId]); + // Count the number of LDS banks accessed. + // Since we have masked identical addresses all remaining + // accesses will need to be serialized if they access + // the same bank (bank conflict). + (*numBankAccesses)++; + } + } + bank_conflicts += max_bank; + } + panic_if(bank_conflicts > parent->wfSize(), + "Max bank conflicts should match num of work items per instr"); + return bank_conflicts; +} + +/** + * receive the packet from the CU + */ +bool +LdsState::CuSidePort::recvTimingReq(PacketPtr packet) +{ + return ownerLds->processPacket(packet); +} + +GPUDynInstPtr +LdsState::getDynInstr(PacketPtr packet) +{ + ComputeUnit::LDSPort::SenderState *ss = + dynamic_cast<ComputeUnit::LDSPort::SenderState *>( + packet->senderState); + return ss->getMemInst(); +} + +/** + * process an incoming packet, add it to the return queue + */ +bool +LdsState::processPacket(PacketPtr packet) +{ + unsigned bankAccesses = 0; + // the number of conflicts this packet will have when accessing the LDS + unsigned bankConflicts = countBankConflicts(packet, &bankAccesses); + // count the total number of physical LDS bank accessed + parent->ldsBankAccesses += bankAccesses; + // count the LDS bank conflicts. A number set to 1 indicates one + // access per bank maximum so there are no bank conflicts + parent->ldsBankConflictDist.sample(bankConflicts-1); + + GPUDynInstPtr dynInst = getDynInstr(packet); + // account for the LDS bank conflict overhead + int busLength = (dynInst->m_op == Enums::MO_LD) ? parent->loadBusLength() : + (dynInst->m_op == Enums::MO_ST) ? parent->storeBusLength() : + parent->loadBusLength(); + // delay for accessing the LDS + Tick processingTime = + parent->shader->ticks(bankConflicts * bankConflictPenalty) + + parent->shader->ticks(busLength); + // choose (delay + last packet in queue) or (now + delay) as the time to + // return this + Tick doneAt = earliestReturnTime() + processingTime; + // then store it for processing + return returnQueuePush(std::make_pair(doneAt, packet)); +} + +/** + * add this to the queue of packets to be returned + */ +bool +LdsState::returnQueuePush(std::pair<Tick, PacketPtr> thePair) +{ + // TODO add time limits (e.g. 
one packet per cycle) and queue size limits + // and implement flow control + returnQueue.push(thePair); + + // if there is no set wakeup time, look through the queue + if (!tickEvent.scheduled()) { + process(); + } + + return true; +} + +/** + * receive a packet in functional mode + */ +void +LdsState::CuSidePort::recvFunctional(PacketPtr pkt) +{ + fatal("not implemented"); +} + +/** + * receive a retry for a response + */ +void +LdsState::CuSidePort::recvRespRetry() +{ + // TODO verify that this is the right way to do this + assert(ownerLds->isRetryResp()); + ownerLds->setRetryResp(false); + ownerLds->process(); +} + +/** + * receive a retry + */ +void +LdsState::CuSidePort::recvRetry() +{ + fatal("not implemented"); +} + +/** + * look for packets to return at this time + */ +bool +LdsState::process() +{ + Tick now = clockEdge(); + + // send back completed packets + while (!returnQueue.empty() && returnQueue.front().first <= now) { + PacketPtr packet = returnQueue.front().second; + + ComputeUnit::LDSPort::SenderState *ss = + dynamic_cast<ComputeUnit::LDSPort::SenderState *>( + packet->senderState); + + GPUDynInstPtr gpuDynInst = ss->getMemInst(); + + gpuDynInst->initiateAcc(gpuDynInst); + + packet->makeTimingResponse(); + + returnQueue.pop(); + + bool success = cuPort.sendTimingResp(packet); + + if (!success) { + retryResp = true; + panic("have not handled timing responses being NACK'd when sent" + "back"); + } + } + + // determine the next wakeup time + if (!returnQueue.empty()) { + + Tick next = returnQueue.front().first; + + if (tickEvent.scheduled()) { + + if (next < tickEvent.when()) { + + tickEvent.deschedule(); + tickEvent.schedule(next); + } + } else { + tickEvent.schedule(next); + } + } + + return true; +} + +/** + * wake up at this time and perform specified actions + */ +void +LdsState::TickEvent::process() +{ + ldsState->process(); +} + +/** + * + */ +void +LdsState::regStats() +{ +} diff --git a/src/gpu-compute/lds_state.hh b/src/gpu-compute/lds_state.hh new file mode 100644 index 000000000..89f08a1d3 --- /dev/null +++ b/src/gpu-compute/lds_state.hh @@ -0,0 +1,512 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
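countBankConflicts above boils down to: take one wavefront-sized group of lane addresses at a time, drop lanes that are inactive or that duplicate an earlier lane's address (identical load/store addresses are coalesced), bucket the survivors by address modulo the number of banks, and charge the group the maximum number of accesses landing on any single bank. The standalone sketch below uses 4 lanes, 4 banks and made-up addresses so the counting is easy to check; it reproduces only the core arithmetic, not the packet plumbing.

// Counting LDS bank pressure for one group of lanes, mirroring the core
// of LdsState::countBankConflicts. Addresses and sizes are made up.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <limits>
#include <vector>

int
main()
{
    const int banks = 4;
    const uint64_t invalid = std::numeric_limits<uint64_t>::max();

    // Per-lane byte addresses for one group; lane 2 is inactive.
    std::vector<uint64_t> addr = { 0, 5, 3, 8 };
    std::vector<bool> active  = { true, true, false, true };

    // Mask inactive lanes and duplicate addresses (coalesced accesses).
    for (size_t j = 0; j < addr.size(); ++j) {
        if (!active[j])
            addr[j] = invalid;
        for (size_t k = 0; k < j; ++k)
            if (addr[j] != invalid && addr[j] == addr[k])
                addr[j] = invalid;
    }

    // Bucket remaining accesses by bank; the busiest bank determines how
    // many serialized rounds the group needs.
    std::vector<int> perBank(banks, 0);
    int maxBank = 0, bankAccesses = 0;
    for (uint64_t a : addr) {
        if (a == invalid)
            continue;
        int b = static_cast<int>(a % banks); // addresses 0 and 8 share bank 0
        maxBank = std::max(maxBank, ++perBank[b]);
        ++bankAccesses;
    }

    std::printf("bank accesses = %d, accesses to busiest bank = %d "
                "(extra serialized rounds = %d)\n",
                bankAccesses, maxBank, maxBank - 1);
    return 0;
}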
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos, Joe Gross + */ + +#ifndef __LDS_STATE_HH__ +#define __LDS_STATE_HH__ + +#include <array> +#include <queue> +#include <string> +#include <unordered_map> +#include <utility> +#include <vector> + +#include "enums/MemOpType.hh" +#include "enums/MemType.hh" +#include "gpu-compute/misc.hh" +#include "mem/mem_object.hh" +#include "mem/port.hh" +#include "params/LdsState.hh" + +class ComputeUnit; + +/** + * this represents a slice of the overall LDS, intended to be associated with an + * individual workgroup + */ +class LdsChunk +{ + public: + LdsChunk(const uint32_t x_size): + chunk(x_size) + { + } + + LdsChunk() {} + + /** + * a read operation + */ + template<class T> + T + read(const uint32_t index) + { + fatal_if(!chunk.size(), "cannot read from an LDS chunk of size 0"); + fatal_if(index >= chunk.size(), "out-of-bounds access to an LDS chunk"); + T *p0 = (T *) (&(chunk.at(index))); + return *p0; + } + + /** + * a write operation + */ + template<class T> + void + write(const uint32_t index, const T value) + { + fatal_if(!chunk.size(), "cannot write to an LDS chunk of size 0"); + fatal_if(index >= chunk.size(), "out-of-bounds access to an LDS chunk"); + T *p0 = (T *) (&(chunk.at(index))); + *p0 = value; + } + + /** + * get the size of this chunk + */ + std::vector<uint8_t>::size_type + size() const + { + return chunk.size(); + } + + protected: + // the actual data store for this slice of the LDS + std::vector<uint8_t> chunk; +}; + +// Local Data Share (LDS) State per Wavefront (contents of the LDS region +// allocated to the WorkGroup of this Wavefront) +class LdsState: public MemObject +{ + protected: + + /** + * an event to allow event-driven execution + */ + class TickEvent: public Event + { + protected: + + LdsState *ldsState = nullptr; + + Tick nextTick = 0; + + public: + + TickEvent(LdsState *_ldsState) : + ldsState(_ldsState) + { + } + + virtual void + process(); + + void + schedule(Tick when) + { + mainEventQueue[0]->schedule(this, when); + } + + void + deschedule() + { + mainEventQueue[0]->deschedule(this); + } + }; + + /** + * CuSidePort is the LDS Port closer to the CU side + */ + class CuSidePort: public SlavePort + { + public: + CuSidePort(const std::string &_name, LdsState *_ownerLds) : + SlavePort(_name, _ownerLds), ownerLds(_ownerLds) + { + } + + protected: + LdsState *ownerLds; + + virtual bool + recvTimingReq(PacketPtr pkt); + + virtual Tick + recvAtomic(PacketPtr pkt) + { + return 0; + } + + virtual void + recvFunctional(PacketPtr pkt); + + virtual void + recvRangeChange() + { + } + + virtual void + recvRetry(); + + virtual void + recvRespRetry(); + + virtual AddrRangeList + getAddrRanges() const + { + AddrRangeList ranges; + ranges.push_back(ownerLds->getAddrRange()); + return ranges; + } + + template<typename T> + void + loadData(PacketPtr packet); + + template<typename T> + void + storeData(PacketPtr packet); + + template<typename T> + void + atomicOperation(PacketPtr packet); + }; + + protected: + + // the 
lds reference counter + // The key is the workgroup ID and dispatch ID + // The value is the number of wavefronts that reference this LDS, as + // wavefronts are launched, the counter goes up for that workgroup and when + // they return it decreases, once it reaches 0 then this chunk of the LDS is + // returned to the available pool. However,it is deallocated on the 1->0 + // transition, not whenever the counter is 0 as it always starts with 0 when + // the workgroup asks for space + std::unordered_map<uint32_t, + std::unordered_map<uint32_t, int32_t>> refCounter; + + // the map that allows workgroups to access their own chunk of the LDS + std::unordered_map<uint32_t, + std::unordered_map<uint32_t, LdsChunk>> chunkMap; + + // an event to allow the LDS to wake up at a specified time + TickEvent tickEvent; + + // the queue of packets that are going back to the CU after a + // read/write/atomic op + // TODO need to make this have a maximum size to create flow control + std::queue<std::pair<Tick, PacketPtr>> returnQueue; + + // whether or not there are pending responses + bool retryResp = false; + + bool + process(); + + GPUDynInstPtr + getDynInstr(PacketPtr packet); + + bool + processPacket(PacketPtr packet); + + unsigned + countBankConflicts(PacketPtr packet, unsigned *bankAccesses); + + unsigned + countBankConflicts(GPUDynInstPtr gpuDynInst, + unsigned *numBankAccesses); + + public: + typedef LdsStateParams Params; + + LdsState(const Params *params); + + // prevent copy construction + LdsState(const LdsState&) = delete; + + ~LdsState() + { + parent = nullptr; + } + + const Params * + params() const + { + return dynamic_cast<const Params *>(_params); + } + + bool + isRetryResp() const + { + return retryResp; + } + + void + setRetryResp(const bool value) + { + retryResp = value; + } + + // prevent assignment + LdsState & + operator=(const LdsState &) = delete; + + /** + * use the dynamic wave id to create or just increase the reference count + */ + int + increaseRefCounter(const uint32_t dispatchId, const uint32_t wgId) + { + int refCount = getRefCounter(dispatchId, wgId); + fatal_if(refCount < 0, + "reference count should not be below zero"); + return ++refCounter[dispatchId][wgId]; + } + + /** + * decrease the reference count after making sure it is in the list + * give back this chunk if the ref counter has reached 0 + */ + int + decreaseRefCounter(const uint32_t dispatchId, const uint32_t wgId) + { + int refCount = getRefCounter(dispatchId, wgId); + + fatal_if(refCount <= 0, + "reference count should not be below zero or at zero to" + "decrement"); + + refCounter[dispatchId][wgId]--; + + if (refCounter[dispatchId][wgId] == 0) { + releaseSpace(dispatchId, wgId); + return 0; + } else { + return refCounter[dispatchId][wgId]; + } + } + + /** + * return the current reference count for this workgroup id + */ + int + getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const + { + auto dispatchIter = chunkMap.find(dispatchId); + fatal_if(dispatchIter == chunkMap.end(), + "could not locate this dispatch id [%d]", dispatchId); + + auto workgroup = dispatchIter->second.find(wgId); + fatal_if(workgroup == dispatchIter->second.end(), + "could not find this workgroup id within this dispatch id" + " did[%d] wgid[%d]", dispatchId, wgId); + + auto refCountIter = refCounter.find(dispatchId); + if (refCountIter == refCounter.end()) { + fatal("could not locate this dispatch id [%d]", dispatchId); + } else { + auto workgroup = refCountIter->second.find(wgId); + if (workgroup == 
refCountIter->second.end()) { + fatal("could not find this workgroup id within this dispatch id" + " did[%d] wgid[%d]", dispatchId, wgId); + } else { + return refCounter.at(dispatchId).at(wgId); + } + } + + fatal("should not reach this point"); + return 0; + } + + /** + * assign a parent and request this amount of space be set aside + * for this wgid + */ + LdsChunk * + reserveSpace(const uint32_t dispatchId, const uint32_t wgId, + const uint32_t size) + { + if (chunkMap.find(dispatchId) != chunkMap.end()) { + fatal_if( + chunkMap[dispatchId].find(wgId) != chunkMap[dispatchId].end(), + "duplicate workgroup ID asking for space in the LDS " + "did[%d] wgid[%d]", dispatchId, wgId); + } + + fatal_if(bytesAllocated + size > maximumSize, + "request would ask for more space than is available"); + + bytesAllocated += size; + + chunkMap[dispatchId].emplace(wgId, LdsChunk(size)); + // make an entry for this workgroup + refCounter[dispatchId][wgId] = 0; + + return &chunkMap[dispatchId][wgId]; + } + + bool + returnQueuePush(std::pair<Tick, PacketPtr> thePair); + + Tick + earliestReturnTime() const + { + // TODO set to max(lastCommand+1, curTick()) + return returnQueue.empty() ? curTick() : returnQueue.back().first; + } + + void + setParent(ComputeUnit *x_parent); + + void + regStats(); + + // accessors + ComputeUnit * + getParent() const + { + return parent; + } + + std::string + getName() + { + return _name; + } + + int + getBanks() const + { + return banks; + } + + ComputeUnit * + getComputeUnit() const + { + return parent; + } + + int + getBankConflictPenalty() const + { + return bankConflictPenalty; + } + + /** + * get the allocated size for this workgroup + */ + std::size_t + ldsSize(const uint32_t x_wgId) + { + return chunkMap[x_wgId].size(); + } + + AddrRange + getAddrRange() const + { + return range; + } + + virtual BaseSlavePort & + getSlavePort(const std::string& if_name, PortID idx) + { + if (if_name == "cuPort") { + // TODO need to set name dynamically at this point? + return cuPort; + } else { + fatal("cannot resolve the port name " + if_name); + } + } + + /** + * can this much space be reserved for a workgroup? 
+ */ + bool + canReserve(uint32_t x_size) const + { + return bytesAllocated + x_size <= maximumSize; + } + + private: + /** + * give back the space + */ + bool + releaseSpace(const uint32_t x_dispatchId, const uint32_t x_wgId) + { + auto dispatchIter = chunkMap.find(x_dispatchId); + + if (dispatchIter == chunkMap.end()) { + fatal("dispatch id not found [%d]", x_dispatchId); + } else { + auto workgroupIter = dispatchIter->second.find(x_wgId); + if (workgroupIter == dispatchIter->second.end()) { + fatal("workgroup id [%d] not found in dispatch id [%d]", + x_wgId, x_dispatchId); + } + } + + fatal_if(bytesAllocated < chunkMap[x_dispatchId][x_wgId].size(), + "releasing more space than was allocated"); + + bytesAllocated -= chunkMap[x_dispatchId][x_wgId].size(); + chunkMap[x_dispatchId].erase(chunkMap[x_dispatchId].find(x_wgId)); + return true; + } + + // the port that connects this LDS to its owner CU + CuSidePort cuPort; + + ComputeUnit* parent = nullptr; + + std::string _name; + + // the number of bytes currently reserved by all workgroups + int bytesAllocated = 0; + + // the size of the LDS, the most bytes available + int maximumSize; + + // Address range of this memory + AddrRange range; + + // the penalty, in cycles, for each LDS bank conflict + int bankConflictPenalty = 0; + + // the number of banks in the LDS underlying data store + int banks = 0; +}; + +#endif // __LDS_STATE_HH__ diff --git a/src/gpu-compute/local_memory_pipeline.cc b/src/gpu-compute/local_memory_pipeline.cc new file mode 100644 index 000000000..7f919c5f4 --- /dev/null +++ b/src/gpu-compute/local_memory_pipeline.cc @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
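To make the LDS reservation and reference-counting flow above concrete, here is a minimal usage sketch (not part of the patch). It assumes an already-constructed LdsState named lds; the dispatch ID, workgroup ID, and size are illustrative values only.

#include "gpu-compute/lds_state.hh"

// hypothetical helper showing the intended call sequence
void
exampleLdsLifecycle(LdsState &lds)
{
    const uint32_t dispatchId = 0;
    const uint32_t wgId = 7;
    const uint32_t bytes = 256;

    if (!lds.canReserve(bytes))
        return;                                   // no free LDS space

    // carve out a per-workgroup slice; its ref count starts at 0
    LdsChunk *chunk = lds.reserveSpace(dispatchId, wgId, bytes);

    // each wavefront of the workgroup bumps the count as it launches
    lds.increaseRefCounter(dispatchId, wgId);

    // functional accesses within the slice (byte-indexed)
    chunk->write<uint32_t>(0, 0xdeadbeef);
    uint32_t val = chunk->read<uint32_t>(0);
    (void)val;

    // the 1->0 transition releases the slice back to the pool
    lds.decreaseRefCounter(dispatchId, wgId);
}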
+ * + * Author: Sooraj Puthoor + */ + +#include "gpu-compute/local_memory_pipeline.hh" + +#include "debug/GPUPort.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/vector_register_file.hh" +#include "gpu-compute/wavefront.hh" + +LocalMemPipeline::LocalMemPipeline(const ComputeUnitParams* p) : + computeUnit(nullptr), lmQueueSize(p->local_mem_queue_size) +{ +} + +void +LocalMemPipeline::init(ComputeUnit *cu) +{ + computeUnit = cu; + _name = computeUnit->name() + ".LocalMemPipeline"; +} + +void +LocalMemPipeline::exec() +{ + // apply any returned shared (LDS) memory operations + GPUDynInstPtr m = !lmReturnedRequests.empty() ? + lmReturnedRequests.front() : nullptr; + + bool accessVrf = true; + if ((m) && (m->m_op==Enums::MO_LD || MO_A(m->m_op))) { + Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId]; + + accessVrf = + w->computeUnit->vrf[m->simdId]-> + vrfOperandAccessReady(m->seqNum(), w, m, + VrfAccessType::WRITE); + } + + if (!lmReturnedRequests.empty() && m->latency.rdy() && accessVrf && + computeUnit->locMemToVrfBus.rdy() && (computeUnit->shader->coissue_return + || computeUnit->wfWait.at(m->pipeId).rdy())) { + if (m->v_type == VT_32 && m->m_type == Enums::M_U8) + doSmReturn<uint32_t, uint8_t>(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_U16) + doSmReturn<uint32_t, uint16_t>(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_U32) + doSmReturn<uint32_t, uint32_t>(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_S8) + doSmReturn<int32_t, int8_t>(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_S16) + doSmReturn<int32_t, int16_t>(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_S32) + doSmReturn<int32_t, int32_t>(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_F16) + doSmReturn<float, Float16>(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_F32) + doSmReturn<float, float>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_U8) + doSmReturn<uint64_t, uint8_t>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_U16) + doSmReturn<uint64_t, uint16_t>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_U32) + doSmReturn<uint64_t, uint32_t>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_U64) + doSmReturn<uint64_t, uint64_t>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_S8) + doSmReturn<int64_t, int8_t>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_S16) + doSmReturn<int64_t, int16_t>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_S32) + doSmReturn<int64_t, int32_t>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_S64) + doSmReturn<int64_t, int64_t>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_F16) + doSmReturn<double, Float16>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_F32) + doSmReturn<double, float>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_F64) + doSmReturn<double, double>(m); + } + + // If pipeline has executed a local memory instruction + // execute local memory packet and issue the packets + // to LDS + if (!lmIssuedRequests.empty() && lmReturnedRequests.size() < lmQueueSize) { + + GPUDynInstPtr m = lmIssuedRequests.front(); + + bool returnVal = computeUnit->sendToLds(m); + if (!returnVal) { + DPRINTF(GPUPort, "packet was nack'd and put in retry queue"); + } + lmIssuedRequests.pop(); + } +} + +template<typename c0, typename c1> +void +LocalMemPipeline::doSmReturn(GPUDynInstPtr m) +{ + 
lmReturnedRequests.pop(); + Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId]; + + // Return data to registers + if (m->m_op == Enums::MO_LD || MO_A(m->m_op)) { + std::vector<uint32_t> regVec; + for (int k = 0; k < m->n_reg; ++k) { + int dst = m->dst_reg+k; + + if (m->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST) + dst = m->dst_reg_vec[k]; + // virtual->physical VGPR mapping + int physVgpr = w->remap(dst,sizeof(c0),1); + // save the physical VGPR index + regVec.push_back(physVgpr); + c1 *p1 = &((c1*)m->d_data)[k * VSZ]; + + for (int i = 0; i < VSZ; ++i) { + if (m->exec_mask[i]) { + // write the value into the physical VGPR. This is a purely + // functional operation. No timing is modeled. + w->computeUnit->vrf[w->simdId]->write<c0>(physVgpr, + *p1, i); + } + ++p1; + } + } + + // Schedule the write operation of the load data on the VRF. This simply + // models the timing aspect of the VRF write operation. It does not + // modify the physical VGPR. + loadVrfBankConflictCycles += + w->computeUnit->vrf[w->simdId]->exec(m->seqNum(), w, + regVec, sizeof(c0), m->time); + } + + // Decrement outstanding request count + computeUnit->shader->ScheduleAdd(&w->outstanding_reqs, m->time, -1); + + if (m->m_op == Enums::MO_ST || MO_A(m->m_op) || MO_ANR(m->m_op) + || MO_H(m->m_op)) { + computeUnit->shader->ScheduleAdd(&w->outstanding_reqs_wr_lm, + m->time, -1); + } + + if (m->m_op == Enums::MO_LD || MO_A(m->m_op) || MO_ANR(m->m_op)) { + computeUnit->shader->ScheduleAdd(&w->outstanding_reqs_rd_lm, + m->time, -1); + } + + // Mark write bus busy for appropriate amount of time + computeUnit->locMemToVrfBus.set(m->time); + if (computeUnit->shader->coissue_return == 0) + w->computeUnit->wfWait.at(m->pipeId).set(m->time); +} + +void +LocalMemPipeline::regStats() +{ + loadVrfBankConflictCycles + .name(name() + ".load_vrf_bank_conflict_cycles") + .desc("total number of cycles LDS data are delayed before updating " + "the VRF") + ; +} diff --git a/src/gpu-compute/local_memory_pipeline.hh b/src/gpu-compute/local_memory_pipeline.hh new file mode 100644 index 000000000..a63d867d0 --- /dev/null +++ b/src/gpu-compute/local_memory_pipeline.hh @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Sooraj Puthoor + */ + +#ifndef __LOCAL_MEMORY_PIPELINE_HH__ +#define __LOCAL_MEMORY_PIPELINE_HH__ + +#include <queue> +#include <string> + +#include "gpu-compute/misc.hh" +#include "params/ComputeUnit.hh" +#include "sim/stats.hh" + +/* + * @file local_memory_pipeline.hh + * + * The local memory pipeline issues newly created local memory packets + * from pipeline to the LDS. This stage also retires previously issued + * loads and stores that have returned from the LDS. + */ + +class ComputeUnit; +class Wavefront; + +class LocalMemPipeline +{ + public: + LocalMemPipeline(const ComputeUnitParams *params); + void init(ComputeUnit *cu); + void exec(); + + template<typename c0, typename c1> void doSmReturn(GPUDynInstPtr m); + + std::queue<GPUDynInstPtr> &getLMReqFIFO() { return lmIssuedRequests; } + std::queue<GPUDynInstPtr> &getLMRespFIFO() { return lmReturnedRequests; } + + bool + isLMRespFIFOWrRdy() const + { + return lmReturnedRequests.size() < lmQueueSize; + } + + bool + isLMReqFIFOWrRdy(uint32_t pendReqs=0) const + { + return (lmIssuedRequests.size() + pendReqs) < lmQueueSize; + } + + const std::string& name() const { return _name; } + void regStats(); + + private: + ComputeUnit *computeUnit; + std::string _name; + int lmQueueSize; + Stats::Scalar loadVrfBankConflictCycles; + // Local Memory Request Fifo: all shared memory requests + // are issued to this FIFO from the memory pipelines + std::queue<GPUDynInstPtr> lmIssuedRequests; + + // Local Memory Response Fifo: all responses of shared memory + // requests are sent to this FIFO from LDS + std::queue<GPUDynInstPtr> lmReturnedRequests; +}; + +#endif // __LOCAL_MEMORY_PIPELINE_HH__ diff --git a/src/gpu-compute/misc.hh b/src/gpu-compute/misc.hh new file mode 100644 index 000000000..4f8032832 --- /dev/null +++ b/src/gpu-compute/misc.hh @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
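The request/response FIFOs declared in local_memory_pipeline.hh are the only interface other stages use. A hedged sketch of how a producer is expected to hand an LDS instruction to this stage (the LocalMemPipeline and GPUDynInstPtr objects are assumed to already exist):

#include "gpu-compute/local_memory_pipeline.hh"

// illustrative helper, not part of the patch
void
issueToLocalMemory(LocalMemPipeline &lmPipe, GPUDynInstPtr inst)
{
    // honor the queue-size flow control before enqueueing
    if (lmPipe.isLMReqFIFOWrRdy()) {
        lmPipe.getLMReqFIFO().push(inst);
    } else {
        // the caller must stall this cycle and retry later
    }
}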
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#ifndef __MISC_HH__ +#define __MISC_HH__ + +#include <bitset> +#include <memory> + +#include "base/misc.hh" + +class GPUDynInst; + +// wavefront size of the machine +static const int VSZ = 64; + +/* + This check is necessary because std::bitset only provides conversion to + unsigned long or unsigned long long via to_ulong() or to_ullong(). there are + a few places in the code where to_ullong() is used, however if VSZ is larger + than a value the host can support then bitset will throw a runtime exception. + + we should remove all use of to_long() or to_ullong() so we can have VSZ + greater than 64b, however until that is done this assert is required. + */ +static_assert(VSZ <= sizeof(unsigned long long) * 8, + "VSZ is larger than the host can support"); + +typedef std::bitset<VSZ> VectorMask; +typedef std::shared_ptr<GPUDynInst> GPUDynInstPtr; + +class WaitClass +{ + public: + WaitClass() : nxtAvail(0), lookAheadAvail(0), tcnt(0) { } + void init(uint64_t *_tcnt, uint32_t _numStages=0) + { + tcnt = _tcnt; + numStages = _numStages; + } + + void set(uint32_t i) + { + fatal_if(nxtAvail > *tcnt, + "Can't allocate resource because it is busy!!!"); + nxtAvail = *tcnt + i; + } + void preset(uint32_t delay) + { + lookAheadAvail = std::max(lookAheadAvail, delay + (*tcnt) - numStages); + } + bool rdy() const { return *tcnt >= nxtAvail; } + bool prerdy() const { return *tcnt >= lookAheadAvail; } + + private: + // timestamp indicating when resource will be available + uint64_t nxtAvail; + // timestamp indicating when resource will be available including + // pending uses of the resource (when there is a cycle gap between + // rdy() and set() + uint64_t lookAheadAvail; + // current timestamp + uint64_t *tcnt; + // number of stages between checking if a resource is ready and + // setting the resource's utilization + uint32_t numStages; +}; + +class Float16 +{ + public: + uint16_t val; + + Float16() { val = 0; } + + Float16(const Float16 &x) : val(x.val) { } + + Float16(float x) + { + uint32_t ai = *(uint32_t *)&x; + + uint32_t s = (ai >> 31) & 0x1; + uint32_t exp = (ai >> 23) & 0xff; + uint32_t mant = (ai >> 0) & 0x7fffff; + + if (exp == 0 || exp <= 0x70) { + exp = 0; + mant = 0; + } else if (exp == 0xff) { + exp = 0x1f; + } else if (exp >= 0x8f) { + exp = 0x1f; + mant = 0; + } else { + exp = exp - 0x7f + 0x0f; + } + + mant = mant >> 13; + + val = 0; + val |= (s << 15); + val |= (exp << 10); + val |= (mant << 0); + } + + operator float() const + { + uint32_t s = (val >> 15) & 0x1; + uint32_t exp = (val >> 10) & 0x1f; + uint32_t mant = (val >> 0) & 0x3ff; + + if (!exp) { + exp = 0; + mant = 0; + } else if (exp == 0x1f) { + exp = 0xff; + } else { + exp = exp - 0x0f + 0x7f; + } + + uint32_t val1 = 0; + val1 |= (s << 31); + val1 |= (exp << 23); + val1 |= (mant << 13); + + return *(float*)&val1; + } +}; + +#endif // __MISC_HH__ diff --git a/src/gpu-compute/ndrange.hh b/src/gpu-compute/ndrange.hh new file mode 100644 index 
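Since misc.hh is included throughout this model, a small sketch of the two helpers it defines may help. This is illustrative only and assumes a host where float is IEEE-754 binary32.

#include <cassert>

#include "gpu-compute/misc.hh"

void
miscHelpersExample()
{
    // Float16 keeps the sign, a rebiased 5-bit exponent, and the top 10
    // mantissa bits; 1.5f survives the round trip exactly.
    Float16 h(1.5f);
    float back = h;              // operator float()
    assert(back == 1.5f);

    // one execution-mask bit per work item in a wavefront (VSZ lanes)
    VectorMask mask;
    mask.set(0);
    assert(mask.count() == 1 && mask.size() == VSZ);
}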
000000000..d1ad35d4b --- /dev/null +++ b/src/gpu-compute/ndrange.hh @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#ifndef __NDRANGE_HH__ +#define __NDRANGE_HH__ + +#include "base/types.hh" +#include "gpu-compute/qstruct.hh" + +struct NDRange +{ + // copy of the queue entry provided at dispatch + HsaQueueEntry q; + + // The current workgroup id (3 dimensions) + int wgId[3]; + // The number of workgroups in each dimension + int numWg[3]; + // The total number of workgroups + int numWgTotal; + + // The number of completed work groups + int numWgCompleted; + // The global workgroup ID + uint32_t globalWgId; + + // flag indicating whether all work groups have been launched + bool wg_disp_rem; + // kernel complete + bool execDone; + bool userDoorBellSet; + volatile bool *addrToNotify; + volatile uint32_t *numDispLeft; + int dispatchId; + int curTid; // Current thread id +}; + +#endif // __NDRANGE_HH__ diff --git a/src/gpu-compute/of_scheduling_policy.cc b/src/gpu-compute/of_scheduling_policy.cc new file mode 100644 index 000000000..7f114706a --- /dev/null +++ b/src/gpu-compute/of_scheduling_policy.cc @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
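NDRange is plain bookkeeping state. The sketch below shows one plausible way a dispatcher might initialize the workgroup counts from the queue entry; the ceiling division and zero-initialization are assumptions of this example, not taken from the patch.

#include "gpu-compute/misc.hh"      // qstruct.hh relies on VSZ from here
#include "gpu-compute/ndrange.hh"

void
initWorkgroupCounts(NDRange &ndr)
{
    ndr.numWgTotal = 1;
    for (int d = 0; d < 3; ++d) {
        // workgroups per dimension, rounding the grid size up
        ndr.numWg[d] = (ndr.q.gdSize[d] + ndr.q.wgSize[d] - 1) /
                        ndr.q.wgSize[d];
        ndr.wgId[d] = 0;
        ndr.numWgTotal *= ndr.numWg[d];
    }
    ndr.numWgCompleted = 0;
    ndr.globalWgId = 0;
    ndr.wg_disp_rem = true;          // work groups remain to be launched
    ndr.execDone = false;
}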
Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Sooraj Puthoor + */ + +#include "gpu-compute/of_scheduling_policy.hh" + +#include "gpu-compute/wavefront.hh" + +Wavefront* +OFSchedulingPolicy::chooseWave() +{ + // Set when policy choose a wave to schedule + bool waveChosen = false; + Wavefront *selectedWave = nullptr; + int selectedWaveID = -1; + uint32_t selectedPosition = 0; + + for (int position = 0; position < scheduleList->size(); ++position) { + Wavefront *curWave = scheduleList->at(position); + uint32_t curWaveID = curWave->wfDynId; + + // Choosed wave with the lowest wave ID + if (selectedWaveID == -1 || curWaveID < selectedWaveID) { + waveChosen = true; + selectedWaveID = curWaveID; + selectedWave = curWave; + selectedPosition = position; + } + } + + // Check to make sure ready list had atleast one schedulable wave + if (waveChosen) { + scheduleList->erase(scheduleList->begin() + selectedPosition); + } else { + panic("Empty ready list"); + } + + return selectedWave; +} + +void +OFSchedulingPolicy::bindList(std::vector<Wavefront*> *list) +{ + scheduleList = list; +} diff --git a/src/gpu-compute/of_scheduling_policy.hh b/src/gpu-compute/of_scheduling_policy.hh new file mode 100644 index 000000000..684e51a3a --- /dev/null +++ b/src/gpu-compute/of_scheduling_policy.hh @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
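A short, hedged sketch of how the oldest-first policy above is meant to be driven; the vector of Wavefront pointers is assumed to be populated elsewhere, with wfDynId assigned at wave creation.

#include <vector>

#include "gpu-compute/of_scheduling_policy.hh"

Wavefront*
pickOldestWave(std::vector<Wavefront*> &readyWaves)
{
    OFSchedulingPolicy oldestFirst;
    oldestFirst.bindList(&readyWaves);  // the policy keeps a pointer, not a copy

    // removes the wave with the smallest wfDynId from readyWaves and
    // returns it; panics if the list is empty
    return oldestFirst.chooseWave();
}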
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Sooraj Puthoor + */ + +#ifndef __OF_SCHEDULING_POLICY_HH__ +#define __OF_SCHEDULING_POLICY_HH__ + +#include <cstddef> +#include <vector> + +#include "base/misc.hh" + +class Wavefront; + +// Oldest First where age is marked by the wave id +class OFSchedulingPolicy +{ + public: + OFSchedulingPolicy() : scheduleList(nullptr) { } + + Wavefront* chooseWave(); + void bindList(std::vector<Wavefront*> *list); + + private: + // List of waves which are participating in scheduling. + // This scheduler selects the oldest wave from this list + std::vector<Wavefront*> *scheduleList; +}; + +#endif // __OF_SCHEDULING_POLICY_HH__ diff --git a/src/gpu-compute/pool_manager.cc b/src/gpu-compute/pool_manager.cc new file mode 100644 index 000000000..b1bc6b1f3 --- /dev/null +++ b/src/gpu-compute/pool_manager.cc @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos + */ + +#include "gpu-compute/pool_manager.hh" + +PoolManager::PoolManager(uint32_t minAlloc, uint32_t poolSize) + : _minAllocation(minAlloc), _poolSize(poolSize) +{ + assert(poolSize > 0); +} diff --git a/src/gpu-compute/pool_manager.hh b/src/gpu-compute/pool_manager.hh new file mode 100644 index 000000000..2cb53ce72 --- /dev/null +++ b/src/gpu-compute/pool_manager.hh @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos + */ + +#ifndef __POOL_MANAGER_HH__ +#define __POOL_MANAGER_HH__ + +#include <cassert> +#include <cstdint> +#include <string> + +// Pool Manager Logic +class PoolManager +{ + public: + PoolManager(uint32_t minAlloc, uint32_t poolSize); + uint32_t minAllocation() { return _minAllocation; } + virtual std::string printRegion() = 0; + virtual uint32_t regionSize(std::pair<uint32_t,uint32_t> ®ion) = 0; + virtual bool canAllocate(uint32_t numRegions, uint32_t size) = 0; + + virtual uint32_t allocateRegion(const uint32_t size, + uint32_t *reserved) = 0; + + virtual void freeRegion(uint32_t firstIdx, uint32_t lastIdx) = 0; + uint32_t poolSize() { return _poolSize; } + + private: + // minimum size that can be reserved per allocation + uint32_t _minAllocation; + // pool size in number of elements + uint32_t _poolSize; +}; + +#endif // __POOL_MANAGER_HH__ diff --git a/src/gpu-compute/qstruct.hh b/src/gpu-compute/qstruct.hh new file mode 100644 index 000000000..092303c00 --- /dev/null +++ b/src/gpu-compute/qstruct.hh @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
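PoolManager is purely abstract. The hypothetical subclass below (not part of this patch, with a deliberately naive bump-pointer policy) only illustrates what the pure-virtual interface requires of a concrete register-pool manager.

#include <string>
#include <utility>

#include "gpu-compute/pool_manager.hh"

class BumpPoolManager : public PoolManager
{
  public:
    BumpPoolManager(uint32_t minAlloc, uint32_t poolSize)
        : PoolManager(minAlloc, poolSize), nextFree(0) { }

    std::string printRegion() override { return "bump pool"; }

    uint32_t
    regionSize(std::pair<uint32_t, uint32_t> &region) override
    {
        // region is treated here as an inclusive [first, last] index pair
        return region.second - region.first + 1;
    }

    bool
    canAllocate(uint32_t numRegions, uint32_t size) override
    {
        return nextFree + numRegions * size <= poolSize();
    }

    uint32_t
    allocateRegion(const uint32_t size, uint32_t *reserved) override
    {
        uint32_t start = nextFree;
        nextFree += size;
        *reserved = size;
        return start;
    }

    void freeRegion(uint32_t, uint32_t) override { }   // never reclaims

  private:
    uint32_t nextFree;
};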
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Brad Beckmann, Marc Orr + */ + +#ifndef __Q_STRUCT_HH__ +#define __Q_STRUCT_HH__ + +#include <bitset> +#include <cstdint> + +// Maximum number of arguments +static const int KER_NUM_ARGS = 32; +// Kernel argument buffer size +static const int KER_ARGS_LENGTH = 512; + +class LdsChunk; +struct NDRange; + +// Be very careful of alignment in this structure. The structure +// must compile to the same layout in both 32-bit and 64-bit mode. +struct HsaQueueEntry +{ + // Base pointer for array of instruction pointers + uint64_t code_ptr; + // Grid Size (3 dimensions) + uint32_t gdSize[3]; + // Workgroup Size (3 dimensions) + uint32_t wgSize[3]; + uint16_t sRegCount; + uint16_t dRegCount; + uint16_t cRegCount; + uint64_t privMemStart; + uint32_t privMemPerItem; + uint32_t privMemTotal; + uint64_t spillMemStart; + uint32_t spillMemPerItem; + uint32_t spillMemTotal; + uint64_t roMemStart; + uint32_t roMemTotal; + // Size (in bytes) of LDS + uint32_t ldsSize; + // Virtual Memory Id (unused right now) + uint32_t vmId; + + // Pointer to dependency chain (unused now) + uint64_t depends; + + // pointer to bool + uint64_t addrToNotify; + // pointer to uint32_t + uint64_t numDispLeft; + + // variables to pass arguments when running in standalone mode, + // will be removed when run.py and sh.cpp have been updated to + // use args and offset arrays + uint64_t arg1; + uint64_t arg2; + uint64_t arg3; + uint64_t arg4; + + // variables to pass arguments when running in cpu+gpu mode + uint8_t args[KER_ARGS_LENGTH]; + uint16_t offsets[KER_NUM_ARGS]; + uint16_t num_args; +}; + +// State used to start (or restart) a WF +struct WFContext +{ + // 32 bit values + // barrier state + int bar_cnt[VSZ]; + + // id (which WF in the WG) + int cnt; + + // more barrier state + int max_bar_cnt; + int old_barrier_cnt; + int barrier_cnt; + + // More Program Counter Stuff + uint32_t pc; + + // Program counter of the immediate post-dominator instruction + uint32_t rpc; + + // WG wide state (I don't see how to avoid redundancy here) + int cu_id; + uint32_t wg_id; + uint32_t barrier_id; + + // 64 bit values (these values depend on the wavefront size) + // masks + uint64_t init_mask; + uint64_t exec_mask; + + // private memory; + Addr privBase; + Addr spillBase; + + LdsChunk *ldsChunk; + + /* + * Kernel wide state + * This is a hack. This state should be moved through simulated memory + * during a yield. Though not much is being used here, so it's probably + * probably not a big deal. + * + * Just to add to this comment... The ndr is derived from simulated + * memory when the cl-runtime allocates an HsaQueueEntry and populates it + * for a kernel launch. So in theory the runtime should be able to keep + * that state around. 
Then a WF can reference it upon restart to derive + * kernel wide state. The runtime can deallocate the state when the + * kernel completes. + */ + NDRange *ndr; +}; + +// State that needs to be passed between the simulation and simulated app, a +// pointer to this struct can be passed through the depends field in the +// HsaQueueEntry struct +struct HostState +{ + // cl_event* has original HsaQueueEntry for init + uint64_t event; +}; + +// Total number of HSA queues +static const int HSAQ_NQUEUES = 8; + +// These values will eventually live in memory mapped registers +// and be settable by the kernel mode driver. + +// Number of entries in each HSA queue +static const int HSAQ_SIZE = 64; +// Address of first HSA queue index +static const int HSAQ_INDX_BASE = 0x10000ll; +// Address of first HSA queue +static const int HSAQ_BASE = 0x11000ll; +// Suggested start of HSA code +static const int HSA_CODE_BASE = 0x18000ll; + +// These are shortcuts for deriving the address of a specific +// HSA queue or queue index +#define HSAQ(n) (HSAQ_BASE + HSAQ_SIZE * sizeof(struct fsaQueue) * n) +#define HSAQE(n,i) (HSAQ_BASE + (HSAQ_SIZE * n + i) * sizeof(struct fsaQueue)) +#define HSAQ_RI(n) (HSAQ_INDX_BASE + sizeof(int) * (n * 3 + 0)) +#define HSAQ_WI(n) (HSAQ_INDX_BASE + sizeof(int) * (n * 3 + 1)) +#define HSAQ_CI(n) (HSAQ_INDX_BASE + sizeof(int) * (n * 3 + 2)) + +/* + * Example code for writing to a queue + * + * void + * ToQueue(int n,struct fsaQueue *val) + * { + * int wi = *(int*)HSAQ_WI(n); + * int ri = *(int*)HSAQ_RI(n); + * int ci = *(int*)HSAQ_CI(n); + * + * if (ci - ri < HSAQ_SIZE) { + * (*(int*)HSAQ_CI(n))++; + * *(HsaQueueEntry*)(HSAQE(n, (wi % HSAQ_SIZE))) = *val; + * (*(int*)HSAQ_WI(n))++; + * } + * } + */ + +#endif // __Q_STRUCT_HH__ diff --git a/src/gpu-compute/rr_scheduling_policy.cc b/src/gpu-compute/rr_scheduling_policy.cc new file mode 100644 index 000000000..5d3591901 --- /dev/null +++ b/src/gpu-compute/rr_scheduling_policy.cc @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
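The queue/index macros in qstruct.hh are easiest to read with concrete numbers. A small illustrative helper follows; note that qstruct.hh picks up VSZ from misc.hh, and sizeof(int) == 4 is assumed here.

#include <cstdio>

#include "gpu-compute/misc.hh"      // provides VSZ used by qstruct.hh
#include "gpu-compute/qstruct.hh"

void
printQueueIndexAddrs()
{
    // for queue n = 2 the read/write/consume indices land at
    // 0x10000 + 4 * {6, 7, 8} = 0x10018, 0x1001c, 0x10020
    std::printf("RI=%#lx WI=%#lx CI=%#lx\n",
                (unsigned long)HSAQ_RI(2),
                (unsigned long)HSAQ_WI(2),
                (unsigned long)HSAQ_CI(2));
}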
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Sooraj Puthoor + */ + +#include "gpu-compute/rr_scheduling_policy.hh" + +#include "gpu-compute/wavefront.hh" + +Wavefront* +RRSchedulingPolicy::chooseWave() +{ + Wavefront *selectedWave = nullptr; + + // Check to make sure ready list had atleast one schedulable wave + if (scheduleList->size()) { + // For RR policy, select the wave which is at the + // front of the list. The selected wave is popped + // out from the schedule list immediately after selection + // to avoid starvation. It is the responsibility of the + // module invoking the RR scheduler to make surei scheduling + // eligible waves are added to the back of the schedule + // list + selectedWave = scheduleList->front(); + scheduleList->erase(scheduleList->begin() + 0); + } else { + panic("Empty ready list"); + } + + return selectedWave; +} + +void +RRSchedulingPolicy::bindList(std::vector<Wavefront*> *list) +{ + scheduleList = list; +} diff --git a/src/gpu-compute/rr_scheduling_policy.hh b/src/gpu-compute/rr_scheduling_policy.hh new file mode 100644 index 000000000..780f294aa --- /dev/null +++ b/src/gpu-compute/rr_scheduling_policy.hh @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
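As the comment in RRSchedulingPolicy::chooseWave() notes, the chosen wave is popped from the front and the caller is responsible for re-appending it once it becomes schedulable again. A minimal illustrative driver:

#include <vector>

#include "gpu-compute/rr_scheduling_policy.hh"

Wavefront*
pickRoundRobinWave(std::vector<Wavefront*> &readyWaves)
{
    RRSchedulingPolicy roundRobin;
    roundRobin.bindList(&readyWaves);

    // takes the wave at the front of the list; panics if the list is empty
    Wavefront *wave = roundRobin.chooseWave();

    // later, when `wave` is eligible again, the caller re-adds it:
    //     readyWaves.push_back(wave);
    return wave;
}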
+ * + * Author: Sooraj Puthoor + */ + +#ifndef __RR_SCHEDULING_POLICY_HH__ +#define __RR_SCHEDULING_POLICY_HH__ + +#include <inttypes.h> + +#include <cstddef> +#include <utility> +#include <vector> + +#include "base/misc.hh" + +class Wavefront; + +// Round-Robin pick among the list of ready waves +class RRSchedulingPolicy +{ + public: + RRSchedulingPolicy() : scheduleList(nullptr) { } + + Wavefront* chooseWave(); + void bindList(std::vector<Wavefront*> *list); + + private: + // List of waves which are participating in scheduling. + // This scheduler selects one wave from this list based on + // round robin policy + std::vector<Wavefront*> *scheduleList; +}; + +#endif // __RR_SCHEDULING_POLICY_HH__ diff --git a/src/gpu-compute/schedule_stage.cc b/src/gpu-compute/schedule_stage.cc new file mode 100644 index 000000000..068136026 --- /dev/null +++ b/src/gpu-compute/schedule_stage.cc @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Sooraj Puthoor + */ + +#include "gpu-compute/schedule_stage.hh" + +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_static_inst.hh" +#include "gpu-compute/vector_register_file.hh" +#include "gpu-compute/wavefront.hh" + +ScheduleStage::ScheduleStage(const ComputeUnitParams *p) + : numSIMDs(p->num_SIMDs), + numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes) +{ + for (int j = 0; j < numSIMDs + numMemUnits; ++j) { + Scheduler newScheduler(p); + scheduler.push_back(newScheduler); + } +} + +ScheduleStage::~ScheduleStage() +{ + scheduler.clear(); + waveStatusList.clear(); +} + +void +ScheduleStage::init(ComputeUnit *cu) +{ + computeUnit = cu; + _name = computeUnit->name() + ".ScheduleStage"; + + for (int j = 0; j < numSIMDs + numMemUnits; ++j) { + scheduler[j].bindList(&computeUnit->readyList[j]); + } + + for (int j = 0; j < numSIMDs; ++j) { + waveStatusList.push_back(&computeUnit->waveStatusList[j]); + } + + dispatchList = &computeUnit->dispatchList; +} + +void +ScheduleStage::arbitrate() +{ + // iterate over all Memory pipelines + for (int j = numSIMDs; j < numSIMDs + numMemUnits; ++j) { + if (dispatchList->at(j).first) { + Wavefront *waveToMemPipe = dispatchList->at(j).first; + // iterate over all execution pipelines + for (int i = 0; i < numSIMDs + numMemUnits; ++i) { + if ((i != j) && (dispatchList->at(i).first)) { + Wavefront *waveToExePipe = dispatchList->at(i).first; + // if the two selected wavefronts are mapped to the same + // SIMD unit then they share the VRF + if (waveToMemPipe->simdId == waveToExePipe->simdId) { + int simdId = waveToMemPipe->simdId; + // Read VRF port arbitration: + // If there are read VRF port conflicts between the + // a memory and another instruction we drop the other + // instruction. We don't need to check for write VRF + // port conflicts because the memory instruction either + // does not need to write to the VRF (store) or will + // write to the VRF when the data comes back (load) in + // which case the arbiter of the memory pipes will + // resolve any conflicts + if (computeUnit->vrf[simdId]-> + isReadConflict(waveToMemPipe->wfSlotId, + waveToExePipe->wfSlotId)) { + // FIXME: The "second" member variable is never + // used in the model. 
I am setting it to READY + // simply to follow the protocol of setting it + // when the WF has an instruction ready to issue + waveStatusList[simdId]->at(waveToExePipe->wfSlotId) + .second = READY; + + dispatchList->at(i).first = nullptr; + dispatchList->at(i).second = EMPTY; + break; + } + } + } + } + } + } +} + +void +ScheduleStage::exec() +{ + for (int j = 0; j < numSIMDs + numMemUnits; ++j) { + uint32_t readyListSize = computeUnit->readyList[j].size(); + + // If no wave is ready to be scheduled on the execution resource + // then skip scheduling for this execution resource + if (!readyListSize) { + continue; + } + + Wavefront *waveToBeDispatched = scheduler[j].chooseWave(); + dispatchList->at(j).first = waveToBeDispatched; + waveToBeDispatched->updateResources(); + dispatchList->at(j).second = FILLED; + + waveStatusList[waveToBeDispatched->simdId]->at( + waveToBeDispatched->wfSlotId).second = BLOCKED; + + assert(computeUnit->readyList[j].size() == readyListSize - 1); + } + // arbitrate over all shared resources among instructions being issued + // simultaneously + arbitrate(); +} + +void +ScheduleStage::regStats() +{ +} diff --git a/src/gpu-compute/schedule_stage.hh b/src/gpu-compute/schedule_stage.hh new file mode 100644 index 000000000..26eb9a25b --- /dev/null +++ b/src/gpu-compute/schedule_stage.hh @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Sooraj Puthoor + */ + +#ifndef __SCHEDULE_STAGE_HH__ +#define __SCHEDULE_STAGE_HH__ + +#include <utility> +#include <vector> + +#include "gpu-compute/exec_stage.hh" +#include "gpu-compute/scheduler.hh" +#include "gpu-compute/scoreboard_check_stage.hh" + +// Schedule or execution arbitration stage. +// From the pool of ready waves in the ready list, +// one wave is selected for each execution resource. 
+// The selection is made based on a scheduling policy + +class ComputeUnit; +class Wavefront; + +struct ComputeUnitParams; + +class ScheduleStage +{ + public: + ScheduleStage(const ComputeUnitParams *params); + ~ScheduleStage(); + void init(ComputeUnit *cu); + void exec(); + void arbitrate(); + // Stats related variables and methods + std::string name() { return _name; } + void regStats(); + + private: + ComputeUnit *computeUnit; + uint32_t numSIMDs; + uint32_t numMemUnits; + + // Each execution resource will have its own + // scheduler and a dispatch list + std::vector<Scheduler> scheduler; + + // Stores the status of waves. A READY implies the + // wave is ready to be scheduled this cycle and + // is already present in the readyList + std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>*> + waveStatusList; + + // List of waves which will be dispatched to + // each execution resource. A FILLED implies + // dispatch list is non-empty and + // execution unit has something to execute + // this cycle. Currently, the dispatch list of + // an execution resource can hold only one wave because + // an execution resource can execute only one wave in a cycle. + std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> *dispatchList; + + std::string _name; +}; + +#endif // __SCHEDULE_STAGE_HH__ diff --git a/src/gpu-compute/scheduler.cc b/src/gpu-compute/scheduler.cc new file mode 100644 index 000000000..1cd0bfe55 --- /dev/null +++ b/src/gpu-compute/scheduler.cc @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Sooraj Puthoor + */ + +#include "gpu-compute/scheduler.hh" + +Scheduler::Scheduler(const ComputeUnitParams *p) +{ + if (p->execPolicy == "OLDEST-FIRST") { + schedPolicy = SCHED_POLICY::OF_POLICY; + } else if (p->execPolicy == "ROUND-ROBIN") { + schedPolicy = SCHED_POLICY::RR_POLICY; + } else { + fatal("Unimplemented scheduling policy"); + } +} + +Wavefront* +Scheduler::chooseWave() +{ + if (schedPolicy == SCHED_POLICY::OF_POLICY) { + return OFSchedPolicy.chooseWave(); + } else if (schedPolicy == SCHED_POLICY::RR_POLICY) { + return RRSchedPolicy.chooseWave(); + } else { + fatal("Unimplemented scheduling policy"); + } +} + +void +Scheduler::bindList(std::vector<Wavefront*> *list) +{ + if (schedPolicy == SCHED_POLICY::OF_POLICY) { + OFSchedPolicy.bindList(list); + } else if (schedPolicy == SCHED_POLICY::RR_POLICY) { + RRSchedPolicy.bindList(list); + } else { + fatal("Unimplemented scheduling policy"); + } +} diff --git a/src/gpu-compute/scheduler.hh b/src/gpu-compute/scheduler.hh new file mode 100644 index 000000000..148ec9425 --- /dev/null +++ b/src/gpu-compute/scheduler.hh @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
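The Scheduler wrapper above simply forwards to whichever policy the ComputeUnit's execPolicy parameter named ("OLDEST-FIRST" or "ROUND-ROBIN"). An illustrative call sequence, assuming an already-built ComputeUnitParams object:

#include <vector>

#include "gpu-compute/scheduler.hh"

Wavefront*
scheduleOneWave(const ComputeUnitParams *p,
                std::vector<Wavefront*> *readyList)
{
    Scheduler sched(p);         // fatal()s on an unrecognized execPolicy string
    sched.bindList(readyList);
    return sched.chooseWave();  // delegates to the selected policy
}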
+ * + * Author: Sooraj Puthoor + */ + +#ifndef __SCHEDULER_HH__ +#define __SCHEDULER_HH__ + +#include "gpu-compute/of_scheduling_policy.hh" +#include "gpu-compute/rr_scheduling_policy.hh" +#include "gpu-compute/scheduling_policy.hh" +#include "params/ComputeUnit.hh" + +enum SCHED_POLICY +{ + OF_POLICY = 0, + RR_POLICY +}; + +class Scheduler +{ + public: + Scheduler(const ComputeUnitParams *params); + Wavefront *chooseWave(); + void bindList(std::vector<Wavefront*> *list); + + private: + SCHED_POLICY schedPolicy; + SchedulingPolicy<RRSchedulingPolicy> RRSchedPolicy; + SchedulingPolicy<OFSchedulingPolicy> OFSchedPolicy; +}; + +#endif // __SCHEDULER_HH__ diff --git a/src/gpu-compute/scheduling_policy.hh b/src/gpu-compute/scheduling_policy.hh new file mode 100644 index 000000000..b5e923c62 --- /dev/null +++ b/src/gpu-compute/scheduling_policy.hh @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Sooraj Puthoor + */ + +#ifndef __SCHEDULING_POLICY_HH__ +#define __SCHEDULING_POLICY_HH__ + +#include <vector> + +template<typename Impl> +class SchedulingPolicy +{ + public: + Wavefront* chooseWave() { return policyImpl.chooseWave(); } + + void + bindList(std::vector<Wavefront*> *list) + { + return policyImpl.bindList(list); + } + + private: + Impl policyImpl; +}; + +#endif // __SCHEDULING_POLICY_HH__ diff --git a/src/gpu-compute/scoreboard_check_stage.cc b/src/gpu-compute/scoreboard_check_stage.cc new file mode 100644 index 000000000..0d856a9b0 --- /dev/null +++ b/src/gpu-compute/scoreboard_check_stage.cc @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Sooraj Puthoor + */ + +#include "gpu-compute/scoreboard_check_stage.hh" + +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_static_inst.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/wavefront.hh" +#include "params/ComputeUnit.hh" + +ScoreboardCheckStage::ScoreboardCheckStage(const ComputeUnitParams *p) + : numSIMDs(p->num_SIMDs), + numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes), + numGlbMemPipes(p->num_global_mem_pipes), + numShrMemPipes(p->num_shared_mem_pipes), + vectorAluInstAvail(nullptr), + lastGlbMemSimd(-1), + lastShrMemSimd(-1), glbMemInstAvail(nullptr), + shrMemInstAvail(nullptr) +{ +} + +ScoreboardCheckStage::~ScoreboardCheckStage() +{ + readyList.clear(); + waveStatusList.clear(); + shrMemInstAvail = nullptr; + glbMemInstAvail = nullptr; +} + +void +ScoreboardCheckStage::init(ComputeUnit *cu) +{ + computeUnit = cu; + _name = computeUnit->name() + ".ScoreboardCheckStage"; + + for (int unitId = 0; unitId < numSIMDs + numMemUnits; ++unitId) { + readyList.push_back(&computeUnit->readyList[unitId]); + } + + for (int unitId = 0; unitId < numSIMDs; ++unitId) { + waveStatusList.push_back(&computeUnit->waveStatusList[unitId]); + } + + vectorAluInstAvail = &computeUnit->vectorAluInstAvail; + glbMemInstAvail= &computeUnit->glbMemInstAvail; + shrMemInstAvail= &computeUnit->shrMemInstAvail; +} + +void +ScoreboardCheckStage::initStatistics() +{ + lastGlbMemSimd = -1; + lastShrMemSimd = -1; + *glbMemInstAvail = 0; + *shrMemInstAvail = 0; + + for (int unitId = 0; unitId < numSIMDs; ++unitId) + vectorAluInstAvail->at(unitId) = false; +} + +void +ScoreboardCheckStage::collectStatistics(Wavefront *curWave, int unitId) +{ + if (curWave->instructionBuffer.empty()) + return; + + // track which vector SIMD unit has at least one WV with a vector + // ALU as the oldest instruction in its Instruction buffer + vectorAluInstAvail->at(unitId) = vectorAluInstAvail->at(unitId) || + curWave->isOldestInstALU(); + + // track how many vector SIMD units have at least one WV with a + // vector Global memory instruction as the oldest instruction + // in its Instruction buffer + if ((curWave->isOldestInstGMem() || 
curWave->isOldestInstPrivMem() || + curWave->isOldestInstFlatMem()) && lastGlbMemSimd != unitId && + *glbMemInstAvail <= 1) { + (*glbMemInstAvail)++; + lastGlbMemSimd = unitId; + } + + // track how many vector SIMD units have at least one WV with a + // vector shared memory (LDS) instruction as the oldest instruction + // in its Instruction buffer + // TODO: parametrize the limit of the LDS units + if (curWave->isOldestInstLMem() && (*shrMemInstAvail <= numShrMemPipes) && + lastShrMemSimd != unitId) { + (*shrMemInstAvail)++; + lastShrMemSimd = unitId; + } +} + +void +ScoreboardCheckStage::exec() +{ + initStatistics(); + + // reset the ready list for all execution units; it will be + // constructed every cycle since resource availability may change + for (int unitId = 0; unitId < numSIMDs + numMemUnits; ++unitId) { + readyList[unitId]->clear(); + } + + // iterate over the Wavefronts of all SIMD units + for (int unitId = 0; unitId < numSIMDs; ++unitId) { + for (int wvId = 0; wvId < computeUnit->shader->n_wf; ++wvId) { + // reset the ready status of each wavefront + waveStatusList[unitId]->at(wvId).second = BLOCKED; + Wavefront *curWave = waveStatusList[unitId]->at(wvId).first; + collectStatistics(curWave, unitId); + + if (curWave->ready(Wavefront::I_ALU)) { + readyList[unitId]->push_back(curWave); + waveStatusList[unitId]->at(wvId).second = READY; + } else if (curWave->ready(Wavefront::I_GLOBAL)) { + if (computeUnit->cedeSIMD(unitId, wvId)) { + continue; + } + + readyList[computeUnit->GlbMemUnitId()]->push_back(curWave); + waveStatusList[unitId]->at(wvId).second = READY; + } else if (curWave->ready(Wavefront::I_SHARED)) { + readyList[computeUnit->ShrMemUnitId()]->push_back(curWave); + waveStatusList[unitId]->at(wvId).second = READY; + } else if (curWave->ready(Wavefront::I_FLAT)) { + readyList[computeUnit->GlbMemUnitId()]->push_back(curWave); + waveStatusList[unitId]->at(wvId).second = READY; + } else if (curWave->ready(Wavefront::I_PRIVATE)) { + readyList[computeUnit->GlbMemUnitId()]->push_back(curWave); + waveStatusList[unitId]->at(wvId).second = READY; + } + } + } +} + +void +ScoreboardCheckStage::regStats() +{ +} diff --git a/src/gpu-compute/scoreboard_check_stage.hh b/src/gpu-compute/scoreboard_check_stage.hh new file mode 100644 index 000000000..099597afb --- /dev/null +++ b/src/gpu-compute/scoreboard_check_stage.hh @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Sooraj Puthoor + */ + +#ifndef __SCOREBOARD_CHECK_STAGE_HH__ +#define __SCOREBOARD_CHECK_STAGE_HH__ + +#include <cstdint> +#include <string> +#include <utility> +#include <vector> + +class ComputeUnit; +class Wavefront; + +struct ComputeUnitParams; + +enum WAVE_STATUS +{ + BLOCKED = 0, + READY +}; + +/* + * Scoreboard check stage. + * All wavefronts are analyzed to see if they are ready + * to be executed this cycle. Both structural and data + * hazards are considered while marking a wave "ready" + * for execution. After analysis, the ready waves are + * added to readyList. + */ +class ScoreboardCheckStage +{ + public: + ScoreboardCheckStage(const ComputeUnitParams* params); + ~ScoreboardCheckStage(); + void init(ComputeUnit *cu); + void exec(); + + // Stats related variables and methods + const std::string& name() const { return _name; } + void regStats(); + + private: + void collectStatistics(Wavefront *curWave, int unitId); + void initStatistics(); + ComputeUnit *computeUnit; + uint32_t numSIMDs; + uint32_t numMemUnits; + uint32_t numGlbMemPipes; + uint32_t numShrMemPipes; + + // flag per vector SIMD unit that is set when there is at least one + // WF that has a vector ALU instruction as the oldest in its + // Instruction Buffer + std::vector<bool> *vectorAluInstAvail; + int lastGlbMemSimd; + int lastShrMemSimd; + + int *glbMemInstAvail; + int *shrMemInstAvail; + // List of waves which are ready to be scheduled. + // Each execution resource has a ready list + std::vector<std::vector<Wavefront*>*> readyList; + + // Stores the status of waves. A READY implies the + // wave is ready to be scheduled this cycle and + // is already present in the readyList + std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>*> + waveStatusList; + + std::string _name; +}; + +#endif // __SCOREBOARD_CHECK_STAGE_HH__ diff --git a/src/gpu-compute/shader.cc b/src/gpu-compute/shader.cc new file mode 100644 index 000000000..e8d7946ff --- /dev/null +++ b/src/gpu-compute/shader.cc @@ -0,0 +1,412 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#include "gpu-compute/shader.hh" + +#include <limits> + +#include "arch/x86/linux/linux.hh" +#include "base/chunk_generator.hh" +#include "debug/GPUDisp.hh" +#include "debug/GPUMem.hh" +#include "debug/HSAIL.hh" +#include "gpu-compute/dispatcher.hh" +#include "gpu-compute/gpu_static_inst.hh" +#include "gpu-compute/qstruct.hh" +#include "gpu-compute/wavefront.hh" +#include "mem/packet.hh" +#include "mem/ruby/system/RubySystem.hh" +#include "sim/sim_exit.hh" + +Shader::Shader(const Params *p) : SimObject(p), + clock(p->clk_domain->clockPeriod()), cpuThread(nullptr), gpuTc(nullptr), + cpuPointer(p->cpu_pointer), tickEvent(this), timingSim(p->timing), + hsail_mode(SIMT), impl_kern_boundary_sync(p->impl_kern_boundary_sync), + separate_acquire_release(p->separate_acquire_release), coissue_return(1), + trace_vgpr_all(1), n_cu((p->CUs).size()), n_wf(p->n_wf), + globalMemSize(p->globalmem), nextSchedCu(0), sa_n(0), tick_cnt(0), + box_tick_cnt(0), start_tick_cnt(0) +{ + + cuList.resize(n_cu); + + for (int i = 0; i < n_cu; ++i) { + cuList[i] = p->CUs[i]; + assert(i == cuList[i]->cu_id); + cuList[i]->shader = this; + } +} + +Addr +Shader::mmap(int length) +{ + + Addr start; + + // round up length to the next page + length = roundUp(length, TheISA::PageBytes); + + if (X86Linux64::mmapGrowsDown()) { + DPRINTF(HSAIL, "GROWS DOWN"); + start = gpuTc->getProcessPtr()->mmap_end -length; + gpuTc->getProcessPtr()->mmap_end = start; + } else { + DPRINTF(HSAIL, "GROWS UP"); + start = gpuTc->getProcessPtr()->mmap_end; + gpuTc->getProcessPtr()->mmap_end += length; + + // assertion to make sure we don't overwrite the stack (it grows down) + assert(gpuTc->getProcessPtr()->mmap_end < + gpuTc->getProcessPtr()->stack_base - + gpuTc->getProcessPtr()->max_stack_size); + + } + + DPRINTF(HSAIL,"Shader::mmap start= %#x, %#x\n", start, length); + + gpuTc->getProcessPtr()->allocateMem(start,length); + + return start; +} + +void +Shader::init() +{ + // grab the threadContext of the thread running on the CPU + assert(cpuPointer); + gpuTc = cpuPointer->getContext(0); + assert(gpuTc); +} + +Shader::~Shader() +{ + for (int j = 0; j < n_cu; ++j) + delete cuList[j]; +} + +void +Shader::updateThreadContext(int tid) { + // thread context of the thread which dispatched work + assert(cpuPointer); + gpuTc = cpuPointer->getContext(tid); + assert(gpuTc); +} + +void +Shader::hostWakeUp(BaseCPU *cpu) { + if (cpuPointer == cpu) { + if (gpuTc->status() == ThreadContext::Suspended) + cpu->activateContext(gpuTc->threadId()); + } else { + //Make sure both dispatcher and shader are trying to + //wakeup same host. 
Hack here to enable kernel launch + //from multiple CPUs + panic("Dispatcher wants to wakeup a different host"); + } +} + +Shader* +ShaderParams::create() +{ + return new Shader(this); +} + +void +Shader::exec() +{ + tick_cnt = curTick(); + box_tick_cnt = curTick() - start_tick_cnt; + + // apply any scheduled adds + for (int i = 0; i < sa_n; ++i) { + if (sa_when[i] <= tick_cnt) { + *sa_val[i] += sa_x[i]; + sa_val.erase(sa_val.begin() + i); + sa_x.erase(sa_x.begin() + i); + sa_when.erase(sa_when.begin() + i); + --sa_n; + --i; + } + } + + // clock all of the cu's + for (int i = 0; i < n_cu; ++i) + cuList[i]->exec(); +} + +bool +Shader::dispatch_workgroups(NDRange *ndr) +{ + bool scheduledSomething = false; + int cuCount = 0; + int curCu = nextSchedCu; + + while (cuCount < n_cu) { + //Every time we try a CU, update nextSchedCu + nextSchedCu = (nextSchedCu + 1) % n_cu; + + // dispatch workgroup iff the following two conditions are met: + // (a) wg_rem is true - there are unassigned workgroups in the grid + // (b) there are enough free slots in cu cuList[i] for this wg + if (ndr->wg_disp_rem && cuList[curCu]->ReadyWorkgroup(ndr)) { + scheduledSomething = true; + DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d\n", curCu); + + // ticks() member function translates cycles to simulation ticks. + if (!tickEvent.scheduled()) { + schedule(tickEvent, curTick() + this->ticks(1)); + } + + cuList[curCu]->StartWorkgroup(ndr); + ndr->wgId[0]++; + ndr->globalWgId++; + if (ndr->wgId[0] * ndr->q.wgSize[0] >= ndr->q.gdSize[0]) { + ndr->wgId[0] = 0; + ndr->wgId[1]++; + + if (ndr->wgId[1] * ndr->q.wgSize[1] >= ndr->q.gdSize[1]) { + ndr->wgId[1] = 0; + ndr->wgId[2]++; + + if (ndr->wgId[2] * ndr->q.wgSize[2] >= ndr->q.gdSize[2]) { + ndr->wg_disp_rem = false; + break; + } + } + } + } + + ++cuCount; + curCu = nextSchedCu; + } + + return scheduledSomething; +} + +void +Shader::handshake(GpuDispatcher *_dispatcher) +{ + dispatcher = _dispatcher; +} + +void +Shader::doFunctionalAccess(RequestPtr req, MemCmd cmd, void *data, + bool suppress_func_errors, int cu_id) +{ + unsigned block_size = RubySystem::getBlockSizeBytes(); + unsigned size = req->getSize(); + + Addr tmp_addr; + BaseTLB::Mode trans_mode; + + if (cmd == MemCmd::ReadReq) { + trans_mode = BaseTLB::Read; + } else if (cmd == MemCmd::WriteReq) { + trans_mode = BaseTLB::Write; + } else { + fatal("unexcepted MemCmd\n"); + } + + tmp_addr = req->getVaddr(); + Addr split_addr = roundDown(tmp_addr + size - 1, block_size); + + assert(split_addr <= tmp_addr || split_addr - tmp_addr < block_size); + + // Misaligned access + if (split_addr > tmp_addr) { + RequestPtr req1, req2; + req->splitOnVaddr(split_addr, req1, req2); + + + PacketPtr pkt1 = new Packet(req2, cmd); + PacketPtr pkt2 = new Packet(req1, cmd); + + functionalTLBAccess(pkt1, cu_id, trans_mode); + functionalTLBAccess(pkt2, cu_id, trans_mode); + + PacketPtr new_pkt1 = new Packet(pkt1->req, cmd); + PacketPtr new_pkt2 = new Packet(pkt2->req, cmd); + + new_pkt1->dataStatic(data); + new_pkt2->dataStatic((uint8_t*)data + req1->getSize()); + + if (suppress_func_errors) { + new_pkt1->setSuppressFuncError(); + new_pkt2->setSuppressFuncError(); + } + + // fixme: this should be cuList[cu_id] if cu_id != n_cu + // The latter requires a memPort in the dispatcher + cuList[0]->memPort[0]->sendFunctional(new_pkt1); + cuList[0]->memPort[0]->sendFunctional(new_pkt2); + + delete new_pkt1; + delete new_pkt2; + delete pkt1; + delete pkt2; + } else { + PacketPtr pkt = new Packet(req, cmd); + functionalTLBAccess(pkt, cu_id, 
trans_mode); + PacketPtr new_pkt = new Packet(pkt->req, cmd); + new_pkt->dataStatic(data); + + if (suppress_func_errors) { + new_pkt->setSuppressFuncError(); + }; + + // fixme: this should be cuList[cu_id] if cu_id != n_cu + // The latter requires a memPort in the dispatcher + cuList[0]->memPort[0]->sendFunctional(new_pkt); + + delete new_pkt; + delete pkt; + } +} + +bool +Shader::busy() +{ + for (int i_cu = 0; i_cu < n_cu; ++i_cu) { + if (!cuList[i_cu]->isDone()) { + return true; + } + } + + return false; +} + +void +Shader::ScheduleAdd(uint32_t *val,Tick when,int x) +{ + sa_val.push_back(val); + sa_when.push_back(tick_cnt + when); + sa_x.push_back(x); + ++sa_n; +} + +Shader::TickEvent::TickEvent(Shader *_shader) + : Event(CPU_Tick_Pri), shader(_shader) +{ +} + + +void +Shader::TickEvent::process() +{ + if (shader->busy()) { + shader->exec(); + shader->schedule(this, curTick() + shader->ticks(1)); + } +} + +const char* +Shader::TickEvent::description() const +{ + return "Shader tick"; +} + +void +Shader::AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id, + MemCmd cmd, bool suppress_func_errors) +{ + uint8_t *data_buf = (uint8_t*)ptr; + + for (ChunkGenerator gen(address, size, RubySystem::getBlockSizeBytes()); + !gen.done(); gen.next()) { + Request *req = new Request(0, gen.addr(), gen.size(), 0, + cuList[0]->masterId(), 0, 0, 0); + + doFunctionalAccess(req, cmd, data_buf, suppress_func_errors, cu_id); + data_buf += gen.size(); + delete req; + } +} + +void +Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id) +{ + AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, false); +} + +void +Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id, + bool suppress_func_errors) +{ + AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, suppress_func_errors); +} + +void +Shader::WriteMem(uint64_t address, void *ptr,uint32_t size, int cu_id) +{ + AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq, false); +} + +void +Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id, + bool suppress_func_errors) +{ + AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq, + suppress_func_errors); +} + +/* + * Send a packet through the appropriate TLB functional port. + * If cu_id=n_cu, then this is the dispatcher's TLB. + * Otherwise it's the TLB of the cu_id compute unit. + */ +void +Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode) +{ + // update senderState. Need to know the gpuTc and the TLB mode + pkt->senderState = + new TheISA::GpuTLB::TranslationState(mode, gpuTc, false); + + if (cu_id == n_cu) { + dispatcher->tlbPort->sendFunctional(pkt); + } else { + // even when the perLaneTLB flag is turned on + // it's ok tp send all accesses through lane 0 + // since the lane # is not known here, + // This isn't important since these are functional accesses. + cuList[cu_id]->tlbPort[0]->sendFunctional(pkt); + } + + /* safe_cast the senderState */ + TheISA::GpuTLB::TranslationState *sender_state = + safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState); + + delete sender_state->tlbEntry; + delete pkt->senderState; +} diff --git a/src/gpu-compute/shader.hh b/src/gpu-compute/shader.hh new file mode 100644 index 000000000..91ea8aae0 --- /dev/null +++ b/src/gpu-compute/shader.hh @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#ifndef __SHADER_HH__ +#define __SHADER_HH__ + +#include <functional> +#include <string> + +#include "arch/isa.hh" +#include "arch/isa_traits.hh" +#include "base/types.hh" +#include "cpu/simple/atomic.hh" +#include "cpu/simple/timing.hh" +#include "cpu/simple_thread.hh" +#include "cpu/thread_context.hh" +#include "cpu/thread_state.hh" +#include "enums/MemOpType.hh" +#include "enums/MemType.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_tlb.hh" +#include "gpu-compute/lds_state.hh" +#include "gpu-compute/qstruct.hh" +#include "mem/page_table.hh" +#include "mem/port.hh" +#include "mem/request.hh" +#include "params/Shader.hh" +#include "sim/faults.hh" +#include "sim/process.hh" +#include "sim/sim_object.hh" + +class BaseTLB; +class GpuDispatcher; + +namespace TheISA +{ + class GpuTLB; +} + +static const int LDS_SIZE = 65536; + +// Class Shader: This describes a single shader instance. Most +// configurations will only have a single shader. + +class Shader : public SimObject +{ + protected: + // Shader's clock period in terms of number of ticks of curTime, + // aka global simulation clock + Tick clock; + + public: + typedef ShaderParams Params; + enum hsail_mode_e {SIMT,VECTOR_SCALAR}; + + // clock related functions ; maps to-and-from + // Simulation ticks and shader clocks. 
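+    //
+    // A worked example (values assumed purely for illustration, taking
+    // gem5's default 1 ps tick, i.e. SimClock::Frequency == 1e12): a 1 GHz
+    // shader clock domain gives clock == 1000 ticks, so frequency() is 1e9,
+    // ticks(4) is 4000 ticks, and an event at curTick() == 8000 lands in
+    // curCycle() == 8. tickToCycles() applies the same division to an
+    // arbitrary tick value.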
+ Tick frequency() const { return SimClock::Frequency / clock; } + + Tick ticks(int numCycles) const { return (Tick)clock * numCycles; } + + Tick getClock() const { return clock; } + Tick curCycle() const { return curTick() / clock; } + Tick tickToCycles(Tick val) const { return val / clock;} + + + SimpleThread *cpuThread; + ThreadContext *gpuTc; + BaseCPU *cpuPointer; + + class TickEvent : public Event + { + private: + Shader *shader; + + public: + TickEvent(Shader*); + void process(); + const char* description() const; + }; + + TickEvent tickEvent; + + // is this simulation going to be timing mode in the memory? + bool timingSim; + hsail_mode_e hsail_mode; + + // If set, issue acq packet @ kernel launch + int impl_kern_boundary_sync; + // If set, generate a separate packet for acquire/release on + // ld_acquire/st_release/atomic operations + int separate_acquire_release; + // If set, fetch returns may be coissued with instructions + int coissue_return; + // If set, always dump all 64 gprs to trace + int trace_vgpr_all; + // Number of cu units in the shader + int n_cu; + // Number of wavefront slots per cu + int n_wf; + // The size of global memory + int globalMemSize; + + /* + * Bytes/work-item for call instruction + * The number of arguments for an hsail function will + * vary. We simply determine the maximum # of arguments + * required by any hsail function up front before the + * simulation (during parsing of the Brig) and record + * that number here. + */ + int funcargs_size; + + // Tracks CU that rr dispatcher should attempt scheduling + int nextSchedCu; + + // Size of scheduled add queue + uint32_t sa_n; + + // Pointer to value to be increments + std::vector<uint32_t*> sa_val; + // When to do the increment + std::vector<uint64_t> sa_when; + // Amount to increment by + std::vector<int32_t> sa_x; + + // List of Compute Units (CU's) + std::vector<ComputeUnit*> cuList; + + uint64_t tick_cnt; + uint64_t box_tick_cnt; + uint64_t start_tick_cnt; + + GpuDispatcher *dispatcher; + + Shader(const Params *p); + ~Shader(); + virtual void init(); + + // Run shader + void exec(); + + // Check to see if shader is busy + bool busy(); + + // Schedule a 32-bit value to be incremented some time in the future + void ScheduleAdd(uint32_t *val, Tick when, int x); + bool processTimingPacket(PacketPtr pkt); + + void AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id, + MemCmd cmd, bool suppress_func_errors); + + void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id); + + void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id, + bool suppress_func_errors); + + void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id); + + void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id, + bool suppress_func_errors); + + void doFunctionalAccess(RequestPtr req, MemCmd cmd, void *data, + bool suppress_func_errors, int cu_id); + + void + registerCU(int cu_id, ComputeUnit *compute_unit) + { + cuList[cu_id] = compute_unit; + } + + void handshake(GpuDispatcher *dispatcher); + bool dispatch_workgroups(NDRange *ndr); + Addr mmap(int length); + void functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode); + void updateThreadContext(int tid); + void hostWakeUp(BaseCPU *cpu); +}; + +#endif // __SHADER_HH__ diff --git a/src/gpu-compute/simple_pool_manager.cc b/src/gpu-compute/simple_pool_manager.cc new file mode 100644 index 000000000..0e35ab9cc --- /dev/null +++ b/src/gpu-compute/simple_pool_manager.cc @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2015 Advanced Micro 
Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos + */ + +#include "gpu-compute/simple_pool_manager.hh" + +#include "base/misc.hh" + +// return the min number of elements that the manager can reserve given +// a request for "size" elements +uint32_t +SimplePoolManager::minAllocatedElements(uint32_t size) +{ + fatal_if(size <= 0 || size > poolSize(), "Illegal VGPR region size=%d\n", + size); + + return size % minAllocation() > 0 ? 
+ (minAllocation() - (size % minAllocation())) + size : size; +} + +std::string +SimplePoolManager::printRegion() +{ + std::string _cout; + if (_reservedGroups == 0) + _cout = "VRF is empty\n"; + else if (_reservedGroups > 0) { + uint32_t reservedEntries = _reservedGroups * _regionSize; + _cout = "VRF reserves " + std::to_string(reservedEntries) + " VGPRs\n"; + } + + return _cout; +} + +bool +SimplePoolManager::canAllocate(uint32_t numRegions, uint32_t size) +{ + assert(numRegions * minAllocatedElements(size) <= poolSize()); + + return _reservedGroups == 0; +} + +void +SimplePoolManager::freeRegion(uint32_t firstIdx, uint32_t lastIdx) +{ + assert(_reservedGroups > 0); + --_reservedGroups; + + if (!_reservedGroups) + _nxtFreeIdx = 0; +} + +uint32_t +SimplePoolManager::allocateRegion(const uint32_t size, + uint32_t *reservedPoolSize) +{ + uint32_t actualSize = minAllocatedElements(size); + uint32_t startIdx = _nxtFreeIdx; + _nxtFreeIdx += actualSize; + _regionSize = actualSize; + assert(_nxtFreeIdx < poolSize()); + *reservedPoolSize = actualSize; + ++_reservedGroups; + + return startIdx; +} + +uint32_t +SimplePoolManager::regionSize(std::pair<uint32_t, uint32_t> ®ion) +{ + bool wrapAround = (region.first > region.second); + if (!wrapAround) { + return region.second - region.first + 1; + } else { + return region.second + poolSize() - region.first + 1; + } +} diff --git a/src/gpu-compute/simple_pool_manager.hh b/src/gpu-compute/simple_pool_manager.hh new file mode 100644 index 000000000..1d4174da8 --- /dev/null +++ b/src/gpu-compute/simple_pool_manager.hh @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos + */ + +#ifndef __SIMPLE_POOL_MANAGER_HH__ +#define __SIMPLE_POOL_MANAGER_HH__ + +#include <cassert> +#include <cstdint> + +#include "gpu-compute/pool_manager.hh" + +// Simple Pool Manager: allows one region per pool. No region merging is +// supported. 
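+//
+// A short illustration (numbers assumed, not taken from any config): with
+// minAlloc == 4, a request for 10 registers is rounded up by
+// minAllocatedElements() to 12. allocateRegion(10, &rsvd) then returns start
+// index 0, sets rsvd to 12, advances _nxtFreeIdx to 12 and increments
+// _reservedGroups, after which canAllocate() returns false until
+// freeRegion() drops _reservedGroups back to zero (which also resets
+// _nxtFreeIdx).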
+class SimplePoolManager : public PoolManager +{ + public: + SimplePoolManager(uint32_t minAlloc, uint32_t poolSize) + : PoolManager(minAlloc, poolSize), _regionSize(0), _nxtFreeIdx(0), + _reservedGroups(0) + { + } + + uint32_t minAllocatedElements(uint32_t size); + std::string printRegion(); + bool canAllocate(uint32_t numRegions, uint32_t size); + uint32_t allocateRegion(const uint32_t size, uint32_t *reservedPoolSize); + void freeRegion(uint32_t firstIdx, uint32_t lastIdx); + uint32_t regionSize(std::pair<uint32_t,uint32_t> ®ion); + + private: + // actual size of a region (normalized to the minimum size that can + // be reserved) + uint32_t _regionSize; + // next index to allocate a region + uint8_t _nxtFreeIdx; + // number of groups that reserve a region + uint32_t _reservedGroups; +}; + +#endif // __SIMPLE_POOL_MANAGER_HH__ diff --git a/src/gpu-compute/tlb_coalescer.cc b/src/gpu-compute/tlb_coalescer.cc new file mode 100644 index 000000000..835d7b740 --- /dev/null +++ b/src/gpu-compute/tlb_coalescer.cc @@ -0,0 +1,583 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Lisa Hsu + */ + +#include "gpu-compute/tlb_coalescer.hh" + +#include <cstring> + +#include "debug/GPUTLB.hh" + +TLBCoalescer::TLBCoalescer(const Params *p) : MemObject(p), + clock(p->clk_domain->clockPeriod()), TLBProbesPerCycle(p->probesPerCycle), + coalescingWindow(p->coalescingWindow), + disableCoalescing(p->disableCoalescing), probeTLBEvent(this), + cleanupEvent(this) +{ + // create the slave ports based on the number of connected ports + for (size_t i = 0; i < p->port_slave_connection_count; ++i) { + cpuSidePort.push_back(new CpuSidePort(csprintf("%s-port%d", name(), i), + this, i)); + } + + // create the master ports based on the number of connected ports + for (size_t i = 0; i < p->port_master_connection_count; ++i) { + memSidePort.push_back(new MemSidePort(csprintf("%s-port%d", name(), i), + this, i)); + } +} + +BaseSlavePort& +TLBCoalescer::getSlavePort(const std::string &if_name, PortID idx) +{ + if (if_name == "slave") { + if (idx >= static_cast<PortID>(cpuSidePort.size())) { + panic("TLBCoalescer::getSlavePort: unknown index %d\n", idx); + } + + return *cpuSidePort[idx]; + } else { + panic("TLBCoalescer::getSlavePort: unknown port %s\n", if_name); + } +} + +BaseMasterPort& +TLBCoalescer::getMasterPort(const std::string &if_name, PortID idx) +{ + if (if_name == "master") { + if (idx >= static_cast<PortID>(memSidePort.size())) { + panic("TLBCoalescer::getMasterPort: unknown index %d\n", idx); + } + + return *memSidePort[idx]; + } else { + panic("TLBCoalescer::getMasterPort: unknown port %s\n", if_name); + } +} + +/* + * This method returns true if the <incoming_pkt> + * can be coalesced with <coalesced_pkt> and false otherwise. + * A given set of rules is checked. + * The rules can potentially be modified based on the TLB level. + */ +bool +TLBCoalescer::canCoalesce(PacketPtr incoming_pkt, PacketPtr coalesced_pkt) +{ + if (disableCoalescing) + return false; + + TheISA::GpuTLB::TranslationState *incoming_state = + safe_cast<TheISA::GpuTLB::TranslationState*>(incoming_pkt->senderState); + + TheISA::GpuTLB::TranslationState *coalesced_state = + safe_cast<TheISA::GpuTLB::TranslationState*>(coalesced_pkt->senderState); + + // Rule 1: Coalesce requests only if they + // fall within the same virtual page + Addr incoming_virt_page_addr = roundDown(incoming_pkt->req->getVaddr(), + TheISA::PageBytes); + + Addr coalesced_virt_page_addr = roundDown(coalesced_pkt->req->getVaddr(), + TheISA::PageBytes); + + if (incoming_virt_page_addr != coalesced_virt_page_addr) + return false; + + //* Rule 2: Coalesce requests only if they + // share a TLB Mode, i.e. they are both read + // or write requests. + BaseTLB::Mode incoming_mode = incoming_state->tlbMode; + BaseTLB::Mode coalesced_mode = coalesced_state->tlbMode; + + if (incoming_mode != coalesced_mode) + return false; + + // when we can coalesce a packet update the reqCnt + // that is the number of packets represented by + // this coalesced packet + if (!incoming_state->prefetch) + coalesced_state->reqCnt.back() += incoming_state->reqCnt.back(); + + return true; +} + +/* + * We need to update the physical addresses of all the translation requests + * that were coalesced into the one that just returned. + */ +void +TLBCoalescer::updatePhysAddresses(PacketPtr pkt) +{ + Addr virt_page_addr = roundDown(pkt->req->getVaddr(), TheISA::PageBytes); + + DPRINTF(GPUTLB, "Update phys. addr. 
for %d coalesced reqs for page %#x\n", + issuedTranslationsTable[virt_page_addr].size(), virt_page_addr); + + TheISA::GpuTLB::TranslationState *sender_state = + safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState); + + TheISA::GpuTlbEntry *tlb_entry = sender_state->tlbEntry; + assert(tlb_entry); + Addr first_entry_vaddr = tlb_entry->vaddr; + Addr first_entry_paddr = tlb_entry->paddr; + int page_size = tlb_entry->size(); + bool uncacheable = tlb_entry->uncacheable; + int first_hit_level = sender_state->hitLevel; + bool valid = tlb_entry->valid; + + // Get the physical page address of the translated request + // Using the page_size specified in the TLBEntry allows us + // to support different page sizes. + Addr phys_page_paddr = pkt->req->getPaddr(); + phys_page_paddr &= ~(page_size - 1); + + for (int i = 0; i < issuedTranslationsTable[virt_page_addr].size(); ++i) { + PacketPtr local_pkt = issuedTranslationsTable[virt_page_addr][i]; + TheISA::GpuTLB::TranslationState *sender_state = + safe_cast<TheISA::GpuTLB::TranslationState*>( + local_pkt->senderState); + + // we are sending the packet back, so pop the reqCnt associated + // with this level in the TLB hiearchy + if (!sender_state->prefetch) + sender_state->reqCnt.pop_back(); + + /* + * Only the first packet from this coalesced request has been + * translated. Grab the translated phys. page addr and update the + * physical addresses of the remaining packets with the appropriate + * page offsets. + */ + if (i) { + Addr paddr = phys_page_paddr; + paddr |= (local_pkt->req->getVaddr() & (page_size - 1)); + local_pkt->req->setPaddr(paddr); + + if (uncacheable) + local_pkt->req->setFlags(Request::UNCACHEABLE); + + // update senderState->tlbEntry, so we can insert + // the correct TLBEentry in the TLBs above. + sender_state->tlbEntry = + new TheISA::GpuTlbEntry(0, first_entry_vaddr, first_entry_paddr, + valid); + + // update the hitLevel for all uncoalesced reqs + // so that each packet knows where it hit + // (used for statistics in the CUs) + sender_state->hitLevel = first_hit_level; + } + + SlavePort *return_port = sender_state->ports.back(); + sender_state->ports.pop_back(); + + // Translation is done - Convert to a response pkt if necessary and + // send the translation back + if (local_pkt->isRequest()) { + local_pkt->makeTimingResponse(); + } + + return_port->sendTimingResp(local_pkt); + } + + // schedule clean up for end of this cycle + // This is a maximum priority event and must be on + // the same cycle as GPUTLB cleanup event to prevent + // race conditions with an IssueProbeEvent caused by + // MemSidePort::recvReqRetry + cleanupQueue.push(virt_page_addr); + + if (!cleanupEvent.scheduled()) + schedule(cleanupEvent, curTick()); +} + +// Receive translation requests, create a coalesced request, +// and send them to the TLB (TLBProbesPerCycle) +bool +TLBCoalescer::CpuSidePort::recvTimingReq(PacketPtr pkt) +{ + // first packet of a coalesced request + PacketPtr first_packet = nullptr; + // true if we are able to do coalescing + bool didCoalesce = false; + // number of coalesced reqs for a given window + int coalescedReq_cnt = 0; + + TheISA::GpuTLB::TranslationState *sender_state = + safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState); + + // push back the port to remember the path back + sender_state->ports.push_back(this); + + bool update_stats = !sender_state->prefetch; + + if (update_stats) { + // if reqCnt is empty then this packet does not represent + // multiple uncoalesced reqs(pkts) but just a single pkt. 
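+        // For instance, a packet coming straight from a CU lane arrives
+        // with an empty reqCnt and counts as a single request, whereas a
+        // packet already merged by a coalescer higher up the hierarchy
+        // might carry, say, reqCnt.back() == 4 (an illustrative value).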
+ // If it does though then the reqCnt for each level in the + // hierarchy accumulates the total number of reqs this packet + // represents + int req_cnt = 1; + + if (!sender_state->reqCnt.empty()) + req_cnt = sender_state->reqCnt.back(); + + sender_state->reqCnt.push_back(req_cnt); + + // update statistics + coalescer->uncoalescedAccesses++; + req_cnt = sender_state->reqCnt.back(); + DPRINTF(GPUTLB, "receiving pkt w/ req_cnt %d\n", req_cnt); + coalescer->queuingCycles -= (curTick() * req_cnt); + coalescer->localqueuingCycles -= curTick(); + } + + // FIXME if you want to coalesce not based on the issueTime + // of the packets (i.e., from the compute unit's perspective) + // but based on when they reached this coalescer then + // remove the following if statement and use curTick() or + // coalescingWindow for the tick_index. + if (!sender_state->issueTime) + sender_state->issueTime = curTick(); + + // The tick index is used as a key to the coalescerFIFO hashmap. + // It is shared by all candidates that fall within the + // given coalescingWindow. + int64_t tick_index = sender_state->issueTime / coalescer->coalescingWindow; + + if (coalescer->coalescerFIFO.count(tick_index)) { + coalescedReq_cnt = coalescer->coalescerFIFO[tick_index].size(); + } + + // see if we can coalesce the incoming pkt with another + // coalesced request with the same tick_index + for (int i = 0; i < coalescedReq_cnt; ++i) { + first_packet = coalescer->coalescerFIFO[tick_index][i][0]; + + if (coalescer->canCoalesce(pkt, first_packet)) { + coalescer->coalescerFIFO[tick_index][i].push_back(pkt); + + DPRINTF(GPUTLB, "Coalesced req %i w/ tick_index %d has %d reqs\n", + i, tick_index, + coalescer->coalescerFIFO[tick_index][i].size()); + + didCoalesce = true; + break; + } + } + + // if this is the first request for this tick_index + // or we did not manage to coalesce, update stats + // and make necessary allocations. + if (!coalescedReq_cnt || !didCoalesce) { + if (update_stats) + coalescer->coalescedAccesses++; + + std::vector<PacketPtr> new_array; + new_array.push_back(pkt); + coalescer->coalescerFIFO[tick_index].push_back(new_array); + + DPRINTF(GPUTLB, "coalescerFIFO[%d] now has %d coalesced reqs after " + "push\n", tick_index, + coalescer->coalescerFIFO[tick_index].size()); + } + + //schedule probeTLBEvent next cycle to send the + //coalesced requests to the TLB + if (!coalescer->probeTLBEvent.scheduled()) { + coalescer->schedule(coalescer->probeTLBEvent, + curTick() + coalescer->ticks(1)); + } + + return true; +} + +void +TLBCoalescer::CpuSidePort::recvReqRetry() +{ + assert(false); +} + +void +TLBCoalescer::CpuSidePort::recvFunctional(PacketPtr pkt) +{ + + TheISA::GpuTLB::TranslationState *sender_state = + safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState); + + bool update_stats = !sender_state->prefetch; + + if (update_stats) + coalescer->uncoalescedAccesses++; + + // If there is a pending timing request for this virtual address + // print a warning message. This is a temporary caveat of + // the current simulator where atomic and timing requests can + // coexist. FIXME remove this check/warning in the future. + Addr virt_page_addr = roundDown(pkt->req->getVaddr(), TheISA::PageBytes); + int map_count = coalescer->issuedTranslationsTable.count(virt_page_addr); + + if (map_count) { + DPRINTF(GPUTLB, "Warning! Functional access to addr %#x sees timing " + "req. 
pending\n", virt_page_addr); + } + + coalescer->memSidePort[0]->sendFunctional(pkt); +} + +AddrRangeList +TLBCoalescer::CpuSidePort::getAddrRanges() const +{ + // currently not checked by the master + AddrRangeList ranges; + + return ranges; +} + +bool +TLBCoalescer::MemSidePort::recvTimingResp(PacketPtr pkt) +{ + // a translation completed and returned + coalescer->updatePhysAddresses(pkt); + + return true; +} + +void +TLBCoalescer::MemSidePort::recvReqRetry() +{ + //we've receeived a retry. Schedule a probeTLBEvent + if (!coalescer->probeTLBEvent.scheduled()) + coalescer->schedule(coalescer->probeTLBEvent, + curTick() + coalescer->ticks(1)); +} + +void +TLBCoalescer::MemSidePort::recvFunctional(PacketPtr pkt) +{ + fatal("Memory side recvFunctional() not implemented in TLB coalescer.\n"); +} + +TLBCoalescer::IssueProbeEvent::IssueProbeEvent(TLBCoalescer * _coalescer) + : Event(CPU_Tick_Pri), coalescer(_coalescer) +{ +} + +const char* +TLBCoalescer::IssueProbeEvent::description() const +{ + return "Probe the TLB below"; +} + +/* + * Here we scan the coalescer FIFO and issue the max + * number of permitted probes to the TLB below. We + * permit bypassing of coalesced requests for the same + * tick_index. + * + * We do not access the next tick_index unless we've + * drained the previous one. The coalesced requests + * that are successfully sent are moved to the + * issuedTranslationsTable table (the table which keeps + * track of the outstanding reqs) + */ +void +TLBCoalescer::IssueProbeEvent::process() +{ + // number of TLB probes sent so far + int sent_probes = 0; + // rejected denotes a blocking event + bool rejected = false; + + // It is set to true either when the recvTiming of the TLB below + // returns false or when there is another outstanding request for the + // same virt. page. + + DPRINTF(GPUTLB, "triggered TLBCoalescer IssueProbeEvent\n"); + + for (auto iter = coalescer->coalescerFIFO.begin(); + iter != coalescer->coalescerFIFO.end() && !rejected; ) { + int coalescedReq_cnt = iter->second.size(); + int i = 0; + int vector_index = 0; + + DPRINTF(GPUTLB, "coalescedReq_cnt is %d for tick_index %d\n", + coalescedReq_cnt, iter->first); + + while (i < coalescedReq_cnt) { + ++i; + PacketPtr first_packet = iter->second[vector_index][0]; + + // compute virtual page address for this request + Addr virt_page_addr = roundDown(first_packet->req->getVaddr(), + TheISA::PageBytes); + + // is there another outstanding request for the same page addr? + int pending_reqs = + coalescer->issuedTranslationsTable.count(virt_page_addr); + + if (pending_reqs) { + DPRINTF(GPUTLB, "Cannot issue - There are pending reqs for " + "page %#x\n", virt_page_addr); + + ++vector_index; + rejected = true; + + continue; + } + + // send the coalesced request for virt_page_addr + if (!coalescer->memSidePort[0]->sendTimingReq(first_packet)) { + DPRINTF(GPUTLB, "Failed to send TLB request for page %#x", + virt_page_addr); + + // No need for a retries queue since we are already buffering + // the coalesced request in coalescerFIFO. 
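+                // When the TLB below eventually calls recvReqRetry() on our
+                // MemSidePort, that handler reschedules probeTLBEvent and
+                // this entry is retried straight out of coalescerFIFO on
+                // the next pass.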
+ rejected = true; + ++vector_index; + } else { + TheISA::GpuTLB::TranslationState *tmp_sender_state = + safe_cast<TheISA::GpuTLB::TranslationState*> + (first_packet->senderState); + + bool update_stats = !tmp_sender_state->prefetch; + + if (update_stats) { + // req_cnt is total number of packets represented + // by the one we just sent counting all the way from + // the top of TLB hiearchy (i.e., from the CU) + int req_cnt = tmp_sender_state->reqCnt.back(); + coalescer->queuingCycles += (curTick() * req_cnt); + + DPRINTF(GPUTLB, "%s sending pkt w/ req_cnt %d\n", + coalescer->name(), req_cnt); + + // pkt_cnt is number of packets we coalesced into the one + // we just sent but only at this coalescer level + int pkt_cnt = iter->second[vector_index].size(); + coalescer->localqueuingCycles += (curTick() * pkt_cnt); + } + + DPRINTF(GPUTLB, "Successfully sent TLB request for page %#x", + virt_page_addr); + + //copy coalescedReq to issuedTranslationsTable + coalescer->issuedTranslationsTable[virt_page_addr] + = iter->second[vector_index]; + + //erase the entry of this coalesced req + iter->second.erase(iter->second.begin() + vector_index); + + if (iter->second.empty()) + assert(i == coalescedReq_cnt); + + sent_probes++; + if (sent_probes == coalescer->TLBProbesPerCycle) + return; + } + } + + //if there are no more coalesced reqs for this tick_index + //erase the hash_map with the first iterator + if (iter->second.empty()) { + coalescer->coalescerFIFO.erase(iter++); + } else { + ++iter; + } + } +} + +TLBCoalescer::CleanupEvent::CleanupEvent(TLBCoalescer* _coalescer) + : Event(Maximum_Pri), coalescer(_coalescer) +{ +} + +const char* +TLBCoalescer::CleanupEvent::description() const +{ + return "Cleanup issuedTranslationsTable hashmap"; +} + +void +TLBCoalescer::CleanupEvent::process() +{ + while (!coalescer->cleanupQueue.empty()) { + Addr cleanup_addr = coalescer->cleanupQueue.front(); + coalescer->cleanupQueue.pop(); + coalescer->issuedTranslationsTable.erase(cleanup_addr); + + DPRINTF(GPUTLB, "Cleanup - Delete coalescer entry with key %#x\n", + cleanup_addr); + } +} + +void +TLBCoalescer::regStats() +{ + uncoalescedAccesses + .name(name() + ".uncoalesced_accesses") + .desc("Number of uncoalesced TLB accesses") + ; + + coalescedAccesses + .name(name() + ".coalesced_accesses") + .desc("Number of coalesced TLB accesses") + ; + + queuingCycles + .name(name() + ".queuing_cycles") + .desc("Number of cycles spent in queue") + ; + + localqueuingCycles + .name(name() + ".local_queuing_cycles") + .desc("Number of cycles spent in queue for all incoming reqs") + ; + + localLatency + .name(name() + ".local_latency") + .desc("Avg. latency over all incoming pkts") + ; + + localLatency = localqueuingCycles / uncoalescedAccesses; +} + + +TLBCoalescer* +TLBCoalescerParams::create() +{ + return new TLBCoalescer(this); +} + diff --git a/src/gpu-compute/tlb_coalescer.hh b/src/gpu-compute/tlb_coalescer.hh new file mode 100644 index 000000000..09210148b --- /dev/null +++ b/src/gpu-compute/tlb_coalescer.hh @@ -0,0 +1,252 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Lisa Hsu + */ + +#ifndef __TLB_COALESCER_HH__ +#define __TLB_COALESCER_HH__ + +#include <list> +#include <queue> +#include <string> +#include <vector> + +#include "arch/generic/tlb.hh" +#include "arch/isa.hh" +#include "arch/isa_traits.hh" +#include "arch/x86/pagetable.hh" +#include "arch/x86/regs/segment.hh" +#include "base/misc.hh" +#include "base/statistics.hh" +#include "gpu-compute/gpu_tlb.hh" +#include "mem/mem_object.hh" +#include "mem/port.hh" +#include "mem/request.hh" +#include "params/TLBCoalescer.hh" + +class BaseTLB; +class Packet; +class ThreadContext; + +/** + * The TLBCoalescer is a MemObject sitting on the front side (CPUSide) of + * each TLB. It receives packets and issues coalesced requests to the + * TLB below it. It controls how requests are coalesced (the rules) + * and the permitted number of TLB probes per cycle (i.e., how many + * coalesced requests it feeds the TLB per cycle). + */ +class TLBCoalescer : public MemObject +{ + protected: + // TLB clock: will inherit clock from shader's clock period in terms + // of nuber of ticks of curTime (aka global simulation clock) + // The assignment of TLB clock from shader clock is done in the + // python config files. + int clock; + + public: + typedef TLBCoalescerParams Params; + TLBCoalescer(const Params *p); + ~TLBCoalescer() { } + + // Number of TLB probes per cycle. Parameterizable - default 2. + int TLBProbesPerCycle; + + // Consider coalescing across that many ticks. + // Paraemterizable - default 1. + int coalescingWindow; + + // Each coalesced request consists of multiple packets + // that all fall within the same virtual page + typedef std::vector<PacketPtr> coalescedReq; + + // disables coalescing when true + bool disableCoalescing; + + /* + * This is a hash map with <tick_index> as a key. + * It contains a vector of coalescedReqs per <tick_index>. + * Requests are buffered here until they can be issued to + * the TLB, at which point they are copied to the + * issuedTranslationsTable hash map. + * + * In terms of coalescing, we coalesce requests in a given + * window of x cycles by using tick_index = issueTime/x as a + * key, where x = coalescingWindow. 
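+     * For example, with an assumed coalescingWindow of 1000 ticks,
+     * requests whose issueTime is 2300 and 2900 both map to tick_index 2
+     * and may be merged (subject to canCoalesce()), while one issued at
+     * tick 3100 starts a new tick_index.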
issueTime is the issueTime + * of the pkt from the ComputeUnit's perspective, but another + * option is to change it to curTick(), so we coalesce based + * on the receive time. + */ + typedef std::unordered_map<int64_t, std::vector<coalescedReq>> CoalescingFIFO; + + CoalescingFIFO coalescerFIFO; + + /* + * issuedTranslationsTabler: a hash_map indexed by virtual page + * address. Each hash_map entry has a vector of PacketPtr associated + * with it denoting the different packets that share an outstanding + * coalesced translation request for the same virtual page. + * + * The rules that determine which requests we can coalesce are + * specified in the canCoalesce() method. + */ + typedef std::unordered_map<Addr, coalescedReq> CoalescingTable; + + CoalescingTable issuedTranslationsTable; + + // number of packets the coalescer receives + Stats::Scalar uncoalescedAccesses; + // number packets the coalescer send to the TLB + Stats::Scalar coalescedAccesses; + + // Number of cycles the coalesced requests spend waiting in + // coalescerFIFO. For each packet the coalescer receives we take into + // account the number of all uncoalesced requests this pkt "represents" + Stats::Scalar queuingCycles; + + // On average how much time a request from the + // uncoalescedAccesses that reaches the TLB + // spends waiting? + Stats::Scalar localqueuingCycles; + // localqueuingCycles/uncoalescedAccesses + Stats::Formula localLatency; + + bool canCoalesce(PacketPtr pkt1, PacketPtr pkt2); + void updatePhysAddresses(PacketPtr pkt); + void regStats(); + + // Clock related functions. Maps to-and-from + // Simulation ticks and object clocks. + Tick frequency() const { return SimClock::Frequency / clock; } + Tick ticks(int numCycles) const { return (Tick)clock * numCycles; } + Tick curCycle() const { return curTick() / clock; } + Tick tickToCycles(Tick val) const { return val / clock;} + + class CpuSidePort : public SlavePort + { + public: + CpuSidePort(const std::string &_name, TLBCoalescer *tlb_coalescer, + PortID _index) + : SlavePort(_name, tlb_coalescer), coalescer(tlb_coalescer), + index(_index) { } + + protected: + TLBCoalescer *coalescer; + int index; + + virtual bool recvTimingReq(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt) { return 0; } + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRangeChange() { } + virtual void recvReqRetry(); + + virtual void + recvRespRetry() + { + fatal("recvRespRetry() is not implemented in the TLB coalescer.\n"); + } + + virtual AddrRangeList getAddrRanges() const; + }; + + class MemSidePort : public MasterPort + { + public: + MemSidePort(const std::string &_name, TLBCoalescer *tlb_coalescer, + PortID _index) + : MasterPort(_name, tlb_coalescer), coalescer(tlb_coalescer), + index(_index) { } + + std::deque<PacketPtr> retries; + + protected: + TLBCoalescer *coalescer; + int index; + + virtual bool recvTimingResp(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt) { return 0; } + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRangeChange() { } + virtual void recvReqRetry(); + + virtual void + recvRespRetry() + { + fatal("recvRespRetry() not implemented in TLB coalescer"); + } + }; + + // Coalescer slave ports on the cpu Side + std::vector<CpuSidePort*> cpuSidePort; + // Coalescer master ports on the memory side + std::vector<MemSidePort*> memSidePort; + + BaseMasterPort& getMasterPort(const std::string &if_name, PortID idx); + BaseSlavePort& getSlavePort(const std::string &if_name, PortID idx); + + class IssueProbeEvent : public 
Event + { + private: + TLBCoalescer *coalescer; + + public: + IssueProbeEvent(TLBCoalescer *_coalescer); + void process(); + const char *description() const; + }; + + // this event issues the TLB probes + IssueProbeEvent probeTLBEvent; + + // the cleanupEvent is scheduled after a TLBEvent triggers + // in order to free memory and do the required clean-up + class CleanupEvent : public Event + { + private: + TLBCoalescer *coalescer; + + public: + CleanupEvent(TLBCoalescer *_coalescer); + void process(); + const char* description() const; + }; + + // schedule cleanup + CleanupEvent cleanupEvent; + + // this FIFO queue keeps track of the virt. page + // addresses that are pending cleanup + std::queue<Addr> cleanupQueue; +}; + +#endif // __TLB_COALESCER_HH__ diff --git a/src/gpu-compute/vector_register_file.cc b/src/gpu-compute/vector_register_file.cc new file mode 100644 index 000000000..8b7dc0691 --- /dev/null +++ b/src/gpu-compute/vector_register_file.cc @@ -0,0 +1,251 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: John Kalamatianos + */ + +#include "gpu-compute/vector_register_file.hh" + +#include <string> + +#include "base/misc.hh" +#include "gpu-compute/code_enums.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/simple_pool_manager.hh" +#include "gpu-compute/wavefront.hh" +#include "params/VectorRegisterFile.hh" + +VectorRegisterFile::VectorRegisterFile(const VectorRegisterFileParams *p) + : SimObject(p), + manager(new SimplePoolManager(p->min_alloc, p->num_regs_per_simd)), + simdId(p->simd_id), numRegsPerSimd(p->num_regs_per_simd), + vgprState(new VecRegisterState()) +{ + fatal_if(numRegsPerSimd % 2, "VRF size is illegal\n"); + fatal_if(simdId < 0, "Illegal SIMD id for VRF"); + + fatal_if(numRegsPerSimd % p->min_alloc, "Min VGPR region allocation is not " + "multiple of VRF size\n"); + + busy.clear(); + busy.resize(numRegsPerSimd, 0); + nxtBusy.clear(); + nxtBusy.resize(numRegsPerSimd, 0); + + vgprState->init(numRegsPerSimd); +} + +void +VectorRegisterFile::setParent(ComputeUnit *_computeUnit) +{ + computeUnit = _computeUnit; + vgprState->setParent(computeUnit); +} + +uint8_t +VectorRegisterFile::regNxtBusy(int idx, uint32_t operandSize) const +{ + uint8_t status = nxtBusy.at(idx); + + if (operandSize > 4) { + status = status | (nxtBusy.at((idx + 1) % numRegs())); + } + + return status; +} + +uint8_t +VectorRegisterFile::regBusy(int idx, uint32_t operandSize) const +{ + uint8_t status = busy.at(idx); + + if (operandSize > 4) { + status = status | (busy.at((idx + 1) % numRegs())); + } + + return status; +} + +void +VectorRegisterFile::preMarkReg(int regIdx, uint32_t operandSize, uint8_t value) +{ + nxtBusy.at(regIdx) = value; + + if (operandSize > 4) { + nxtBusy.at((regIdx + 1) % numRegs()) = value; + } +} + +void +VectorRegisterFile::markReg(int regIdx, uint32_t operandSize, uint8_t value) +{ + busy.at(regIdx) = value; + + if (operandSize > 4) { + busy.at((regIdx + 1) % numRegs()) = value; + } +} + +bool +VectorRegisterFile::operandsReady(Wavefront *w, GPUDynInstPtr ii) const +{ + for (int i = 0; i < ii->getNumOperands(); ++i) { + if (ii->isVectorRegister(i)) { + uint32_t vgprIdx = ii->getRegisterIndex(i); + uint32_t pVgpr = w->remap(vgprIdx, ii->getOperandSize(i), 1); + + if (regBusy(pVgpr, ii->getOperandSize(i)) == 1) { + if (ii->isDstOperand(i)) { + w->numTimesBlockedDueWAXDependencies++; + } else if (ii->isSrcOperand(i)) { + w->numTimesBlockedDueRAWDependencies++; + } + + return false; + } + + if (regNxtBusy(pVgpr, ii->getOperandSize(i)) == 1) { + if (ii->isDstOperand(i)) { + w->numTimesBlockedDueWAXDependencies++; + } else if (ii->isSrcOperand(i)) { + w->numTimesBlockedDueRAWDependencies++; + } + + return false; + } + } + } + + return true; +} + +void +VectorRegisterFile::exec(GPUDynInstPtr ii, Wavefront *w) +{ + bool loadInstr = IS_OT_READ(ii->opType()); + bool atomicInstr = IS_OT_ATOMIC(ii->opType()); + + bool loadNoArgInstr = loadInstr && !ii->isArgLoad(); + + // iterate over all register destination operands + for (int i = 0; i < ii->getNumOperands(); ++i) { + if (ii->isVectorRegister(i) && ii->isDstOperand(i)) { + uint32_t physReg = w->remap(ii->getRegisterIndex(i), + ii->getOperandSize(i), 1); + + // mark the destination vector register as busy + markReg(physReg, ii->getOperandSize(i), 1); + // clear the in-flight status of the destination vector register + preMarkReg(physReg, ii->getOperandSize(i), 0); + + // FIXME: if we ever model correct timing behavior + // for load argument 
instructions then we should not + // set the destination register as busy now but when + // the data returns. Loads and Atomics should free + // their destination registers when the data returns, + // not now + if (!atomicInstr && !loadNoArgInstr) { + uint32_t pipeLen = ii->getOperandSize(i) <= 4 ? + computeUnit->spBypassLength() : + computeUnit->dpBypassLength(); + + // schedule an event for marking the register as ready + computeUnit->registerEvent(w->simdId, physReg, + ii->getOperandSize(i), + computeUnit->shader->tick_cnt + + computeUnit->shader->ticks(pipeLen), + 0); + } + } + } +} + +int +VectorRegisterFile::exec(uint64_t dynamic_id, Wavefront *w, + std::vector<uint32_t> ®Vec, uint32_t operandSize, + uint64_t timestamp) +{ + int delay = 0; + + panic_if(regVec.size() <= 0, "Illegal VGPR vector size=%d\n", + regVec.size()); + + for (int i = 0; i < regVec.size(); ++i) { + // mark the destination VGPR as free when the timestamp expires + computeUnit->registerEvent(w->simdId, regVec[i], operandSize, + computeUnit->shader->tick_cnt + timestamp + + computeUnit->shader->ticks(delay), 0); + } + + return delay; +} + +void +VectorRegisterFile::updateResources(Wavefront *w, GPUDynInstPtr ii) +{ + // iterate over all register destination operands + for (int i = 0; i < ii->getNumOperands(); ++i) { + if (ii->isVectorRegister(i) && ii->isDstOperand(i)) { + uint32_t physReg = w->remap(ii->getRegisterIndex(i), + ii->getOperandSize(i), 1); + // set the in-flight status of the destination vector register + preMarkReg(physReg, ii->getOperandSize(i), 1); + } + } +} + +bool +VectorRegisterFile::vrfOperandAccessReady(uint64_t dynamic_id, Wavefront *w, + GPUDynInstPtr ii, + VrfAccessType accessType) +{ + bool ready = true; + + return ready; +} + +bool +VectorRegisterFile::vrfOperandAccessReady(Wavefront *w, GPUDynInstPtr ii, + VrfAccessType accessType) +{ + bool ready = true; + + return ready; +} + +VectorRegisterFile* +VectorRegisterFileParams::create() +{ + return new VectorRegisterFile(this); +} diff --git a/src/gpu-compute/vector_register_file.hh b/src/gpu-compute/vector_register_file.hh new file mode 100644 index 000000000..1cb011a1e --- /dev/null +++ b/src/gpu-compute/vector_register_file.hh @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos + */ + +#ifndef __VECTOR_REGISTER_FILE_HH__ +#define __VECTOR_REGISTER_FILE_HH__ + +#include <list> + +#include "base/statistics.hh" +#include "base/types.hh" +#include "gpu-compute/vector_register_state.hh" +#include "sim/sim_object.hh" + +class ComputeUnit; +class Shader; +class SimplePoolManager; +class Wavefront; + +struct VectorRegisterFileParams; + +enum class VrfAccessType : uint8_t +{ + READ = 0x01, + WRITE = 0x02, + RD_WR = READ | WRITE +}; + +// Vector Register File +class VectorRegisterFile : public SimObject +{ + public: + VectorRegisterFile(const VectorRegisterFileParams *p); + + void setParent(ComputeUnit *_computeUnit); + + // Read a register + template<typename T> + T + read(int regIdx, int threadId=0) + { + T p0 = vgprState->read<T>(regIdx, threadId); + + return p0; + } + + // Write a register + template<typename T> + void + write(int regIdx, T value, int threadId=0) + { + vgprState->write<T>(regIdx, value, threadId); + } + + uint8_t regBusy(int idx, uint32_t operandSize) const; + uint8_t regNxtBusy(int idx, uint32_t operandSize) const; + + int numRegs() const { return numRegsPerSimd; } + + void markReg(int regIdx, uint32_t operandSize, uint8_t value); + void preMarkReg(int regIdx, uint32_t operandSize, uint8_t value); + + virtual void exec(GPUDynInstPtr ii, Wavefront *w); + + virtual int exec(uint64_t dynamic_id, Wavefront *w, + std::vector<uint32_t> ®Vec, uint32_t operandSize, + uint64_t timestamp); + + bool operandsReady(Wavefront *w, GPUDynInstPtr ii) const; + virtual void updateEvents() { } + virtual void updateResources(Wavefront *w, GPUDynInstPtr ii); + + virtual bool + isReadConflict(int memWfId, int exeWfId) const + { + return false; + } + + virtual bool + isWriteConflict(int memWfId, int exeWfId) const + { + return false; + } + + virtual bool vrfOperandAccessReady(uint64_t dynamic_id, Wavefront *w, + GPUDynInstPtr ii, + VrfAccessType accessType); + + virtual bool vrfOperandAccessReady(Wavefront *w, GPUDynInstPtr ii, + VrfAccessType accessType); + + SimplePoolManager *manager; + + protected: + ComputeUnit* computeUnit; + int simdId; + + // flag indicating if a register is busy + std::vector<uint8_t> busy; + // flag indicating if a register will be busy (by instructions + // in the SIMD pipeline) + std::vector<uint8_t> nxtBusy; + + // numer of registers (bank size) per simd unit (bank) + int numRegsPerSimd; + + // vector register state + VecRegisterState *vgprState; +}; + +#endif // __VECTOR_REGISTER_FILE_HH__ diff --git a/src/gpu-compute/vector_register_state.cc b/src/gpu-compute/vector_register_state.cc new file mode 100644 index 000000000..f231b0579 --- /dev/null +++ b/src/gpu-compute/vector_register_state.cc @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos + */ + +#include "gpu-compute/vector_register_state.hh" + +#include "gpu-compute/compute_unit.hh" + +VecRegisterState::VecRegisterState() : computeUnit(nullptr) +{ + s_reg.clear(); + d_reg.clear(); +} + +void +VecRegisterState::setParent(ComputeUnit *_computeUnit) +{ + computeUnit = _computeUnit; + _name = computeUnit->name() + ".VecRegState"; +} + +void +VecRegisterState::init(uint32_t _size) +{ + s_reg.resize(_size); + d_reg.resize(_size); +} diff --git a/src/gpu-compute/vector_register_state.hh b/src/gpu-compute/vector_register_state.hh new file mode 100644 index 000000000..a233b9acc --- /dev/null +++ b/src/gpu-compute/vector_register_state.hh @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos + */ + +#ifndef __VECTOR_REGISTER_STATE_HH__ +#define __VECTOR_REGISTER_STATE_HH__ + +#include <array> +#include <cassert> +#include <string> +#include <vector> + +#include "gpu-compute/misc.hh" + +class ComputeUnit; + +// Vector Register State per SIMD unit (contents of the vector +// registers in the VRF of the SIMD) +class VecRegisterState +{ + public: + VecRegisterState(); + void init(uint32_t _size); + + const std::string& name() const { return _name; } + void setParent(ComputeUnit *_computeUnit); + void regStats() { } + + // Access methods + template<typename T> + T + read(int regIdx, int threadId=0) { + T *p0; + assert(sizeof(T) == 4 || sizeof(T) == 8); + if (sizeof(T) == 4) { + p0 = (T*)(&s_reg[regIdx][threadId]); + } else { + p0 = (T*)(&d_reg[regIdx][threadId]); + } + + return *p0; + } + + template<typename T> + void + write(unsigned int regIdx, T value, int threadId=0) { + T *p0; + assert(sizeof(T) == 4 || sizeof(T) == 8); + if (sizeof(T) == 4) { + p0 = (T*)(&s_reg[regIdx][threadId]); + } else { + p0 = (T*)(&d_reg[regIdx][threadId]); + } + + *p0 = value; + } + + // (Single Precision) Vector Register File size. + int regSize() { return s_reg.size(); } + + private: + ComputeUnit *computeUnit; + std::string _name; + // 32-bit Single Precision Vector Register State + std::vector<std::array<uint32_t, VSZ>> s_reg; + // 64-bit Double Precision Vector Register State + std::vector<std::array<uint64_t, VSZ>> d_reg; +}; + +#endif // __VECTOR_REGISTER_STATE_HH__ diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc new file mode 100644 index 000000000..0aa033db1 --- /dev/null +++ b/src/gpu-compute/wavefront.cc @@ -0,0 +1,925 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Lisa Hsu + */ + +#include "gpu-compute/wavefront.hh" + +#include "debug/GPUExec.hh" +#include "debug/WavefrontStack.hh" +#include "gpu-compute/code_enums.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/vector_register_file.hh" + +Wavefront* +WavefrontParams::create() +{ + return new Wavefront(this); +} + +Wavefront::Wavefront(const Params *p) + : SimObject(p), callArgMem(nullptr) +{ + last_trace = 0; + simdId = p->simdId; + wfSlotId = p->wf_slot_id; + + status = S_STOPPED; + reservedVectorRegs = 0; + startVgprIndex = 0; + outstanding_reqs = 0; + mem_reqs_in_pipe = 0; + outstanding_reqs_wr_gm = 0; + outstanding_reqs_wr_lm = 0; + outstanding_reqs_rd_gm = 0; + outstanding_reqs_rd_lm = 0; + rd_lm_reqs_in_pipe = 0; + rd_gm_reqs_in_pipe = 0; + wr_lm_reqs_in_pipe = 0; + wr_gm_reqs_in_pipe = 0; + + barrier_cnt = 0; + old_barrier_cnt = 0; + stalledAtBarrier = false; + + mem_trace_busy = 0; + old_vgpr_tcnt = 0xffffffffffffffffll; + old_dgpr_tcnt = 0xffffffffffffffffll; + + pendingFetch = false; + dropFetch = false; + condRegState = new ConditionRegisterState(); + maxSpVgprs = 0; + maxDpVgprs = 0; +} + +void +Wavefront::regStats() +{ + srcRegOpDist + .init(0, 4, 2) + .name(name() + ".src_reg_operand_dist") + .desc("number of executed instructions with N source register operands") + ; + + dstRegOpDist + .init(0, 3, 2) + .name(name() + ".dst_reg_operand_dist") + .desc("number of executed instructions with N destination register " + "operands") + ; + + // FIXME: the name of the WF needs to be unique + numTimesBlockedDueWAXDependencies + .name(name() + ".timesBlockedDueWAXDependencies") + .desc("number of times the wf's instructions are blocked due to WAW " + "or WAR dependencies") + ; + + // FIXME: the name of the WF needs to be unique + numTimesBlockedDueRAWDependencies + .name(name() + ".timesBlockedDueRAWDependencies") + .desc("number of times the wf's instructions are blocked due to RAW " + "dependencies") + ; + + // FIXME: the name of the WF needs to be unique + numTimesBlockedDueVrfPortAvail + .name(name() + ".timesBlockedDueVrfPortAvail") + .desc("number of times instructions are blocked due to VRF port " + "availability") + ; +} + +void +Wavefront::init() +{ + reservedVectorRegs = 0; + startVgprIndex = 0; +} + +void +Wavefront::resizeRegFiles(int num_cregs, int num_sregs, int num_dregs) +{ + condRegState->init(num_cregs); + maxSpVgprs = num_sregs; + maxDpVgprs = num_dregs; +} + +Wavefront::~Wavefront() +{ + if (callArgMem) + delete callArgMem; +} + +void +Wavefront::start(uint64_t _wfDynId,uint64_t _base_ptr) +{ + wfDynId = _wfDynId; + base_ptr = _base_ptr; + status = S_RUNNING; +} + +bool +Wavefront::isGmInstruction(GPUDynInstPtr ii) +{ + if (IS_OT_READ_PM(ii->opType()) || IS_OT_WRITE_PM(ii->opType()) || + IS_OT_ATOMIC_PM(ii->opType())) { + return true; + } + + if (IS_OT_READ_GM(ii->opType()) || IS_OT_WRITE_GM(ii->opType()) || + 
IS_OT_ATOMIC_GM(ii->opType())) { + + return true; + } + + if (IS_OT_FLAT(ii->opType())) { + return true; + } + + return false; +} + +bool +Wavefront::isLmInstruction(GPUDynInstPtr ii) +{ + if (IS_OT_READ_LM(ii->opType()) || IS_OT_WRITE_LM(ii->opType()) || + IS_OT_ATOMIC_LM(ii->opType())) { + return true; + } + + return false; +} + +bool +Wavefront::isOldestInstALU() +{ + assert(!instructionBuffer.empty()); + GPUDynInstPtr ii = instructionBuffer.front(); + + if (status != S_STOPPED && (ii->opType() == Enums::OT_NOP || + ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH || + ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) || + ii->opType() == Enums::OT_KERN_READ)) { + return true; + } + + return false; +} + +bool +Wavefront::isOldestInstBarrier() +{ + assert(!instructionBuffer.empty()); + GPUDynInstPtr ii = instructionBuffer.front(); + + if (status != S_STOPPED && ii->opType() == Enums::OT_BARRIER) { + return true; + } + + return false; +} + +bool +Wavefront::isOldestInstGMem() +{ + assert(!instructionBuffer.empty()); + GPUDynInstPtr ii = instructionBuffer.front(); + + if (status != S_STOPPED && (IS_OT_READ_GM(ii->opType()) || + IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()))) { + + return true; + } + + return false; +} + +bool +Wavefront::isOldestInstLMem() +{ + assert(!instructionBuffer.empty()); + GPUDynInstPtr ii = instructionBuffer.front(); + + if (status != S_STOPPED && (IS_OT_READ_LM(ii->opType()) || + IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()))) { + + return true; + } + + return false; +} + +bool +Wavefront::isOldestInstPrivMem() +{ + assert(!instructionBuffer.empty()); + GPUDynInstPtr ii = instructionBuffer.front(); + + if (status != S_STOPPED && (IS_OT_READ_PM(ii->opType()) || + IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()))) { + + return true; + } + + return false; +} + +bool +Wavefront::isOldestInstFlatMem() +{ + assert(!instructionBuffer.empty()); + GPUDynInstPtr ii = instructionBuffer.front(); + + if (status != S_STOPPED && IS_OT_FLAT(ii->opType())) { + + return true; + } + + return false; +} + +// Return true if the Wavefront's instruction +// buffer has branch instruction. +bool +Wavefront::instructionBufferHasBranch() +{ + for (auto it : instructionBuffer) { + GPUDynInstPtr ii = it; + + if (ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH) { + return true; + } + } + + return false; +} + +// Remap HSAIL register to physical VGPR. +// HSAIL register = virtual register assigned to an operand by HLC compiler +uint32_t +Wavefront::remap(uint32_t vgprIndex, uint32_t size, uint8_t mode) +{ + assert((vgprIndex < reservedVectorRegs) && (reservedVectorRegs > 0)); + // add the offset from where the VGPRs of the wavefront have been assigned + uint32_t physicalVgprIndex = startVgprIndex + vgprIndex; + // HSAIL double precision (DP) register: calculate the physical VGPR index + // assuming that DP registers are placed after SP ones in the VRF. 
The DP + // and SP VGPR name spaces in HSAIL mode are separate so we need to adjust + // the DP VGPR index before mapping it to the physical VRF address space + if (mode == 1 && size > 4) { + physicalVgprIndex = startVgprIndex + maxSpVgprs + (2 * vgprIndex); + } + + assert((startVgprIndex <= physicalVgprIndex) && + (startVgprIndex + reservedVectorRegs - 1) >= physicalVgprIndex); + + // calculate absolute physical VGPR index + return physicalVgprIndex % computeUnit->vrf[simdId]->numRegs(); +} + +// Return true if this wavefront is ready +// to execute an instruction of the specified type. +int +Wavefront::ready(itype_e type) +{ + // Check to make sure wave is running + if (status == S_STOPPED || status == S_RETURNING || + instructionBuffer.empty()) { + return 0; + } + + // Is the wave waiting at a barrier + if (stalledAtBarrier) { + if (!computeUnit->AllAtBarrier(barrier_id,barrier_cnt, + computeUnit->getRefCounter(dispatchid, wg_id))) { + // Are all threads at barrier? + return 0; + } + old_barrier_cnt = barrier_cnt; + stalledAtBarrier = false; + } + + // Read instruction + GPUDynInstPtr ii = instructionBuffer.front(); + + bool ready_inst M5_VAR_USED = false; + bool glbMemBusRdy = false; + bool glbMemIssueRdy = false; + if (type == I_GLOBAL || type == I_FLAT || type == I_PRIVATE) { + for (int j=0; j < computeUnit->numGlbMemUnits; ++j) { + if (computeUnit->vrfToGlobalMemPipeBus[j].prerdy()) + glbMemBusRdy = true; + if (computeUnit->wfWait[j].prerdy()) + glbMemIssueRdy = true; + } + } + bool locMemBusRdy = false; + bool locMemIssueRdy = false; + if (type == I_SHARED) { + for (int j=0; j < computeUnit->numLocMemUnits; ++j) { + if (computeUnit->vrfToLocalMemPipeBus[j].prerdy()) + locMemBusRdy = true; + if (computeUnit->wfWait[j].prerdy()) + locMemIssueRdy = true; + } + } + + // The following code is very error prone and the entire process for + // checking readiness will be fixed eventually. In the meantime, let's + // make sure that we do not silently let an instruction type slip + // through this logic and always return not ready. + if (!(ii->opType() == Enums::OT_BARRIER || ii->opType() == Enums::OT_NOP || + ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH || + ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) || + ii->opType() == Enums::OT_KERN_READ || + ii->opType() == Enums::OT_ARG || + IS_OT_READ_GM(ii->opType()) || IS_OT_WRITE_GM(ii->opType()) || + IS_OT_ATOMIC_GM(ii->opType()) || IS_OT_READ_LM(ii->opType()) || + IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()) || + IS_OT_READ_PM(ii->opType()) || IS_OT_WRITE_PM(ii->opType()) || + IS_OT_ATOMIC_PM(ii->opType()) || IS_OT_FLAT(ii->opType()))) { + panic("next instruction: %s is of unknown type\n", ii->disassemble()); + } + + DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Checking Read for Inst : %s\n", + computeUnit->cu_id, simdId, wfSlotId, ii->disassemble()); + + if (type == I_ALU && ii->opType() == Enums::OT_BARRIER) { + // Here for ALU instruction (barrier) + if (!computeUnit->wfWait[simdId].prerdy()) { + // Is wave slot free? + return 0; + } + + // Are there in pipe or outstanding memory requests? + if ((outstanding_reqs + mem_reqs_in_pipe) > 0) { + return 0; + } + + ready_inst = true; + } else if (type == I_ALU && ii->opType() == Enums::OT_NOP) { + // Here for ALU instruction (nop) + if (!computeUnit->wfWait[simdId].prerdy()) { + // Is wave slot free? 
+ return 0; + } + + ready_inst = true; + } else if (type == I_ALU && ii->opType() == Enums::OT_RET) { + // Here for ALU instruction (return) + if (!computeUnit->wfWait[simdId].prerdy()) { + // Is wave slot free? + return 0; + } + + // Are there in pipe or outstanding memory requests? + if ((outstanding_reqs + mem_reqs_in_pipe) > 0) { + return 0; + } + + ready_inst = true; + } else if (type == I_ALU && (ii->opType() == Enums::OT_BRANCH || + ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) || + ii->opType() == Enums::OT_KERN_READ || + ii->opType() == Enums::OT_ARG)) { + // Here for ALU instruction (all others) + if (!computeUnit->wfWait[simdId].prerdy()) { + // Is alu slot free? + return 0; + } + if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii, + VrfAccessType::RD_WR)) { + return 0; + } + + if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) { + return 0; + } + ready_inst = true; + } else if (type == I_GLOBAL && (IS_OT_READ_GM(ii->opType()) || + IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()))) { + // Here Global memory instruction + if (IS_OT_READ_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType())) { + // Are there in pipe or outstanding global memory write requests? + if ((outstanding_reqs_wr_gm + wr_gm_reqs_in_pipe) > 0) { + return 0; + } + } + + if (IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()) || + IS_OT_HIST_GM(ii->opType())) { + // Are there in pipe or outstanding global memory read requests? + if ((outstanding_reqs_rd_gm + rd_gm_reqs_in_pipe) > 0) + return 0; + } + + if (!glbMemIssueRdy) { + // Is WV issue slot free? + return 0; + } + + if (!glbMemBusRdy) { + // Is there an available VRF->Global memory read bus? + return 0; + } + + if (!computeUnit->globalMemoryPipe. + isGMReqFIFOWrRdy(rd_gm_reqs_in_pipe + wr_gm_reqs_in_pipe)) { + // Can we insert a new request to the Global Mem Request FIFO? + return 0; + } + // can we schedule source & destination operands on the VRF? + if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii, + VrfAccessType::RD_WR)) { + return 0; + } + if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) { + return 0; + } + ready_inst = true; + } else if (type == I_SHARED && (IS_OT_READ_LM(ii->opType()) || + IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()))) { + // Here for Shared memory instruction + if (IS_OT_READ_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType())) { + if ((outstanding_reqs_wr_lm + wr_lm_reqs_in_pipe) > 0) { + return 0; + } + } + + if (IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()) || + IS_OT_HIST_LM(ii->opType())) { + if ((outstanding_reqs_rd_lm + rd_lm_reqs_in_pipe) > 0) { + return 0; + } + } + + if (!locMemBusRdy) { + // Is there an available VRF->LDS read bus? + return 0; + } + if (!locMemIssueRdy) { + // Is wave slot free? + return 0; + } + + if (!computeUnit->localMemoryPipe. + isLMReqFIFOWrRdy(rd_lm_reqs_in_pipe + wr_lm_reqs_in_pipe)) { + // Can we insert a new request to the LDS Request FIFO? + return 0; + } + // can we schedule source & destination operands on the VRF? 
+ if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii, + VrfAccessType::RD_WR)) { + return 0; + } + if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) { + return 0; + } + ready_inst = true; + } else if (type == I_PRIVATE && (IS_OT_READ_PM(ii->opType()) || + IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()))) { + // Here for Private memory instruction ------------------------ // + if (IS_OT_READ_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType())) { + if ((outstanding_reqs_wr_gm + wr_gm_reqs_in_pipe) > 0) { + return 0; + } + } + + if (IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()) || + IS_OT_HIST_PM(ii->opType())) { + if ((outstanding_reqs_rd_gm + rd_gm_reqs_in_pipe) > 0) { + return 0; + } + } + + if (!glbMemBusRdy) { + // Is there an available VRF->Global memory read bus? + return 0; + } + + if (!glbMemIssueRdy) { + // Is wave slot free? + return 0; + } + + if (!computeUnit->globalMemoryPipe. + isGMReqFIFOWrRdy(rd_gm_reqs_in_pipe + wr_gm_reqs_in_pipe)) { + // Can we insert a new request to the Global Mem Request FIFO? + return 0; + } + // can we schedule source & destination operands on the VRF? + if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii, + VrfAccessType::RD_WR)) { + return 0; + } + if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) { + return 0; + } + ready_inst = true; + } else if (type == I_FLAT && IS_OT_FLAT(ii->opType())) { + if (!glbMemBusRdy) { + // Is there an available VRF->Global memory read bus? + return 0; + } + + if (!locMemBusRdy) { + // Is there an available VRF->LDS read bus? + return 0; + } + + if (!glbMemIssueRdy) { + // Is wave slot free? + return 0; + } + + if (!locMemIssueRdy) { + return 0; + } + if (!computeUnit->globalMemoryPipe. + isGMReqFIFOWrRdy(rd_gm_reqs_in_pipe + wr_gm_reqs_in_pipe)) { + // Can we insert a new request to the Global Mem Request FIFO? + return 0; + } + + if (!computeUnit->localMemoryPipe. + isLMReqFIFOWrRdy(rd_lm_reqs_in_pipe + wr_lm_reqs_in_pipe)) { + // Can we insert a new request to the LDS Request FIFO? + return 0; + } + // can we schedule source & destination operands on the VRF? + if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii, + VrfAccessType::RD_WR)) { + return 0; + } + // are all the operands ready? (RAW, WAW and WAR depedencies met?) + if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) { + return 0; + } + ready_inst = true; + } else { + return 0; + } + + assert(ready_inst); + + DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n", computeUnit->cu_id, + simdId, wfSlotId, ii->disassemble()); + + return 1; +} + +void +Wavefront::updateResources() +{ + // Get current instruction + GPUDynInstPtr ii = instructionBuffer.front(); + assert(ii); + computeUnit->vrf[simdId]->updateResources(this, ii); + // Single precision ALU or Branch or Return or Special instruction + if (ii->opType() == Enums::OT_ALU || ii->opType() == Enums::OT_SPECIAL || + ii->opType() == Enums::OT_BRANCH || IS_OT_LDAS(ii->opType()) || + // FIXME: Kernel argument loads are currently treated as ALU operations + // since we don't send memory packets at execution. 
If we fix that then + // we should map them to one of the memory pipelines + ii->opType()==Enums::OT_KERN_READ || + ii->opType()==Enums::OT_ARG || + ii->opType()==Enums::OT_RET) { + computeUnit->aluPipe[simdId].preset(computeUnit->shader-> + ticks(computeUnit->spBypassLength())); + // this is to enforce a fixed number of cycles per issue slot per SIMD + computeUnit->wfWait[simdId].preset(computeUnit->shader-> + ticks(computeUnit->issuePeriod)); + } else if (ii->opType() == Enums::OT_BARRIER) { + computeUnit->wfWait[simdId].preset(computeUnit->shader-> + ticks(computeUnit->issuePeriod)); + } else if (ii->opType() == Enums::OT_FLAT_READ) { + assert(Enums::SC_NONE != ii->executedAs()); + mem_reqs_in_pipe++; + rd_gm_reqs_in_pipe++; + if ( Enums::SC_SHARED == ii->executedAs() ) { + computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. + preset(computeUnit->shader->ticks(4)); + computeUnit->wfWait[computeUnit->ShrMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else { + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + preset(computeUnit->shader->ticks(4)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } + } else if (ii->opType() == Enums::OT_FLAT_WRITE) { + assert(Enums::SC_NONE != ii->executedAs()); + mem_reqs_in_pipe++; + wr_gm_reqs_in_pipe++; + if (Enums::SC_SHARED == ii->executedAs()) { + computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. + preset(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->ShrMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else { + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + preset(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } + } else if (IS_OT_READ_GM(ii->opType())) { + mem_reqs_in_pipe++; + rd_gm_reqs_in_pipe++; + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + preset(computeUnit->shader->ticks(4)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_WRITE_GM(ii->opType())) { + mem_reqs_in_pipe++; + wr_gm_reqs_in_pipe++; + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + preset(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_ATOMIC_GM(ii->opType())) { + mem_reqs_in_pipe++; + wr_gm_reqs_in_pipe++; + rd_gm_reqs_in_pipe++; + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + preset(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_READ_LM(ii->opType())) { + mem_reqs_in_pipe++; + rd_lm_reqs_in_pipe++; + computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. + preset(computeUnit->shader->ticks(4)); + computeUnit->wfWait[computeUnit->ShrMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_WRITE_LM(ii->opType())) { + mem_reqs_in_pipe++; + wr_lm_reqs_in_pipe++; + computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. + preset(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->ShrMemUnitId()]. 
+ preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_ATOMIC_LM(ii->opType())) { + mem_reqs_in_pipe++; + wr_lm_reqs_in_pipe++; + rd_lm_reqs_in_pipe++; + computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. + preset(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->ShrMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_READ_PM(ii->opType())) { + mem_reqs_in_pipe++; + rd_gm_reqs_in_pipe++; + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + preset(computeUnit->shader->ticks(4)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_WRITE_PM(ii->opType())) { + mem_reqs_in_pipe++; + wr_gm_reqs_in_pipe++; + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + preset(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_ATOMIC_PM(ii->opType())) { + mem_reqs_in_pipe++; + wr_gm_reqs_in_pipe++; + rd_gm_reqs_in_pipe++; + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + preset(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } +} + +void +Wavefront::exec() +{ + // ---- Exit if wavefront is inactive ----------------------------- // + + if (status == S_STOPPED || status == S_RETURNING || + instructionBuffer.empty()) { + return; + } + + // Get current instruction + + GPUDynInstPtr ii = instructionBuffer.front(); + + const uint32_t old_pc = pc(); + DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s " + "(pc: %i)\n", computeUnit->cu_id, simdId, wfSlotId, wfDynId, + ii->disassemble(), old_pc); + ii->execute(); + // access the VRF + computeUnit->vrf[simdId]->exec(ii, this); + srcRegOpDist.sample(ii->numSrcRegOperands()); + dstRegOpDist.sample(ii->numDstRegOperands()); + computeUnit->numInstrExecuted++; + computeUnit->execRateDist.sample(computeUnit->totalCycles.value() - + computeUnit->lastExecCycle[simdId]); + computeUnit->lastExecCycle[simdId] = computeUnit->totalCycles.value(); + if (pc() == old_pc) { + uint32_t new_pc = old_pc + 1; + // PC not modified by instruction, proceed to next or pop frame + pc(new_pc); + if (new_pc == rpc()) { + popFromReconvergenceStack(); + discardFetch(); + } else { + instructionBuffer.pop_front(); + } + } + + if (computeUnit->shader->hsail_mode==Shader::SIMT) { + const int num_active_lanes = execMask().count(); + computeUnit->controlFlowDivergenceDist.sample(num_active_lanes); + computeUnit->numVecOpsExecuted += num_active_lanes; + if (isGmInstruction(ii)) { + computeUnit->activeLanesPerGMemInstrDist.sample(num_active_lanes); + } else if (isLmInstruction(ii)) { + computeUnit->activeLanesPerLMemInstrDist.sample(num_active_lanes); + } + } + + // ---- Update Vector ALU pipeline and other resources ------------------ // + // Single precision ALU or Branch or Return or Special instruction + if (ii->opType() == Enums::OT_ALU || ii->opType() == Enums::OT_SPECIAL || + ii->opType() == Enums::OT_BRANCH || IS_OT_LDAS(ii->opType()) || + // FIXME: Kernel argument loads are currently treated as ALU operations + // since we don't send memory packets at execution. 
If we fix that then + // we should map them to one of the memory pipelines + ii->opType() == Enums::OT_KERN_READ || + ii->opType() == Enums::OT_ARG || + ii->opType() == Enums::OT_RET) { + computeUnit->aluPipe[simdId].set(computeUnit->shader-> + ticks(computeUnit->spBypassLength())); + + // this is to enforce a fixed number of cycles per issue slot per SIMD + computeUnit->wfWait[simdId].set(computeUnit->shader-> + ticks(computeUnit->issuePeriod)); + } else if (ii->opType() == Enums::OT_BARRIER) { + computeUnit->wfWait[simdId].set(computeUnit->shader-> + ticks(computeUnit->issuePeriod)); + } else if (ii->opType() == Enums::OT_FLAT_READ) { + assert(Enums::SC_NONE != ii->executedAs()); + + if (Enums::SC_SHARED == ii->executedAs()) { + computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. + set(computeUnit->shader->ticks(4)); + computeUnit->wfWait[computeUnit->ShrMemUnitId()]. + set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else { + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + set(computeUnit->shader->ticks(4)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } + } else if (ii->opType() == Enums::OT_FLAT_WRITE) { + assert(Enums::SC_NONE != ii->executedAs()); + if (Enums::SC_SHARED == ii->executedAs()) { + computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. + set(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->ShrMemUnitId()]. + set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else { + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + set(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } + } else if (IS_OT_READ_GM(ii->opType())) { + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + set(computeUnit->shader->ticks(4)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_WRITE_GM(ii->opType())) { + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + set(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_ATOMIC_GM(ii->opType())) { + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + set(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_READ_LM(ii->opType())) { + computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. + set(computeUnit->shader->ticks(4)); + computeUnit->wfWait[computeUnit->ShrMemUnitId()]. + set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_WRITE_LM(ii->opType())) { + computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. + set(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->ShrMemUnitId()]. + set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_ATOMIC_LM(ii->opType())) { + computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. + set(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->ShrMemUnitId()]. 
+ set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } +} + +bool +Wavefront::waitingAtBarrier(int lane) +{ + return bar_cnt[lane] < max_bar_cnt; +} + +void +Wavefront::pushToReconvergenceStack(uint32_t pc, uint32_t rpc, + const VectorMask& mask) +{ + assert(mask.count()); + reconvergenceStack.emplace(new ReconvergenceStackEntry(pc, rpc, mask)); +} + +void +Wavefront::popFromReconvergenceStack() +{ + assert(!reconvergenceStack.empty()); + + DPRINTF(WavefrontStack, "[%2d, %2d, %2d, %2d] %s %3i => ", + computeUnit->cu_id, simdId, wfSlotId, wfDynId, + execMask().to_string<char, std::string::traits_type, + std::string::allocator_type>().c_str(), pc()); + + reconvergenceStack.pop(); + + DPRINTF(WavefrontStack, "%3i %s\n", pc(), + execMask().to_string<char, std::string::traits_type, + std::string::allocator_type>().c_str()); + +} + +void +Wavefront::discardFetch() +{ + instructionBuffer.clear(); + dropFetch |=pendingFetch; +} + +uint32_t +Wavefront::pc() const +{ + return reconvergenceStack.top()->pc; +} + +uint32_t +Wavefront::rpc() const +{ + return reconvergenceStack.top()->rpc; +} + +VectorMask +Wavefront::execMask() const +{ + return reconvergenceStack.top()->execMask; +} + +bool +Wavefront::execMask(int lane) const +{ + return reconvergenceStack.top()->execMask[lane]; +} + + +void +Wavefront::pc(uint32_t new_pc) +{ + reconvergenceStack.top()->pc = new_pc; +} diff --git a/src/gpu-compute/wavefront.hh b/src/gpu-compute/wavefront.hh new file mode 100644 index 000000000..0abab8e83 --- /dev/null +++ b/src/gpu-compute/wavefront.hh @@ -0,0 +1,368 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Lisa Hsu + */ + +#ifndef __WAVEFRONT_HH__ +#define __WAVEFRONT_HH__ + +#include <cassert> +#include <deque> +#include <memory> +#include <stack> +#include <vector> + +#include "base/misc.hh" +#include "base/types.hh" +#include "gpu-compute/condition_register_state.hh" +#include "gpu-compute/lds_state.hh" +#include "gpu-compute/misc.hh" +#include "params/Wavefront.hh" +#include "sim/sim_object.hh" + +static const int MAX_NUM_INSTS_PER_WF = 12; + +/* + * Arguments for the hsail opcode call, are user defined and variable length. + * The hardware/finalizer can support arguments in hardware or use memory to + * pass arguments. For now, let's assume that an unlimited number of arguments + * are supported in hardware (the compiler inlines functions whenver it can + * anyways, so unless someone is interested in the implications of linking/ + * library functions, I think this is a reasonable assumption given the typical + * size of an OpenCL kernel). + * + * Note that call args are different than kernel arguments: + * * All work-items in a kernel refer the same set of kernel arguments + * * Each work-item has it's on set of call args. So a call argument at + * address 0x4 is different for work-item 0 and work-item 1. + * + * Ok, the table below shows an example of how we organize the call arguments in + * the CallArgMem class. + * + * int foo(int arg1, double arg2) + * ___________________________________________________ + * | 0: return.0 | 4: return.1 | ... | 252: return.63 | + * |---------------------------------------------------| + * | 256: arg1.0 | 260: arg1.1 | ... | 508: arg1.63 | + * |---------------------------------------------------| + * | 512: arg2.0 | 520: arg2.1 | ... | 1016: arg2.63 | + * ___________________________________________________ + */ +class CallArgMem +{ + public: + // pointer to buffer for storing function arguments + uint8_t *mem; + // size of function args + int funcArgsSizePerItem; + + template<typename CType> + int + getLaneOffset(int lane, int addr) + { + return addr * VSZ + sizeof(CType) * lane; + } + + CallArgMem(int func_args_size_per_item) + : funcArgsSizePerItem(func_args_size_per_item) + { + mem = (uint8_t*)malloc(funcArgsSizePerItem * VSZ); + } + + ~CallArgMem() + { + free(mem); + } + + template<typename CType> + uint8_t* + getLaneAddr(int lane, int addr) + { + return mem + getLaneOffset<CType>(lane, addr); + } + + template<typename CType> + void + setLaneAddr(int lane, int addr, CType val) + { + *((CType*)(mem + getLaneOffset<CType>(lane, addr))) = val; + } +}; + +/** + * A reconvergence stack entry conveys the necessary state to implement + * control flow divergence. + */ +class ReconvergenceStackEntry { + + public: + ReconvergenceStackEntry(uint32_t new_pc, uint32_t new_rpc, + VectorMask new_mask) : pc(new_pc), rpc(new_rpc), + execMask(new_mask) { + } + + /** + * PC of current instruction. + */ + uint32_t pc; + /** + * PC of the immediate post-dominator instruction, i.e., the value of + * @a pc for the first instruction that will be executed by the wavefront + * when a reconvergence point is reached. + */ + uint32_t rpc; + /** + * Execution mask. 
+ */ + VectorMask execMask; +}; + +class Wavefront : public SimObject +{ + public: + enum itype_e {I_ALU,I_GLOBAL,I_SHARED,I_FLAT,I_PRIVATE}; + enum status_e {S_STOPPED,S_RETURNING,S_RUNNING}; + + // Base pointer for array of instruction pointers + uint64_t base_ptr; + + uint32_t old_barrier_cnt; + uint32_t barrier_cnt; + uint32_t barrier_id; + uint32_t barrier_slots; + status_e status; + // HW slot id where the WF is mapped to inside a SIMD unit + int wfSlotId; + int kern_id; + // SIMD unit where the WV has been scheduled + int simdId; + // pointer to parent CU + ComputeUnit *computeUnit; + + std::deque<GPUDynInstPtr> instructionBuffer; + + bool pendingFetch; + bool dropFetch; + + // Condition Register State (for HSAIL simulations only) + class ConditionRegisterState *condRegState; + // number of single precision VGPRs required by WF + uint32_t maxSpVgprs; + // number of double precision VGPRs required by WF + uint32_t maxDpVgprs; + // map virtual to physical vector register + uint32_t remap(uint32_t vgprIndex, uint32_t size, uint8_t mode=0); + void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs); + bool isGmInstruction(GPUDynInstPtr ii); + bool isLmInstruction(GPUDynInstPtr ii); + bool isOldestInstGMem(); + bool isOldestInstLMem(); + bool isOldestInstPrivMem(); + bool isOldestInstFlatMem(); + bool isOldestInstALU(); + bool isOldestInstBarrier(); + // used for passing spill address to DDInstGPU + uint64_t last_addr[VSZ]; + uint32_t workitemid[3][VSZ]; + uint32_t workitemFlatId[VSZ]; + uint32_t workgroupid[3]; + uint32_t workgroupsz[3]; + uint32_t gridsz[3]; + uint32_t wg_id; + uint32_t wg_sz; + uint32_t dynwaveid; + uint32_t maxdynwaveid; + uint32_t dispatchid; + // outstanding global+local memory requests + uint32_t outstanding_reqs; + // memory requests between scoreboard + // and execute stage not yet executed + uint32_t mem_reqs_in_pipe; + // outstanding global memory write requests + uint32_t outstanding_reqs_wr_gm; + // outstanding local memory write requests + uint32_t outstanding_reqs_wr_lm; + // outstanding global memory read requests + uint32_t outstanding_reqs_rd_gm; + // outstanding local memory read requests + uint32_t outstanding_reqs_rd_lm; + uint32_t rd_lm_reqs_in_pipe; + uint32_t rd_gm_reqs_in_pipe; + uint32_t wr_lm_reqs_in_pipe; + uint32_t wr_gm_reqs_in_pipe; + + int mem_trace_busy; + uint64_t last_trace; + // number of vector registers reserved by WF + int reservedVectorRegs; + // Index into the Vector Register File's namespace where the WF's registers + // will live while the WF is executed + uint32_t startVgprIndex; + + // Old value of destination gpr (for trace) + uint32_t old_vgpr[VSZ]; + // Id of destination gpr (for trace) + uint32_t old_vgpr_id; + // Tick count of last old_vgpr copy + uint64_t old_vgpr_tcnt; + + // Old value of destination gpr (for trace) + uint64_t old_dgpr[VSZ]; + // Id of destination gpr (for trace) + uint32_t old_dgpr_id; + // Tick count of last old_vgpr copy + uint64_t old_dgpr_tcnt; + + // Execution mask at wavefront start + VectorMask init_mask; + + // number of barriers this WF has joined + int bar_cnt[VSZ]; + int max_bar_cnt; + // Flag to stall a wave on barrier + bool stalledAtBarrier; + + // a pointer to the fraction of the LDS allocated + // to this workgroup (thus this wavefront) + LdsChunk *ldsChunk; + + // A pointer to the spill area + Addr spillBase; + // The size of the spill area + uint32_t spillSizePerItem; + // The vector width of the spill area + uint32_t spillWidth; + + // A pointer to the private memory area + 
    Addr privBase;
+    // The size of the private memory area
+    uint32_t privSizePerItem;
+
+    // A pointer to the read-only memory area
+    Addr roBase;
+    // size of the read-only memory area
+    uint32_t roSize;
+
+    // pointer to buffer for storing kernel arguments
+    uint8_t *kernelArgs;
+    // unique WF id over all WFs executed across all CUs
+    uint64_t wfDynId;
+
+    // number of times instruction issue for this wavefront is blocked
+    // due to VRF port availability
+    Stats::Scalar numTimesBlockedDueVrfPortAvail;
+    // number of times an instruction of a WF is blocked from being issued
+    // due to WAR and WAW dependencies
+    Stats::Scalar numTimesBlockedDueWAXDependencies;
+    // number of times an instruction of a WF is blocked from being issued
+    // due to RAW dependencies
+    Stats::Scalar numTimesBlockedDueRAWDependencies;
+    // distribution of executed instructions based on their register
+    // operands; this is used to highlight the load on the VRF
+    Stats::Distribution srcRegOpDist;
+    Stats::Distribution dstRegOpDist;
+
+    // Functions to operate on call argument memory
+    // argument memory for hsail call instruction
+    CallArgMem *callArgMem;
+    void
+    initCallArgMem(int func_args_size_per_item)
+    {
+        callArgMem = new CallArgMem(func_args_size_per_item);
+    }
+
+    template<typename CType>
+    CType
+    readCallArgMem(int lane, int addr)
+    {
+        return *((CType*)(callArgMem->getLaneAddr<CType>(lane, addr)));
+    }
+
+    template<typename CType>
+    void
+    writeCallArgMem(int lane, int addr, CType val)
+    {
+        callArgMem->setLaneAddr<CType>(lane, addr, val);
+    }
+
+    typedef WavefrontParams Params;
+    Wavefront(const Params *p);
+    ~Wavefront();
+    virtual void init();
+
+    void
+    setParent(ComputeUnit *cu)
+    {
+        computeUnit = cu;
+    }
+
+    void start(uint64_t _wfDynId, uint64_t _base_ptr);
+
+    void exec();
+    void updateResources();
+    int ready(itype_e type);
+    bool instructionBufferHasBranch();
+    void regStats();
+    VectorMask get_pred() { return execMask() & init_mask; }
+
+    bool waitingAtBarrier(int lane);
+
+    void pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
+                                  const VectorMask& exec_mask);
+
+    void popFromReconvergenceStack();
+
+    uint32_t pc() const;
+
+    uint32_t rpc() const;
+
+    VectorMask execMask() const;
+
+    bool execMask(int lane) const;
+
+    void pc(uint32_t new_pc);
+
+    void discardFetch();
+
+  private:
+    /**
+     * Stack containing Control Flow Graph nodes (i.e., kernel instructions)
+     * to be visited by the wavefront, and the associated execution masks. The
+     * reconvergence stack grows every time the wavefront reaches a divergence
+     * point (branch instruction), and shrinks every time the wavefront
+     * reaches a reconvergence point (immediate post-dominator instruction).
+     */
+    std::stack<std::unique_ptr<ReconvergenceStackEntry>> reconvergenceStack;
+};
+
+#endif // __WAVEFRONT_HH__
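
The reconvergence stack declared at the end of wavefront.hh, together with the tail of Wavefront::exec() in wavefront.cc, is what implements SIMT branch divergence: a divergent branch pushes an entry carrying the taken-path PC, the reconvergence PC (the immediate post-dominator) and the active-lane mask, and that entry is popped once the PC reaches the reconvergence point, restoring the wider mask underneath. The following is a minimal standalone sketch of that push/pop discipline only; the ToyWavefront class, the 64-lane mask width, and the hard-coded PCs are illustrative assumptions, not gem5 code.

#include <bitset>
#include <cassert>
#include <cstdint>
#include <iostream>
#include <memory>
#include <stack>

using VectorMask = std::bitset<64>;   // assumed 64-lane wavefront

// Mirrors the fields of ReconvergenceStackEntry: current PC, reconvergence
// PC (immediate post-dominator), and the lanes active on this path.
struct StackEntry
{
    uint32_t pc;
    uint32_t rpc;
    VectorMask execMask;
};

class ToyWavefront
{
  public:
    void
    push(uint32_t pc, uint32_t rpc, const VectorMask &mask)
    {
        assert(mask.any());
        stack.emplace(new StackEntry{pc, rpc, mask});
    }

    void pop() { assert(stack.size() > 1); stack.pop(); }

    uint32_t pc() const { return stack.top()->pc; }
    void pc(uint32_t new_pc) { stack.top()->pc = new_pc; }
    uint32_t rpc() const { return stack.top()->rpc; }
    VectorMask execMask() const { return stack.top()->execMask; }

    // Advance past the current instruction; pop once the divergent path
    // reaches its reconvergence point (cf. the tail of Wavefront::exec()).
    void
    step()
    {
        uint32_t new_pc = pc() + 1;
        pc(new_pc);
        if (new_pc == rpc())
            pop();
    }

  private:
    std::stack<std::unique_ptr<StackEntry>> stack;
};

int
main()
{
    ToyWavefront wf;
    wf.push(0, UINT32_MAX, VectorMask().set());   // all lanes, never popped

    // Divergent branch at PC 0: the parent entry waits at the reconvergence
    // point (PC 4) while lanes 0-31 execute the taken path at PCs 1-3.
    VectorMask taken;
    for (int lane = 0; lane < 32; ++lane)
        taken.set(lane);
    wf.pc(4);
    wf.push(1, 4, taken);

    while (wf.pc() != 4) {
        std::cout << "pc " << wf.pc() << ", active lanes "
                  << wf.execMask().count() << "\n";
        wf.step();
    }
    std::cout << "reconverged at pc " << wf.pc() << ", active lanes "
              << wf.execMask().count() << "\n";
    return 0;
}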
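
CallArgMem in wavefront.hh lays out each call argument as a row of per-lane slots, so the byte offset of a value is addr * VSZ + sizeof(CType) * lane, exactly as getLaneOffset() computes it. Below is a self-contained sketch of that addressing scheme; NumLanes stands in for gem5's VSZ constant and ToyCallArgMem is a hypothetical stand-in for the real class.

#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

constexpr int NumLanes = 64;   // stands in for gem5's VSZ constant

// Per-lane call-argument storage: the per-lane argument address selects a
// row of NumLanes slots, and the lane index selects the slot in that row.
class ToyCallArgMem
{
  public:
    explicit ToyCallArgMem(int bytes_per_lane)
        : mem(bytes_per_lane * NumLanes, 0) {}

    template<typename T>
    int laneOffset(int lane, int addr) const
    {
        return addr * NumLanes + sizeof(T) * lane;
    }

    template<typename T>
    void write(int lane, int addr, T val)
    {
        std::memcpy(mem.data() + laneOffset<T>(lane, addr), &val, sizeof(T));
    }

    template<typename T>
    T read(int lane, int addr) const
    {
        T val;
        std::memcpy(&val, mem.data() + laneOffset<T>(lane, addr), sizeof(T));
        return val;
    }

  private:
    std::vector<uint8_t> mem;
};

int
main()
{
    // Layout from the example table in the header: return value at per-lane
    // address 0, int arg1 at 4, double arg2 at 8 -> 16 bytes per work-item.
    ToyCallArgMem args(16);

    for (int lane = 0; lane < NumLanes; ++lane)
        args.write<int>(lane, 4, lane * 10);          // arg1 for every lane

    // arg1 for lane 1 lands at byte 4 * 64 + 4 * 1 = 260, matching the table.
    std::cout << "lane 1, arg1 = " << args.read<int>(1, 4) << "\n";
    return 0;
}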
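
VectorRegisterFile (vector_register_file.cc/.hh earlier in this patch) tracks register dependences with two per-register flags: updateResources() pre-marks an in-flight destination (nxtBusy), exec() then marks it busy until a scheduled event clears it, and a double-precision operand also occupies the next register, which is why regBusy()/regNxtBusy() also test (idx + 1) % numRegs(). The sketch below reproduces just that scoreboard logic; ToyVrfScoreboard and the driver values are illustrative assumptions rather than the gem5 class.

#include <cstdint>
#include <iostream>
#include <vector>

// Minimal stand-in for the busy/nxtBusy scoreboard in VectorRegisterFile.
// An 8-byte operand occupies two consecutive registers, hence the second
// lookup at (idx + 1) % numRegs().
class ToyVrfScoreboard
{
  public:
    explicit ToyVrfScoreboard(int num_regs)
        : busy(num_regs, 0), nxtBusy(num_regs, 0) {}

    int numRegs() const { return busy.size(); }

    bool regBusy(int idx, uint32_t operand_size) const
    {
        bool b = busy[idx];
        if (operand_size > 4)
            b = b || busy[(idx + 1) % numRegs()];
        return b;
    }

    bool regNxtBusy(int idx, uint32_t operand_size) const
    {
        bool b = nxtBusy[idx];
        if (operand_size > 4)
            b = b || nxtBusy[(idx + 1) % numRegs()];
        return b;
    }

    // analogue of preMarkReg(): destination claimed by an in-flight producer
    void preMark(int idx, uint32_t operand_size, uint8_t v)
    {
        nxtBusy[idx] = v;
        if (operand_size > 4)
            nxtBusy[(idx + 1) % numRegs()] = v;
    }

    // analogue of markReg(): destination busy until the pipeline writes back
    void mark(int idx, uint32_t operand_size, uint8_t v)
    {
        busy[idx] = v;
        if (operand_size > 4)
            busy[(idx + 1) % numRegs()] = v;
    }

    // analogue of the per-operand check in operandsReady()
    bool ready(int idx, uint32_t operand_size) const
    {
        return !regBusy(idx, operand_size) && !regNxtBusy(idx, operand_size);
    }

  private:
    std::vector<uint8_t> busy;
    std::vector<uint8_t> nxtBusy;
};

int
main()
{
    ToyVrfScoreboard vrf(2048);                 // assumed VRF size per SIMD

    vrf.preMark(10, 8, 1);                      // issue a DP producer of v10:v11
    std::cout << std::boolalpha
              << "v11 ready? " << vrf.ready(11, 4) << "\n";   // false

    vrf.mark(10, 8, 1);                         // producer enters the ALU pipe
    vrf.preMark(10, 8, 0);
    vrf.mark(10, 8, 0);                         // result written back
    std::cout << "v11 ready? " << vrf.ready(11, 4) << "\n";   // true
    return 0;
}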
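
TLBCoalescer (tlb_coalescer.hh above) buckets incoming translation requests by tick_index = issueTime / coalescingWindow and, within a bucket, merges requests that fall on the same virtual page before probing the TLB, subject to the further rules in canCoalesce() and the TLBProbesPerCycle limit. The sketch below shows only that bucketing step; the ToyReq struct (standing in for PacketPtr), the 4 KB page size, and the sample addresses are assumptions for illustration.

#include <cstdint>
#include <iostream>
#include <map>
#include <unordered_map>
#include <vector>

using Addr = uint64_t;
using Tick = uint64_t;

constexpr Addr PageBytes = 4096;   // assumed x86-style 4 KB pages

// Stand-in for one uncoalesced translation request.
struct ToyReq
{
    Addr vaddr;
    Tick issueTime;
};

// Requests keyed by (issueTime / window, virtual page): everything mapping
// to the same key becomes a single coalesced TLB probe, mirroring the roles
// of coalescerFIFO and issuedTranslationsTable in the header.
using Fifo = std::map<int64_t, std::unordered_map<Addr, std::vector<ToyReq>>>;

Fifo
coalesce(const std::vector<ToyReq> &reqs, Tick window)
{
    Fifo fifo;
    for (const auto &r : reqs) {
        int64_t tick_index = r.issueTime / window;
        Addr virt_page = r.vaddr & ~(PageBytes - 1);
        fifo[tick_index][virt_page].push_back(r);
    }
    return fifo;
}

int
main()
{
    // Three requests in the same window; two share a virtual page, so this
    // window would need only two TLB probes.
    std::vector<ToyReq> reqs = {
        {0x1000, 100}, {0x1040, 101}, {0x5000, 102},
    };

    const Fifo fifo = coalesce(reqs, 1000);
    for (const auto &bucket : fifo) {
        for (const auto &page : bucket.second) {
            std::cout << "window " << bucket.first << ", page 0x" << std::hex
                      << page.first << std::dec << " -> "
                      << page.second.size() << " request(s)\n";
        }
    }
    return 0;
}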