author    Tony Gutierrez <anthony.gutierrez@amd.com>    2016-01-19 14:28:22 -0500
committer Tony Gutierrez <anthony.gutierrez@amd.com>    2016-01-19 14:28:22 -0500
commit    1a7d3f9fcb76a68540dd948f91413533a383bfde (patch)
tree      867510a147cd095f19499d26b7c02d27de4cae9d /src/gpu-compute
parent    28e353e0403ea379d244a418e8dc8ee0b48187cf (diff)
download  gem5-1a7d3f9fcb76a68540dd948f91413533a383bfde.tar.xz
gpu-compute: AMD's baseline GPU model
Diffstat (limited to 'src/gpu-compute')
-rw-r--r--  src/gpu-compute/GPU.py | 310
-rw-r--r--  src/gpu-compute/LdsState.py | 51
-rw-r--r--  src/gpu-compute/SConscript | 99
-rw-r--r--  src/gpu-compute/X86GPUTLB.py | 77
-rw-r--r--  src/gpu-compute/brig_object.cc | 474
-rw-r--r--  src/gpu-compute/brig_object.hh | 134
-rw-r--r--  src/gpu-compute/cl_driver.cc | 272
-rw-r--r--  src/gpu-compute/cl_driver.hh | 77
-rw-r--r--  src/gpu-compute/cl_event.hh | 51
-rw-r--r--  src/gpu-compute/code_enums.hh | 116
-rw-r--r--  src/gpu-compute/compute_unit.cc | 1817
-rw-r--r--  src/gpu-compute/compute_unit.hh | 767
-rw-r--r--  src/gpu-compute/condition_register_state.cc | 83
-rw-r--r--  src/gpu-compute/condition_register_state.hh | 101
-rw-r--r--  src/gpu-compute/dispatcher.cc | 394
-rw-r--r--  src/gpu-compute/dispatcher.hh | 163
-rw-r--r--  src/gpu-compute/exec_stage.cc | 203
-rw-r--r--  src/gpu-compute/exec_stage.hh | 129
-rw-r--r--  src/gpu-compute/fetch_stage.cc | 106
-rw-r--r--  src/gpu-compute/fetch_stage.hh | 78
-rw-r--r--  src/gpu-compute/fetch_unit.cc | 293
-rw-r--r--  src/gpu-compute/fetch_unit.hh | 89
-rw-r--r--  src/gpu-compute/global_memory_pipeline.cc | 242
-rw-r--r--  src/gpu-compute/global_memory_pipeline.hh | 123
-rw-r--r--  src/gpu-compute/gpu_dyn_inst.cc | 198
-rw-r--r--  src/gpu-compute/gpu_dyn_inst.hh | 464
-rw-r--r--  src/gpu-compute/gpu_exec_context.cc | 53
-rw-r--r--  src/gpu-compute/gpu_exec_context.hh | 54
-rw-r--r--  src/gpu-compute/gpu_static_inst.cc | 42
-rw-r--r--  src/gpu-compute/gpu_static_inst.hh | 166
-rw-r--r--  src/gpu-compute/gpu_tlb.cc | 1801
-rw-r--r--  src/gpu-compute/gpu_tlb.hh | 465
-rw-r--r--  src/gpu-compute/hsa_code.hh | 101
-rw-r--r--  src/gpu-compute/hsa_kernel_info.hh | 79
-rw-r--r--  src/gpu-compute/hsa_object.cc | 76
-rw-r--r--  src/gpu-compute/hsa_object.hh | 74
-rw-r--r--  src/gpu-compute/hsail_code.cc | 453
-rw-r--r--  src/gpu-compute/hsail_code.hh | 447
-rw-r--r--  src/gpu-compute/kernel_cfg.cc | 296
-rw-r--r--  src/gpu-compute/kernel_cfg.hh | 133
-rw-r--r--  src/gpu-compute/lds_state.cc | 341
-rw-r--r--  src/gpu-compute/lds_state.hh | 512
-rw-r--r--  src/gpu-compute/local_memory_pipeline.cc | 200
-rw-r--r--  src/gpu-compute/local_memory_pipeline.hh | 98
-rw-r--r--  src/gpu-compute/misc.hh | 162
-rw-r--r--  src/gpu-compute/ndrange.hh | 70
-rw-r--r--  src/gpu-compute/of_scheduling_policy.cc | 76
-rw-r--r--  src/gpu-compute/of_scheduling_policy.hh | 61
-rw-r--r--  src/gpu-compute/pool_manager.cc | 42
-rw-r--r--  src/gpu-compute/pool_manager.hh | 66
-rw-r--r--  src/gpu-compute/qstruct.hh | 201
-rw-r--r--  src/gpu-compute/rr_scheduling_policy.cc | 67
-rw-r--r--  src/gpu-compute/rr_scheduling_policy.hh | 65
-rw-r--r--  src/gpu-compute/schedule_stage.cc | 151
-rw-r--r--  src/gpu-compute/schedule_stage.hh | 95
-rw-r--r--  src/gpu-compute/scheduler.cc | 71
-rw-r--r--  src/gpu-compute/scheduler.hh | 63
-rw-r--r--  src/gpu-compute/scheduling_policy.hh | 57
-rw-r--r--  src/gpu-compute/scoreboard_check_stage.cc | 173
-rw-r--r--  src/gpu-compute/scoreboard_check_stage.hh | 106
-rw-r--r--  src/gpu-compute/shader.cc | 412
-rw-r--r--  src/gpu-compute/shader.hh | 212
-rw-r--r--  src/gpu-compute/simple_pool_manager.cc | 108
-rw-r--r--  src/gpu-compute/simple_pool_manager.hh | 72
-rw-r--r--  src/gpu-compute/tlb_coalescer.cc | 583
-rw-r--r--  src/gpu-compute/tlb_coalescer.hh | 252
-rw-r--r--  src/gpu-compute/vector_register_file.cc | 251
-rw-r--r--  src/gpu-compute/vector_register_file.hh | 142
-rw-r--r--  src/gpu-compute/vector_register_state.cc | 58
-rw-r--r--  src/gpu-compute/vector_register_state.hh | 101
-rw-r--r--  src/gpu-compute/wavefront.cc | 925
-rw-r--r--  src/gpu-compute/wavefront.hh | 368
72 files changed, 17312 insertions, 0 deletions
diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py
new file mode 100644
index 000000000..bd95f6335
--- /dev/null
+++ b/src/gpu-compute/GPU.py
@@ -0,0 +1,310 @@
+#
+# Copyright (c) 2015 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Author: Steve Reinhardt
+#
+
+from ClockedObject import ClockedObject
+from Device import DmaDevice
+from m5.defines import buildEnv
+from m5.params import *
+from m5.proxy import *
+from m5.SimObject import SimObject
+from MemObject import MemObject
+from Process import EmulatedDriver
+from Bridge import Bridge
+from LdsState import LdsState
+
+class PrefetchType(Enum): vals = [
+ 'PF_CU',
+ 'PF_PHASE',
+ 'PF_WF',
+ 'PF_STRIDE',
+ 'PF_END',
+ ]
+
+class VectorRegisterFile(SimObject):
+ type = 'VectorRegisterFile'
+ cxx_class = 'VectorRegisterFile'
+ cxx_header = 'gpu-compute/vector_register_file.hh'
+
+ simd_id = Param.Int(0, 'SIMD ID associated with this VRF')
+ num_regs_per_simd = Param.Int(2048, 'number of vector registers per SIMD')
+ min_alloc = Param.Int(4, 'min number of VGPRs allocated per WF')
+
+class Wavefront(SimObject):
+ type = 'Wavefront'
+ cxx_class = 'Wavefront'
+ cxx_header = 'gpu-compute/wavefront.hh'
+
+ simdId = Param.Int('SIMD id (0-ComputeUnit.num_SIMDs)')
+ wf_slot_id = Param.Int('wavefront id (0-ComputeUnit.max_wfs)')
+
+class ComputeUnit(MemObject):
+ type = 'ComputeUnit'
+ cxx_class = 'ComputeUnit'
+ cxx_header = 'gpu-compute/compute_unit.hh'
+
+ wavefronts = VectorParam.Wavefront('Number of wavefronts')
+ wfSize = Param.Int(64, 'Wavefront size (in work items)')
+ num_SIMDs = Param.Int(4, 'number of SIMD units per CU')
+
+ spbypass_pipe_length = Param.Int(4, 'vector ALU Single Precision bypass '\
+ 'latency')
+
+ dpbypass_pipe_length = Param.Int(8, 'vector ALU Double Precision bypass '\
+ 'latency')
+
+ issue_period = Param.Int(4, 'number of cycles per issue period')
+ num_global_mem_pipes = Param.Int(1, 'number of global memory pipes per CU')
+ num_shared_mem_pipes = Param.Int(1, 'number of shared memory pipes per CU')
+ n_wf = Param.Int(1, 'Number of wavefront slots per SIMD')
+ mem_req_latency = Param.Int(9, "Latency for requests from the cu to ruby. "\
+ "Represents the pipeline to reach the TCP and "\
+ "is specified in GPU clock cycles")
+ mem_resp_latency = Param.Int(9, "Latency for responses from ruby to the "\
+ "cu. Represents the pipeline between the TCP "\
+ "and cu as well as TCP data array access. "\
+ "Specified in GPU clock cycles")
+ system = Param.System(Parent.any, "system object")
+ cu_id = Param.Int('CU id')
+ vrf_to_coalescer_bus_width = Param.Int(32, "VRF->Coalescer data bus width "\
+ "in bytes")
+ coalescer_to_vrf_bus_width = Param.Int(32, "Coalescer->VRF data bus width "\
+ "in bytes")
+
+ memory_port = VectorMasterPort("Port to the memory system")
+ translation_port = VectorMasterPort('Port to the TLB hierarchy')
+ sqc_port = MasterPort("Port to the SQC (I-cache)")
+ sqc_tlb_port = MasterPort("Port to the TLB for the SQC (I-cache)")
+ perLaneTLB = Param.Bool(False, "enable per-lane TLB")
+ prefetch_depth = Param.Int(0, "Number of prefetches triggered at a time "\
+ "(0 turns off prefetching)")
+ prefetch_stride = Param.Int(1, "Fixed Prefetch Stride (1 means next-page)")
+ prefetch_prev_type = Param.PrefetchType('PF_PHASE', "Prefetch the stride "\
+ "from last mem req in lane of "\
+ "CU|Phase|Wavefront")
+ execPolicy = Param.String("OLDEST-FIRST", "WF execution selection policy")
+ xactCasMode = Param.Bool(False, "Behavior of xact_cas_load magic instr.")
+ debugSegFault = Param.Bool(False, "enable debugging GPU seg faults")
+ functionalTLB = Param.Bool(False, "Assume TLB causes no delay")
+
+ localMemBarrier = Param.Bool(False, "Assume Barriers do not wait on "\
+ "kernel end")
+
+ countPages = Param.Bool(False, "Generate per-CU file of all pages touched "\
+ "and how many times")
+ global_mem_queue_size = Param.Int(256, "Number of entries in the global "
+ "memory pipeline's queues")
+ local_mem_queue_size = Param.Int(256, "Number of entries in the local "
+ "memory pipeline's queues")
+ ldsBus = Bridge() # the bridge between the CU and its LDS
+ ldsPort = MasterPort("The port that goes to the LDS")
+ localDataStore = Param.LdsState("the LDS for this CU")
+
+ vector_register_file = VectorParam.VectorRegisterFile("Vector register "\
+ "file")
+
+class Shader(ClockedObject):
+ type = 'Shader'
+ cxx_class = 'Shader'
+ cxx_header = 'gpu-compute/shader.hh'
+
+ CUs = VectorParam.ComputeUnit('Number of compute units')
+ n_wf = Param.Int(1, 'Number of wavefront slots per SIMD')
+ impl_kern_boundary_sync = Param.Bool(True, """Insert acq/rel packets into
+ ruby at kernel boundaries""")
+ separate_acquire_release = Param.Bool(False,
+ """Do ld_acquire/st_release generate separate requests for the
+ acquire and release?""")
+ globalmem = Param.MemorySize('64kB', 'Memory size')
+ timing = Param.Bool(False, 'timing memory accesses')
+
+ cpu_pointer = Param.BaseCPU(NULL, "pointer to base CPU")
+ translation = Param.Bool(False, "address translation")
+
+class ClDriver(EmulatedDriver):
+ type = 'ClDriver'
+ cxx_header = 'gpu-compute/cl_driver.hh'
+ codefile = VectorParam.String('code file name(s)')
+
+class GpuDispatcher(DmaDevice):
+ type = 'GpuDispatcher'
+ cxx_header = 'gpu-compute/dispatcher.hh'
+ # put at 8GB line for now
+ pio_addr = Param.Addr(0x200000000, "Device Address")
+ pio_latency = Param.Latency('1ns', "Programmed IO latency")
+ shader_pointer = Param.Shader('pointer to shader')
+ translation_port = MasterPort('Port to the dispatcher TLB')
+ cpu = Param.BaseCPU("CPU to wake up on kernel completion")
+
+ cl_driver = Param.ClDriver('pointer to driver')
+
+class OpType(Enum): vals = [
+ 'OT_NULL',
+ 'OT_ALU',
+ 'OT_SPECIAL',
+ 'OT_GLOBAL_READ',
+ 'OT_GLOBAL_WRITE',
+ 'OT_GLOBAL_ATOMIC',
+ 'OT_GLOBAL_HIST',
+ 'OT_GLOBAL_LDAS',
+ 'OT_SHARED_READ',
+ 'OT_SHARED_WRITE',
+ 'OT_SHARED_ATOMIC',
+ 'OT_SHARED_HIST',
+ 'OT_SHARED_LDAS',
+ 'OT_PRIVATE_READ',
+ 'OT_PRIVATE_WRITE',
+ 'OT_PRIVATE_ATOMIC',
+ 'OT_PRIVATE_HIST',
+ 'OT_PRIVATE_LDAS',
+ 'OT_SPILL_READ',
+ 'OT_SPILL_WRITE',
+ 'OT_SPILL_ATOMIC',
+ 'OT_SPILL_HIST',
+ 'OT_SPILL_LDAS',
+ 'OT_READONLY_READ',
+ 'OT_READONLY_WRITE',
+ 'OT_READONLY_ATOMIC',
+ 'OT_READONLY_HIST',
+ 'OT_READONLY_LDAS',
+ 'OT_FLAT_READ',
+ 'OT_FLAT_WRITE',
+ 'OT_FLAT_ATOMIC',
+ 'OT_FLAT_HIST',
+ 'OT_FLAT_LDAS',
+ 'OT_KERN_READ',
+ 'OT_BRANCH',
+
+ # note: Only the OT_BOTH_MEMFENCE seems to be supported in the 1.0F version
+ # of the compiler.
+ 'OT_SHARED_MEMFENCE',
+ 'OT_GLOBAL_MEMFENCE',
+ 'OT_BOTH_MEMFENCE',
+
+ 'OT_BARRIER',
+ 'OT_PRINT',
+ 'OT_RET',
+ 'OT_NOP',
+ 'OT_ARG'
+ ]
+
+class MemType(Enum): vals = [
+ 'M_U8',
+ 'M_U16',
+ 'M_U32',
+ 'M_U64',
+ 'M_S8',
+ 'M_S16',
+ 'M_S32',
+ 'M_S64',
+ 'M_F16',
+ 'M_F32',
+ 'M_F64',
+ ]
+
+class MemOpType(Enum): vals = [
+ 'MO_LD',
+ 'MO_ST',
+ 'MO_LDAS',
+ 'MO_LDA',
+ 'MO_AAND',
+ 'MO_AOR',
+ 'MO_AXOR',
+ 'MO_ACAS',
+ 'MO_AEXCH',
+ 'MO_AADD',
+ 'MO_ASUB',
+ 'MO_AINC',
+ 'MO_ADEC',
+ 'MO_AMAX',
+ 'MO_AMIN',
+ 'MO_ANRAND',
+ 'MO_ANROR',
+ 'MO_ANRXOR',
+ 'MO_ANRCAS',
+ 'MO_ANREXCH',
+ 'MO_ANRADD',
+ 'MO_ANRSUB',
+ 'MO_ANRINC',
+ 'MO_ANRDEC',
+ 'MO_ANRMAX',
+ 'MO_ANRMIN',
+ 'MO_HAND',
+ 'MO_HOR',
+ 'MO_HXOR',
+ 'MO_HCAS',
+ 'MO_HEXCH',
+ 'MO_HADD',
+ 'MO_HSUB',
+ 'MO_HINC',
+ 'MO_HDEC',
+ 'MO_HMAX',
+ 'MO_HMIN',
+ 'MO_UNDEF'
+ ]
+
+class StorageClassType(Enum): vals = [
+ 'SC_SPILL',
+ 'SC_GLOBAL',
+ 'SC_SHARED',
+ 'SC_PRIVATE',
+ 'SC_READONLY',
+ 'SC_KERNARG',
+ 'SC_NONE',
+ ]
+
+class RegisterType(Enum): vals = [
+ 'RT_VECTOR',
+ 'RT_SCALAR',
+ 'RT_CONDITION',
+ 'RT_HARDWARE',
+ 'RT_NONE',
+ ]
+
+class GenericMemoryOrder(Enum): vals = [
+ 'MEMORY_ORDER_NONE',
+ 'MEMORY_ORDER_RELAXED',
+ 'MEMORY_ORDER_SC_ACQUIRE',
+ 'MEMORY_ORDER_SC_RELEASE',
+ 'MEMORY_ORDER_SC_ACQUIRE_RELEASE',
+ ]
+
+class GenericMemoryScope(Enum): vals = [
+ 'MEMORY_SCOPE_NONE',
+ 'MEMORY_SCOPE_WORKITEM',
+ 'MEMORY_SCOPE_WAVEFRONT',
+ 'MEMORY_SCOPE_WORKGROUP',
+ 'MEMORY_SCOPE_DEVICE',
+ 'MEMORY_SCOPE_SYSTEM',
+ ]
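
The SimObject classes above form the Python-side configuration interface of the GPU model. As a rough illustration only (the values are arbitrary, the local helper names belong to this sketch rather than the patch, and the system/memory-port wiring is omitted), a configuration script could instantiate them along these lines:

    # illustrative config sketch, not part of this patch
    num_simds, wf_slots = 4, 8
    wavefronts = [Wavefront(simdId=i, wf_slot_id=j)
                  for i in range(num_simds) for j in range(wf_slots)]
    vrfs = [VectorRegisterFile(simd_id=i, num_regs_per_simd=2048)
            for i in range(num_simds)]
    cu = ComputeUnit(cu_id=0, num_SIMDs=num_simds, n_wf=wf_slots,
                     wavefronts=wavefronts, vector_register_file=vrfs,
                     localDataStore=LdsState(banks=32))
    shader = Shader(CUs=[cu], n_wf=wf_slots, timing=True)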
diff --git a/src/gpu-compute/LdsState.py b/src/gpu-compute/LdsState.py
new file mode 100644
index 000000000..6ea9f6427
--- /dev/null
+++ b/src/gpu-compute/LdsState.py
@@ -0,0 +1,51 @@
+#
+# Copyright (c) 2015 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Author: Joe Gross
+#
+
+from m5.defines import buildEnv
+from m5.params import *
+from m5.proxy import *
+
+from MemObject import MemObject
+
+class LdsState(MemObject):
+ type = 'LdsState'
+ cxx_class = 'LdsState'
+ cxx_header = 'gpu-compute/lds_state.hh'
+ size = Param.Int(65536, 'the size of the LDS')
+ range = Param.AddrRange('64kB', "address space of the LDS")
+ bankConflictPenalty = Param.Int(1, 'penalty per LDS bank conflict when '\
+ 'accessing data')
+ banks = Param.Int(32, 'Number of LDS banks')
+ cuPort = SlavePort("port that goes to the compute unit")
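
LdsState exposes a single cuPort slave port. Continuing the sketch above, the intended hookup (an assumption based on the ldsBus Bridge and ldsPort declared in ComputeUnit, not something this patch spells out) would route the CU's LDS traffic through that bridge:

    # hedged sketch of the CU <-> LDS wiring
    lds = LdsState(size=65536, banks=32)
    cu.localDataStore = lds
    cu.ldsPort = cu.ldsBus.slave    # CU master port into the bridge
    cu.ldsBus.master = lds.cuPort   # bridge master side into the LDS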
diff --git a/src/gpu-compute/SConscript b/src/gpu-compute/SConscript
new file mode 100644
index 000000000..2de96df24
--- /dev/null
+++ b/src/gpu-compute/SConscript
@@ -0,0 +1,99 @@
+# -*- mode:python -*-
+
+#
+# Copyright (c) 2015 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Author: Anthony Gutierrez
+#
+
+Import('*')
+
+if not env['BUILD_GPU']:
+ Return()
+
+SimObject('GPU.py')
+SimObject('LdsState.py')
+SimObject('X86GPUTLB.py')
+
+if env['TARGET_GPU_ISA'] == 'hsail':
+ Source('brig_object.cc')
+ Source('hsail_code.cc')
+
+Source('cl_driver.cc')
+Source('compute_unit.cc')
+Source('condition_register_state.cc')
+Source('dispatcher.cc')
+Source('exec_stage.cc')
+Source('fetch_stage.cc')
+Source('fetch_unit.cc')
+Source('global_memory_pipeline.cc')
+Source('gpu_dyn_inst.cc')
+Source('gpu_exec_context.cc')
+Source('gpu_static_inst.cc')
+Source('gpu_tlb.cc')
+Source('hsa_object.cc')
+Source('kernel_cfg.cc')
+Source('lds_state.cc')
+Source('local_memory_pipeline.cc')
+Source('of_scheduling_policy.cc')
+Source('pool_manager.cc')
+Source('rr_scheduling_policy.cc')
+Source('schedule_stage.cc')
+Source('scheduler.cc')
+Source('scoreboard_check_stage.cc')
+Source('shader.cc')
+Source('simple_pool_manager.cc')
+Source('tlb_coalescer.cc')
+Source('vector_register_file.cc')
+Source('vector_register_state.cc')
+Source('wavefront.cc')
+
+DebugFlag('BRIG')
+DebugFlag('GPUCoalescer')
+DebugFlag('GPUDisp')
+DebugFlag('GPUExec')
+DebugFlag('GPUFetch')
+DebugFlag('GPUHsailCFInfo')
+DebugFlag('GPUMem')
+DebugFlag('GPUPort')
+DebugFlag('GPUPrefetch')
+DebugFlag('GPUReg')
+DebugFlag('GPUSync')
+DebugFlag('GPUTLB')
+DebugFlag('HSALoader')
+DebugFlag('HSAIL')
+DebugFlag('HSAILObject')
+DebugFlag('Predictor')
+DebugFlag('WavefrontStack')
+
+CompoundFlag('GPUALL', ['GPUCoalescer', 'GPUDisp', 'GPUExec', 'GPUFetch',
+ 'GPUMem', 'GPUPort', 'GPUSync', 'GPUTLB', 'HSAIL'])
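
The guard at the top of the SConscript means these sources are compiled only into builds whose build options set BUILD_GPU, with the BRIG/HSAIL sources further gated on TARGET_GPU_ISA being 'hsail'. The DebugFlag and CompoundFlag declarations register trace flags with gem5's tracing framework, so at run time they can be enabled with gem5's standard --debug-flags switch, for example --debug-flags=GPUDisp,GPUExec or simply --debug-flags=GPUALL (the switch itself is existing gem5 machinery, not something added by this patch).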
diff --git a/src/gpu-compute/X86GPUTLB.py b/src/gpu-compute/X86GPUTLB.py
new file mode 100644
index 000000000..51f8e514e
--- /dev/null
+++ b/src/gpu-compute/X86GPUTLB.py
@@ -0,0 +1,77 @@
+#
+# Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Author: Lisa Hsu
+#
+
+from m5.defines import buildEnv
+from m5.params import *
+from m5.proxy import *
+
+from MemObject import MemObject
+
+if buildEnv['FULL_SYSTEM']:
+ class X86PagetableWalker(MemObject):
+ type = 'X86PagetableWalker'
+ cxx_class = 'X86ISA::Walker'
+ port = SlavePort("Port for the hardware table walker")
+ system = Param.System(Parent.any, "system object")
+
+class X86GPUTLB(MemObject):
+ type = 'X86GPUTLB'
+ cxx_class = 'X86ISA::GpuTLB'
+ cxx_header = 'gpu-compute/gpu_tlb.hh'
+ size = Param.Int(64, "TLB size (number of entries)")
+ assoc = Param.Int(64, "TLB associativity")
+
+ if buildEnv['FULL_SYSTEM']:
+ walker = Param.X86PagetableWalker(X86PagetableWalker(),
+ "page table walker")
+
+ hitLatency = Param.Int(2, "Latency of a TLB hit")
+ missLatency1 = Param.Int(5, "Latency #1 of a TLB miss")
+ missLatency2 = Param.Int(100, "Latency #2 of a TLB miss")
+ maxOutstandingReqs = Param.Int(64, "maximum # of outstanding requests")
+ slave = VectorSlavePort("Port on side closer to CPU/CU")
+ master = VectorMasterPort("Port on side closer to memory")
+ allocationPolicy = Param.Bool(True, "Allocate on an access")
+ accessDistance = Param.Bool(False, "print accessDistance stats")
+
+class TLBCoalescer(MemObject):
+ type = 'TLBCoalescer'
+ cxx_class = 'TLBCoalescer'
+ cxx_header = 'gpu-compute/tlb_coalescer.hh'
+ probesPerCycle = Param.Int(2, "Number of TLB probes per cycle")
+ coalescingWindow = Param.Int(1, "Permit coalescing across this many ticks")
+ slave = VectorSlavePort("Port on side closer to CPU/CU")
+ master = VectorMasterPort("Port on side closer to memory")
+ disableCoalescing = Param.Bool(False, "Disable coalescing")
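
These two SimObjects are meant to be composed into a GPU TLB hierarchy by the configuration scripts. A minimal, hypothetical sketch follows (single level, default latencies; the port bindings are described in comments rather than written out, since the exact binding idiom is not shown in this patch):

    # hedged sketch: one coalescer in front of one L1 GPU TLB
    l1_tlb = X86GPUTLB(size=64, assoc=64, hitLatency=2,
                       missLatency1=5, missLatency2=100)
    coalescer = TLBCoalescer(probesPerCycle=2, coalescingWindow=1)
    # a config would then bind the CU's translation_port entries to
    # coalescer.slave, and the coalescer's master ports to l1_tlb.slave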
diff --git a/src/gpu-compute/brig_object.cc b/src/gpu-compute/brig_object.cc
new file mode 100644
index 000000000..7cc9b7cc4
--- /dev/null
+++ b/src/gpu-compute/brig_object.cc
@@ -0,0 +1,474 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt, Anthony Gutierrez
+ */
+
+#include "gpu-compute/brig_object.hh"
+
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdlib>
+
+#include "arch/hsail/Brig.h"
+#include "base/misc.hh"
+#include "base/trace.hh"
+#include "debug/BRIG.hh"
+#include "debug/HSAILObject.hh"
+#include "debug/HSALoader.hh"
+
+using namespace Brig;
+
+std::vector<std::function<HsaObject*(const std::string&, int, uint8_t*)>>
+ HsaObject::tryFileFuncs = { BrigObject::tryFile };
+
+extern int getBrigDataTypeBytes(BrigType16_t t);
+
+const char *BrigObject::sectionNames[] =
+{
+ "hsa_data",
+ "hsa_code",
+ "hsa_operand",
+ ".shstrtab"
+};
+
+const char *segmentNames[] =
+{
+ "none",
+ "flat",
+ "global",
+ "readonly",
+ "kernarg",
+ "group",
+ "private",
+ "spill",
+ "args"
+};
+
+const uint8_t*
+BrigObject::getSectionOffset(enum SectionIndex sec, int offs) const
+{
+ // allow offs == size for dummy end pointers
+ assert(offs <= sectionInfo[sec].size);
+
+ return sectionInfo[sec].ptr + offs;
+}
+
+const char*
+BrigObject::getString(int offs) const
+{
+ return (const char*)(getSectionOffset(DataSectionIndex, offs) + 4);
+}
+
+const BrigBase*
+BrigObject::getCodeSectionEntry(int offs) const
+{
+ return (const BrigBase*)getSectionOffset(CodeSectionIndex, offs);
+}
+
+const BrigData*
+BrigObject::getBrigBaseData(int offs) const
+{
+ return (Brig::BrigData*)(getSectionOffset(DataSectionIndex, offs));
+}
+
+const uint8_t*
+BrigObject::getData(int offs) const
+{
+ return getSectionOffset(DataSectionIndex, offs);
+}
+
+const BrigOperand*
+BrigObject::getOperand(int offs) const
+{
+ return (const BrigOperand*)getSectionOffset(OperandsSectionIndex, offs);
+}
+
+unsigned
+BrigObject::getOperandPtr(int offs, int index) const
+{
+ unsigned *op_offs = (unsigned*)(getData(offs + 4 * (index + 1)));
+
+ return *op_offs;
+}
+
+const BrigInstBase*
+BrigObject::getInst(int offs) const
+{
+ return (const BrigInstBase*)getSectionOffset(CodeSectionIndex, offs);
+}
+
+HsaCode*
+BrigObject::getKernel(const std::string &name) const
+{
+ return nullptr;
+}
+
+HsaCode*
+BrigObject::getFunction(const std::string &name) const
+{
+ for (int i = 0; i < functions.size(); ++i) {
+ if (functions[i]->name() == name) {
+ return functions[i];
+ }
+ }
+
+ return nullptr;
+}
+
+void
+BrigObject::processDirectives(const BrigBase *dirPtr, const BrigBase *endPtr,
+ StorageMap *storageMap)
+{
+ while (dirPtr < endPtr) {
+ if (!dirPtr->byteCount) {
+ fatal("Bad directive size 0\n");
+ }
+
+ // calculate next pointer now so we can override it if needed
+ const BrigBase *nextDirPtr = brigNext(dirPtr);
+
+ DPRINTF(HSAILObject, "Code section entry kind: #%x, byte count: %d\n",
+ dirPtr->kind, dirPtr->byteCount);
+
+ switch (dirPtr->kind) {
+ case BRIG_KIND_DIRECTIVE_FUNCTION:
+ {
+ const BrigDirectiveExecutable *p M5_VAR_USED =
+ reinterpret_cast<const BrigDirectiveExecutable*>(dirPtr);
+
+ DPRINTF(HSAILObject,"DIRECTIVE_FUNCTION: %s offset: "
+ "%d next: %d\n", getString(p->name),
+ p->firstCodeBlockEntry, p->nextModuleEntry);
+
+ if (p->firstCodeBlockEntry != p->nextModuleEntry) {
+ panic("Function calls are not fully supported yet!!: %s\n",
+ getString(p->name));
+
+ const char *name = getString(p->name);
+
+ HsailCode *code_obj = nullptr;
+
+ for (int i = 0; i < functions.size(); ++i) {
+ if (functions[i]->name() == name) {
+ code_obj = functions[i];
+ break;
+ }
+ }
+
+ if (!code_obj) {
+ // create new local storage map for kernel-local symbols
+ code_obj = new HsailCode(name, p, this,
+ new StorageMap(storageMap));
+ functions.push_back(code_obj);
+ } else {
+ panic("Multiple definition of Function!!: %s\n",
+ getString(p->name));
+ }
+
+ }
+ nextDirPtr = getCodeSectionEntry(p->nextModuleEntry);
+ }
+ break;
+
+ case BRIG_KIND_DIRECTIVE_KERNEL:
+ {
+ const BrigDirectiveExecutable *p =
+ reinterpret_cast<const BrigDirectiveExecutable*>(dirPtr);
+
+ DPRINTF(HSAILObject,"DIRECTIVE_KERNEL: %s offset: %d "
+ "next: %d\n", getString(p->name),
+ p->firstCodeBlockEntry, p->nextModuleEntry);
+
+ const char *name = getString(p->name);
+
+ if (name[0] == '&')
+ name++;
+
+ std::string str = name;
+ char *temp;
+ int len = str.length();
+
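+ // copy the kernel name, dropping the trailing character unless it
+ // is a lowercase letter; the result is the name the simulator uses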
+ if (str[len - 1] >= 'a' && str[len - 1] <= 'z') {
+ temp = new char[str.size() + 1];
+ std::copy(str.begin(), str.end() , temp);
+ temp[str.size()] = '\0';
+ } else {
+ temp = new char[str.size()];
+ std::copy(str.begin(), str.end() - 1 , temp);
+ temp[str.size() - 1 ] = '\0';
+ }
+
+ std::string kernel_name = temp;
+ delete[] temp;
+
+ HsailCode *code_obj = nullptr;
+
+ for (const auto &kernel : kernels) {
+ if (kernel->name() == kernel_name) {
+ code_obj = kernel;
+ break;
+ }
+ }
+
+ if (!code_obj) {
+ // create new local storage map for kernel-local symbols
+ code_obj = new HsailCode(kernel_name, p, this,
+ new StorageMap(storageMap));
+
+ kernels.push_back(code_obj);
+ }
+
+ nextDirPtr = getCodeSectionEntry(p->nextModuleEntry);
+ }
+ break;
+
+ case BRIG_KIND_DIRECTIVE_VARIABLE:
+ {
+ const BrigDirectiveVariable *p =
+ reinterpret_cast<const BrigDirectiveVariable*>(dirPtr);
+
+ uint64_t readonlySize_old =
+ storageMap->getSize(BRIG_SEGMENT_READONLY);
+
+ StorageElement* se = storageMap->addSymbol(p, this);
+
+ DPRINTF(HSAILObject, "DIRECTIVE_VARIABLE, symbol %s\n",
+ getString(p->name));
+
+ if (p->segment == BRIG_SEGMENT_READONLY) {
+ // readonly memory has initialization data
+ uint8_t* readonlyData_old = readonlyData;
+
+ readonlyData =
+ new uint8_t[storageMap->getSize(BRIG_SEGMENT_READONLY)];
+
+ if (p->init) {
+ if ((p->type == BRIG_TYPE_ROIMG) ||
+ (p->type == BRIG_TYPE_WOIMG) ||
+ (p->type == BRIG_TYPE_SAMP) ||
+ (p->type == BRIG_TYPE_SIG32) ||
+ (p->type == BRIG_TYPE_SIG64)) {
+ panic("Read only data type not supported: %s\n",
+ getString(p->name));
+ }
+
+ const BrigOperand *brigOp = getOperand(p->init);
+ assert(brigOp->kind ==
+ BRIG_KIND_OPERAND_CONSTANT_BYTES);
+
+ const Brig::BrigData *operand_data M5_VAR_USED =
+ getBrigBaseData(((BrigOperandConstantBytes*)
+ brigOp)->bytes);
+
+ assert((operand_data->byteCount / 4) > 0);
+
+ uint8_t *symbol_data =
+ (uint8_t*)getData(((BrigOperandConstantBytes*)
+ brigOp)->bytes + 4);
+
+ // copy the old data and add the new data
+ if (readonlySize_old > 0) {
+ memcpy(readonlyData, readonlyData_old,
+ readonlySize_old);
+ }
+
+ memcpy(readonlyData + se->offset, symbol_data,
+ se->size);
+
+ delete[] readonlyData_old;
+ }
+ }
+ }
+ break;
+
+ case BRIG_KIND_DIRECTIVE_LABEL:
+ {
+ const BrigDirectiveLabel M5_VAR_USED *p =
+ reinterpret_cast<const BrigDirectiveLabel*>(dirPtr);
+
+ panic("Label directives cannot be at the module level: %s\n",
+ getString(p->name));
+
+ }
+ break;
+
+ case BRIG_KIND_DIRECTIVE_COMMENT:
+ {
+ const BrigDirectiveComment M5_VAR_USED *p =
+ reinterpret_cast<const BrigDirectiveComment*>(dirPtr);
+
+ DPRINTF(HSAILObject, "DIRECTIVE_COMMENT: %s\n",
+ getString(p->name));
+ }
+ break;
+
+ case BRIG_KIND_DIRECTIVE_LOC:
+ {
+ DPRINTF(HSAILObject, "BRIG_DIRECTIVE_LOC\n");
+ }
+ break;
+
+ case BRIG_KIND_DIRECTIVE_MODULE:
+ {
+ const BrigDirectiveModule M5_VAR_USED *p =
+ reinterpret_cast<const BrigDirectiveModule*>(dirPtr);
+
+ DPRINTF(HSAILObject, "BRIG_DIRECTIVE_MODULE: %s\n",
+ getString(p->name));
+ }
+ break;
+
+ case BRIG_KIND_DIRECTIVE_CONTROL:
+ {
+ DPRINTF(HSAILObject, "DIRECTIVE_CONTROL\n");
+ }
+ break;
+
+ case BRIG_KIND_DIRECTIVE_PRAGMA:
+ {
+ DPRINTF(HSAILObject, "DIRECTIVE_PRAGMA\n");
+ }
+ break;
+
+ case BRIG_KIND_DIRECTIVE_EXTENSION:
+ {
+ DPRINTF(HSAILObject, "DIRECTIVE_EXTENSION\n");
+ }
+ break;
+
+ case BRIG_KIND_DIRECTIVE_ARG_BLOCK_START:
+ {
+ DPRINTF(HSAILObject, "DIRECTIVE_ARG_BLOCK_START\n");
+ }
+ break;
+
+ case BRIG_KIND_DIRECTIVE_ARG_BLOCK_END:
+ {
+ DPRINTF(HSAILObject, "DIRECTIVE_ARG_BLOCK_END\n");
+ }
+ break;
+ default:
+ if (dirPtr->kind >= BRIG_KIND_INST_BEGIN &&
+ dirPtr->kind <= BRIG_KIND_INST_END)
+ break;
+
+ if (dirPtr->kind >= BRIG_KIND_OPERAND_BEGIN &&
+ dirPtr->kind <= BRIG_KIND_OPERAND_END)
+ break;
+
+ warn("Unknown Brig directive kind: %d\n", dirPtr->kind);
+ break;
+ }
+
+ dirPtr = nextDirPtr;
+ }
+}
+
+HsaObject*
+BrigObject::tryFile(const std::string &fname, int len, uint8_t *fileData)
+{
+ const char *brig_ident = "HSA BRIG";
+
+ if (memcmp(brig_ident, fileData, MODULE_IDENTIFICATION_LENGTH))
+ return nullptr;
+
+ return new BrigObject(fname, len, fileData);
+}
+
+BrigObject::BrigObject(const std::string &fname, int len, uint8_t *fileData)
+ : HsaObject(fname), storageMap(new StorageMap())
+{
+ const char *brig_ident = "HSA BRIG";
+ BrigModuleHeader *mod_hdr = (BrigModuleHeader*)fileData;
+
+ fatal_if(memcmp(brig_ident, mod_hdr, MODULE_IDENTIFICATION_LENGTH),
+ "%s is not a BRIG file\n", fname);
+
+ if (mod_hdr->brigMajor != BRIG_VERSION_BRIG_MAJOR ||
+ mod_hdr->brigMinor != BRIG_VERSION_BRIG_MINOR) {
+ fatal("%s: BRIG version mismatch, %d.%d != %d.%d\n",
+ fname, mod_hdr->brigMajor, mod_hdr->brigMinor,
+ BRIG_VERSION_BRIG_MAJOR, BRIG_VERSION_BRIG_MINOR);
+ }
+
+ fatal_if(mod_hdr->sectionCount != NumSectionIndices, "%s: BRIG section "
+ "count (%d) != expected value (%d)\n", fname,
+ mod_hdr->sectionCount, NumSectionIndices);
+
+ for (int i = 0; i < NumSectionIndices; ++i) {
+ sectionInfo[i].ptr = nullptr;
+ }
+
+ uint64_t *sec_idx_table = (uint64_t*)(fileData + mod_hdr->sectionIndex);
+ for (int sec_idx = 0; sec_idx < mod_hdr->sectionCount; ++sec_idx) {
+ uint8_t *sec_hdr_byte_ptr = fileData + sec_idx_table[sec_idx];
+ BrigSectionHeader *sec_hdr = (BrigSectionHeader*)sec_hdr_byte_ptr;
+
+ // It doesn't look like cprintf supports string precision values,
+ // but if this breaks, the right answer is to fix that
+ DPRINTF(HSAILObject, "found section %.*s\n", sec_hdr->nameLength,
+ sec_hdr->name);
+
+ sectionInfo[sec_idx].ptr = new uint8_t[sec_hdr->byteCount];
+ memcpy(sectionInfo[sec_idx].ptr, sec_hdr_byte_ptr, sec_hdr->byteCount);
+ sectionInfo[sec_idx].size = sec_hdr->byteCount;
+ }
+
+ BrigSectionHeader *code_hdr =
+ (BrigSectionHeader*)sectionInfo[CodeSectionIndex].ptr;
+
+ DPRINTF(HSAILObject, "Code section hdr, count: %d, hdr count: %d, "
+ "name len: %d\n", code_hdr->byteCount, code_hdr->headerByteCount,
+ code_hdr->nameLength);
+
+ // the first code section entry begins right after the section header
+ processDirectives(getCodeSectionEntry(code_hdr->headerByteCount),
+ getCodeSectionEntry(sectionInfo[CodeSectionIndex].size),
+ storageMap);
+
+ delete[] fileData;
+
+ DPRINTF(HSALoader, "BRIG object %s loaded.\n", fname);
+}
+
+BrigObject::~BrigObject()
+{
+ for (int i = 0; i < NumSectionIndices; ++i)
+ if (sectionInfo[i].ptr)
+ delete[] sectionInfo[i].ptr;
+}
diff --git a/src/gpu-compute/brig_object.hh b/src/gpu-compute/brig_object.hh
new file mode 100644
index 000000000..59a585914
--- /dev/null
+++ b/src/gpu-compute/brig_object.hh
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt, Anthony Gutierrez
+ */
+
+#ifndef __BRIG_OBJECT_HH__
+#define __BRIG_OBJECT_HH__
+
+#include <cassert>
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include "arch/hsail/Brig.h"
+#include "gpu-compute/hsa_object.hh"
+#include "gpu-compute/hsail_code.hh"
+
+class LabelMap;
+class StorageMap;
+
+/**
+ * @class BrigObject
+ *
+ * This class implements the BRIG loader object and is used when the
+ * simulator directly executes HSAIL. It is responsible for extracting
+ * all information about the kernels contained in a BRIG file and
+ * converting them into HsailCode objects that are usable by the
+ * simulator and emulated runtime.
+ */
+
+class BrigObject final : public HsaObject
+{
+ public:
+ enum SectionIndex
+ {
+ DataSectionIndex,
+ CodeSectionIndex,
+ OperandsSectionIndex,
+ NumSectionIndices
+ };
+
+ static const char *sectionNames[];
+
+ struct SectionInfo
+ {
+ uint8_t *ptr;
+ int size;
+ };
+
+ static HsaObject* tryFile(const std::string &fname, int len,
+ uint8_t *fileData);
+
+ SectionInfo sectionInfo[NumSectionIndices];
+ const uint8_t *getSectionOffset(enum SectionIndex sec, int offs) const;
+
+ std::vector<HsailCode*> kernels;
+ std::vector<HsailCode*> functions;
+ std::string kern_block_name;
+
+ void processDirectives(const Brig::BrigBase *dirPtr,
+ const Brig::BrigBase *endPtr,
+ StorageMap *storageMap);
+
+ BrigObject(const std::string &fname, int len, uint8_t *fileData);
+ ~BrigObject();
+
+ // eventually these will need to be per-kernel not per-object-file
+ StorageMap *storageMap;
+ LabelMap *labelMap;
+
+ const char* getString(int offs) const;
+ const Brig::BrigData* getBrigBaseData(int offs) const;
+ const uint8_t* getData(int offs) const;
+ const Brig::BrigBase* getCodeSectionEntry(int offs) const;
+ const Brig::BrigOperand* getOperand(int offs) const;
+ unsigned getOperandPtr(int offs, int index) const;
+ const Brig::BrigInstBase* getInst(int offs) const;
+
+ HsaCode* getKernel(const std::string &name) const override;
+ HsaCode* getFunction(const std::string &name) const override;
+
+ int numKernels() const override { return kernels.size(); }
+
+ HsaCode* getKernel(int i) const override { return kernels[i]; }
+
+ // pointer to the current kernel/function we're processing, so elements
+ // under construction can reference it. kinda ugly, but easier
+ // than passing it all over for the few places it's needed.
+ mutable HsailCode *currentCode;
+};
+
+// Utility function to bump Brig item pointer to next element given
+// item size in bytes. Really just an add but with lots of casting.
+template<typename T>
+T*
+brigNext(T *ptr)
+{
+ Brig::BrigBase *base_ptr = (Brig::BrigBase*)ptr;
+ int size = base_ptr->byteCount;
+ assert(size);
+
+ return (T*)((uint8_t*)ptr + size);
+}
+
+#endif // __BRIG_OBJECT_HH__
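
brigNext is the traversal idiom used by processDirectives in brig_object.cc: every BRIG entry records its own byteCount, so advancing by that count reaches the next entry, and a zero byteCount would stall the walk forever, which is why the loader treats it as a fatal error.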
diff --git a/src/gpu-compute/cl_driver.cc b/src/gpu-compute/cl_driver.cc
new file mode 100644
index 000000000..3b3291c03
--- /dev/null
+++ b/src/gpu-compute/cl_driver.cc
@@ -0,0 +1,272 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#include "gpu-compute/cl_driver.hh"
+
+#include "base/intmath.hh"
+#include "cpu/thread_context.hh"
+#include "gpu-compute/dispatcher.hh"
+#include "gpu-compute/hsa_code.hh"
+#include "gpu-compute/hsa_kernel_info.hh"
+#include "gpu-compute/hsa_object.hh"
+#include "params/ClDriver.hh"
+#include "sim/process.hh"
+#include "sim/syscall_emul_buf.hh"
+
+ClDriver::ClDriver(ClDriverParams *p)
+ : EmulatedDriver(p), hsaCode(0)
+{
+ for (const auto &codeFile : p->codefile)
+ codeFiles.push_back(&codeFile);
+
+ maxFuncArgsSize = 0;
+
+ for (int i = 0; i < codeFiles.size(); ++i) {
+ HsaObject *obj = HsaObject::createHsaObject(*codeFiles[i]);
+
+ for (int k = 0; k < obj->numKernels(); ++k) {
+ assert(obj->getKernel(k));
+ kernels.push_back(obj->getKernel(k));
+ kernels.back()->setReadonlyData((uint8_t*)obj->readonlyData);
+ int kern_funcargs_size = kernels.back()->funcarg_size;
+ maxFuncArgsSize = maxFuncArgsSize < kern_funcargs_size ?
+ kern_funcargs_size : maxFuncArgsSize;
+ }
+ }
+
+ int name_offs = 0;
+ int code_offs = 0;
+
+ for (int i = 0; i < kernels.size(); ++i) {
+ kernelInfo.push_back(HsaKernelInfo());
+ HsaCode *k = kernels[i];
+
+ k->generateHsaKernelInfo(&kernelInfo[i]);
+
+ kernelInfo[i].name_offs = name_offs;
+ kernelInfo[i].code_offs = code_offs;
+
+ name_offs += k->name().size() + 1;
+ code_offs += k->numInsts() * sizeof(GPUStaticInst*);
+ }
+}
+
+void
+ClDriver::handshake(GpuDispatcher *_dispatcher)
+{
+ dispatcher = _dispatcher;
+ dispatcher->setFuncargsSize(maxFuncArgsSize);
+}
+
+int
+ClDriver::open(LiveProcess *p, ThreadContext *tc, int mode, int flags)
+{
+ int fd = p->allocFD(-1, filename, 0, 0, false);
+ FDEntry *fde = p->getFDEntry(fd);
+ fde->driver = this;
+
+ return fd;
+}
+
+int
+ClDriver::ioctl(LiveProcess *process, ThreadContext *tc, unsigned req)
+{
+ int index = 2;
+ Addr buf_addr = process->getSyscallArg(tc, index);
+
+ switch (req) {
+ case HSA_GET_SIZES:
+ {
+ TypedBufferArg<HsaDriverSizes> sizes(buf_addr);
+ sizes->num_kernels = kernels.size();
+ sizes->string_table_size = 0;
+ sizes->code_size = 0;
+ sizes->readonly_size = 0;
+
+ if (kernels.size() > 0) {
+ // all kernels will share the same read-only memory
+ sizes->readonly_size =
+ kernels[0]->getSize(HsaCode::MemorySegment::READONLY);
+ // check our assumption
+ for (int i = 1; i<kernels.size(); ++i) {
+ assert(sizes->readonly_size ==
+ kernels[i]->getSize(HsaCode::MemorySegment::READONLY));
+ }
+ }
+
+ for (int i = 0; i < kernels.size(); ++i) {
+ HsaCode *k = kernels[i];
+ // add one for terminating '\0'
+ sizes->string_table_size += k->name().size() + 1;
+ sizes->code_size += k->numInsts() * sizeof(GPUStaticInst*);
+ }
+
+ sizes.copyOut(tc->getMemProxy());
+ }
+ break;
+
+ case HSA_GET_KINFO:
+ {
+ TypedBufferArg<HsaKernelInfo>
+ kinfo(buf_addr, sizeof(HsaKernelInfo) * kernels.size());
+
+ for (int i = 0; i < kernels.size(); ++i) {
+ HsaKernelInfo *ki = &kinfo[i];
+ ki->name_offs = kernelInfo[i].name_offs;
+ ki->code_offs = kernelInfo[i].code_offs;
+ ki->sRegCount = kernelInfo[i].sRegCount;
+ ki->dRegCount = kernelInfo[i].dRegCount;
+ ki->cRegCount = kernelInfo[i].cRegCount;
+ ki->static_lds_size = kernelInfo[i].static_lds_size;
+ ki->private_mem_size = kernelInfo[i].private_mem_size;
+ ki->spill_mem_size = kernelInfo[i].spill_mem_size;
+ }
+
+ kinfo.copyOut(tc->getMemProxy());
+ }
+ break;
+
+ case HSA_GET_STRINGS:
+ {
+ int string_table_size = 0;
+ for (int i = 0; i < kernels.size(); ++i) {
+ HsaCode *k = kernels[i];
+ string_table_size += k->name().size() + 1;
+ }
+
+ BufferArg buf(buf_addr, string_table_size);
+ char *bufp = (char*)buf.bufferPtr();
+
+ for (int i = 0; i < kernels.size(); ++i) {
+ HsaCode *k = kernels[i];
+ const char *n = k->name().c_str();
+
+ // idiomatic string copy
+ while ((*bufp++ = *n++));
+ }
+
+ assert(bufp - (char *)buf.bufferPtr() == string_table_size);
+
+ buf.copyOut(tc->getMemProxy());
+ }
+ break;
+
+ case HSA_GET_READONLY_DATA:
+ {
+ // we can pick any kernel --- they share the same
+ // readonly segment (this assumption is checked in GET_SIZES)
+ uint64_t size =
+ kernels.back()->getSize(HsaCode::MemorySegment::READONLY);
+ BufferArg data(buf_addr, size);
+ char *datap = (char *)data.bufferPtr();
+ memcpy(datap,
+ kernels.back()->readonly_data,
+ size);
+ data.copyOut(tc->getMemProxy());
+ }
+ break;
+
+ case HSA_GET_CODE:
+ {
+ // set hsaCode pointer
+ hsaCode = buf_addr;
+ int code_size = 0;
+
+ for (int i = 0; i < kernels.size(); ++i) {
+ HsaCode *k = kernels[i];
+ code_size += k->numInsts() * sizeof(TheGpuISA::RawMachInst);
+ }
+
+ TypedBufferArg<TheGpuISA::RawMachInst> buf(buf_addr, code_size);
+ TheGpuISA::RawMachInst *bufp = buf;
+
+ int buf_idx = 0;
+
+ for (int i = 0; i < kernels.size(); ++i) {
+ HsaCode *k = kernels[i];
+
+ for (int j = 0; j < k->numInsts(); ++j) {
+ bufp[buf_idx] = k->insts()->at(j);
+ ++buf_idx;
+ }
+ }
+
+ buf.copyOut(tc->getMemProxy());
+ }
+ break;
+
+ case HSA_GET_CU_CNT:
+ {
+ BufferArg buf(buf_addr, sizeof(uint32_t));
+ *((uint32_t*)buf.bufferPtr()) = dispatcher->getNumCUs();
+ buf.copyOut(tc->getMemProxy());
+ }
+ break;
+
+ case HSA_GET_VSZ:
+ {
+ BufferArg buf(buf_addr, sizeof(uint32_t));
+ *((uint32_t*)buf.bufferPtr()) = VSZ;
+ buf.copyOut(tc->getMemProxy());
+ }
+ break;
+
+ default:
+ fatal("ClDriver: bad ioctl %d\n", req);
+ }
+
+ return 0;
+}
+
+const char*
+ClDriver::codeOffToKernelName(uint64_t code_ptr)
+{
+ assert(hsaCode);
+ uint32_t code_offs = code_ptr - hsaCode;
+
+ for (int i = 0; i < kernels.size(); ++i) {
+ if (code_offs == kernelInfo[i].code_offs) {
+ return kernels[i]->name().c_str();
+ }
+ }
+
+ return nullptr;
+}
+
+ClDriver*
+ClDriverParams::create()
+{
+ return new ClDriver(this);
+}
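
Taken together, the ioctl cases define the handshake between the emulated CL runtime and the simulator: the runtime presumably issues HSA_GET_SIZES first to size its buffers, then HSA_GET_KINFO, HSA_GET_STRINGS, HSA_GET_CODE, and HSA_GET_READONLY_DATA to pull the kernel metadata, names, instructions, and read-only segment into host memory. HSA_GET_CODE also records the host virtual address of the code buffer in hsaCode, which is what lets codeOffToKernelName map a dispatched code pointer back to a kernel name.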
diff --git a/src/gpu-compute/cl_driver.hh b/src/gpu-compute/cl_driver.hh
new file mode 100644
index 000000000..03567bab5
--- /dev/null
+++ b/src/gpu-compute/cl_driver.hh
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#ifndef __CL_DRIVER_HH__
+#define __CL_DRIVER_HH__
+
+#include <vector>
+
+#include "gpu-compute/hsa_kernel_info.hh"
+#include "sim/emul_driver.hh"
+
+class GpuDispatcher;
+class HsaCode;
+class LiveProcess;
+class ThreadContext;
+
+struct ClDriverParams;
+
+class ClDriver final : public EmulatedDriver
+{
+ public:
+ ClDriver(ClDriverParams *p);
+ void handshake(GpuDispatcher *_dispatcher);
+ int open(LiveProcess *p, ThreadContext *tc, int mode, int flags);
+ int ioctl(LiveProcess *p, ThreadContext *tc, unsigned req);
+ const char* codeOffToKernelName(uint64_t code_ptr);
+
+ private:
+ GpuDispatcher *dispatcher;
+
+ std::vector<const std::string*> codeFiles;
+
+ // All the kernels we know about
+ std::vector<HsaCode*> kernels;
+ std::vector<HsaCode*> functions;
+
+ std::vector<HsaKernelInfo> kernelInfo;
+
+ // maximum size necessary for function arguments
+ int maxFuncArgsSize;
+ // The host virtual address for the kernel code
+ uint64_t hsaCode;
+};
+
+#endif // __CL_DRIVER_HH__
diff --git a/src/gpu-compute/cl_event.hh b/src/gpu-compute/cl_event.hh
new file mode 100644
index 000000000..75297a2d2
--- /dev/null
+++ b/src/gpu-compute/cl_event.hh
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Marc Orr
+ */
+
+#ifndef __GPU_CL_EVENT_HH__
+#define __GPU_CL_EVENT_HH__
+
+struct HsaQueueEntry;
+
+class _cl_event {
+ public:
+ _cl_event() : done(false), hsaTaskPtr(nullptr), start(0), end(0) { }
+
+ volatile bool done;
+ HsaQueueEntry *hsaTaskPtr;
+ uint64_t start;
+ uint64_t end;
+};
+
+#endif // __GPU_CL_EVENT_HH__
diff --git a/src/gpu-compute/code_enums.hh b/src/gpu-compute/code_enums.hh
new file mode 100644
index 000000000..126cf6c50
--- /dev/null
+++ b/src/gpu-compute/code_enums.hh
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#ifndef __CODE_ENUMS_HH__
+#define __CODE_ENUMS_HH__
+
+#define IS_OT_GLOBAL(a) ((a)>=Enums::OT_GLOBAL_READ \
+ && (a)<=Enums::OT_GLOBAL_LDAS)
+#define IS_OT_SHARED(a) ((a)>=Enums::OT_SHARED_READ \
+ && (a)<=Enums::OT_SHARED_LDAS)
+#define IS_OT_PRIVATE(a) ((a)>=Enums::OT_PRIVATE_READ \
+ && (a)<=Enums::OT_PRIVATE_LDAS)
+#define IS_OT_SPILL(a) ((a)>=Enums::OT_SPILL_READ \
+ && (a)<=Enums::OT_SPILL_LDAS)
+#define IS_OT_READONLY(a) ((a)>=Enums::OT_READONLY_READ \
+ && (a)<=Enums::OT_READONLY_LDAS)
+#define IS_OT_FLAT(a) ((a)>=Enums::OT_FLAT_READ && (a)<=Enums::OT_FLAT_LDAS)
+
+#define IS_OT_LDAS(a) ((a)==Enums::OT_GLOBAL_LDAS||(a)==Enums::OT_SHARED_LDAS \
+ ||(a)==Enums::OT_PRIVATE_LDAS||(a)==Enums::OT_SPILL_LDAS \
+ ||(a)==Enums::OT_READONLY_LDAS||(a)==Enums::OT_FLAT_LDAS)
+
+#define IS_OT_READ(a) ((a)==Enums::OT_GLOBAL_READ||(a)==Enums::OT_SHARED_READ \
+ ||(a)==Enums::OT_PRIVATE_READ||(a)==Enums::OT_SPILL_READ \
+ ||(a)==Enums::OT_READONLY_READ||(a)==Enums::OT_FLAT_READ)
+
+#define IS_OT_READ_GM(a) \
+ ((a)==Enums::OT_GLOBAL_READ||(a)==Enums::OT_SPILL_READ \
+ ||(a)==Enums::OT_READONLY_READ)
+
+#define IS_OT_READ_LM(a) ((a)==Enums::OT_SHARED_READ)
+
+#define IS_OT_READ_RM(a) ((a)==Enums::OT_READONLY_READ)
+
+#define IS_OT_READ_PM(a) ((a)==Enums::OT_PRIVATE_READ)
+
+#define IS_OT_WRITE(a) \
+ ((a)==Enums::OT_GLOBAL_WRITE||(a)==Enums::OT_SHARED_WRITE \
+ ||(a)==Enums::OT_PRIVATE_WRITE||(a)==Enums::OT_SPILL_WRITE \
+ ||(a)==Enums::OT_READONLY_WRITE||(a)==Enums::OT_FLAT_WRITE)
+
+#define IS_OT_WRITE_GM(a) \
+ ((a)==Enums::OT_GLOBAL_WRITE||(a)==Enums::OT_SPILL_WRITE \
+ ||(a)==Enums::OT_READONLY_WRITE)
+
+#define IS_OT_WRITE_LM(a) ((a)==Enums::OT_SHARED_WRITE)
+
+#define IS_OT_WRITE_PM(a) ((a)==Enums::OT_PRIVATE_WRITE)
+
+#define IS_OT_ATOMIC(a) ((a)==Enums::OT_GLOBAL_ATOMIC \
+ ||(a)==Enums::OT_SHARED_ATOMIC \
+ ||(a)==Enums::OT_PRIVATE_ATOMIC \
+ ||(a)==Enums::OT_SPILL_ATOMIC \
+ ||(a)==Enums::OT_READONLY_ATOMIC \
+ ||(a)==Enums::OT_FLAT_ATOMIC)
+
+#define IS_OT_ATOMIC_GM(a) ((a)==Enums::OT_GLOBAL_ATOMIC \
+ ||(a)==Enums::OT_SPILL_ATOMIC \
+ ||(a)==Enums::OT_READONLY_ATOMIC \
+ ||(a)==Enums::OT_GLOBAL_MEMFENCE \
+ ||(a)==Enums::OT_BOTH_MEMFENCE)
+
+#define IS_OT_ATOMIC_LM(a) ((a)==Enums::OT_SHARED_ATOMIC \
+ ||(a)==Enums::OT_SHARED_MEMFENCE \
+ ||(a)==Enums::OT_BOTH_MEMFENCE)
+
+#define IS_OT_ATOMIC_PM(a) ((a)==Enums::OT_PRIVATE_ATOMIC)
+
+#define IS_OT_HIST(a) ((a)==Enums::OT_GLOBAL_HIST \
+ ||(a)==Enums::OT_SHARED_HIST \
+ ||(a)==Enums::OT_PRIVATE_HIST \
+ ||(a)==Enums::OT_SPILL_HIST \
+ ||(a)==Enums::OT_READONLY_HIST \
+ ||(a)==Enums::OT_FLAT_HIST)
+
+#define IS_OT_HIST_GM(a) ((a)==Enums::OT_GLOBAL_HIST \
+ ||(a)==Enums::OT_SPILL_HIST \
+ ||(a)==Enums::OT_READONLY_HIST)
+
+#define IS_OT_HIST_LM(a) ((a)==Enums::OT_SHARED_HIST)
+
+#define IS_OT_HIST_PM(a) ((a)==Enums::OT_PRIVATE_HIST)
+
+#endif // __CODE_ENUMS_HH__
diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc
new file mode 100644
index 000000000..d3622007a
--- /dev/null
+++ b/src/gpu-compute/compute_unit.cc
@@ -0,0 +1,1817 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos, Anthony Gutierrez
+ */
+
+#include "gpu-compute/compute_unit.hh"
+
+#include "base/output.hh"
+#include "debug/GPUDisp.hh"
+#include "debug/GPUExec.hh"
+#include "debug/GPUFetch.hh"
+#include "debug/GPUMem.hh"
+#include "debug/GPUPort.hh"
+#include "debug/GPUPrefetch.hh"
+#include "debug/GPUSync.hh"
+#include "debug/GPUTLB.hh"
+#include "gpu-compute/dispatcher.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/gpu_static_inst.hh"
+#include "gpu-compute/ndrange.hh"
+#include "gpu-compute/shader.hh"
+#include "gpu-compute/simple_pool_manager.hh"
+#include "gpu-compute/vector_register_file.hh"
+#include "gpu-compute/wavefront.hh"
+#include "mem/page_table.hh"
+#include "sim/process.hh"
+
+ComputeUnit::ComputeUnit(const Params *p) : MemObject(p), fetchStage(p),
+ scoreboardCheckStage(p), scheduleStage(p), execStage(p),
+ globalMemoryPipe(p), localMemoryPipe(p), rrNextMemID(0), rrNextALUWp(0),
+ cu_id(p->cu_id), vrf(p->vector_register_file), numSIMDs(p->num_SIMDs),
+ spBypassPipeLength(p->spbypass_pipe_length),
+ dpBypassPipeLength(p->dpbypass_pipe_length),
+ issuePeriod(p->issue_period),
+ numGlbMemUnits(p->num_global_mem_pipes),
+ numLocMemUnits(p->num_shared_mem_pipes),
+ perLaneTLB(p->perLaneTLB), prefetchDepth(p->prefetch_depth),
+ prefetchStride(p->prefetch_stride), prefetchType(p->prefetch_prev_type),
+ xact_cas_mode(p->xactCasMode), debugSegFault(p->debugSegFault),
+ functionalTLB(p->functionalTLB), localMemBarrier(p->localMemBarrier),
+ countPages(p->countPages), barrier_id(0),
+ vrfToCoalescerBusWidth(p->vrf_to_coalescer_bus_width),
+ coalescerToVrfBusWidth(p->coalescer_to_vrf_bus_width),
+ req_tick_latency(p->mem_req_latency * p->clk_domain->clockPeriod()),
+ resp_tick_latency(p->mem_resp_latency * p->clk_domain->clockPeriod()),
+ _masterId(p->system->getMasterId(name() + ".ComputeUnit")),
+ lds(*p->localDataStore), globalSeqNum(0), wavefrontSize(p->wfSize)
+{
+ // this check will be eliminated once we have wavefront size support added
+ fatal_if(p->wfSize != VSZ, "Wavefront size parameter does not match VSZ");
+ // calculate how many cycles a vector load or store will need to transfer
+ // its data over the corresponding buses
+ numCyclesPerStoreTransfer = (uint32_t)ceil((double)(VSZ * sizeof(uint32_t))
+ / (double)vrfToCoalescerBusWidth);
+
+ numCyclesPerLoadTransfer = (VSZ * sizeof(uint32_t))
+ / coalescerToVrfBusWidth;
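+ // for example, with a 64-lane wavefront moving 4 bytes per lane
+ // (256 bytes) over a 32-byte bus, a store transfer takes
+ // ceil(256/32) = 8 cycles; the actual values depend on the configured
+ // wavefront size and bus widths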
+
+ lastVaddrWF.resize(numSIMDs);
+ wfList.resize(numSIMDs);
+
+ for (int j = 0; j < numSIMDs; ++j) {
+ lastVaddrWF[j].resize(p->n_wf);
+
+ for (int i = 0; i < p->n_wf; ++i) {
+ lastVaddrWF[j][i].resize(VSZ);
+
+ wfList[j].push_back(p->wavefronts[j * p->n_wf + i]);
+ wfList[j][i]->setParent(this);
+
+ for (int k = 0; k < VSZ; ++k) {
+ lastVaddrWF[j][i][k] = 0;
+ }
+ }
+ }
+
+ lastVaddrPhase.resize(numSIMDs);
+
+ for (int i = 0; i < numSIMDs; ++i) {
+ lastVaddrPhase[i] = LastVaddrWave();
+ }
+
+ lastVaddrCU = LastVaddrWave();
+
+ lds.setParent(this);
+
+ if (p->execPolicy == "OLDEST-FIRST") {
+ exec_policy = EXEC_POLICY::OLDEST;
+ } else if (p->execPolicy == "ROUND-ROBIN") {
+ exec_policy = EXEC_POLICY::RR;
+ } else {
+ fatal("Invalid WF execution policy (CU)\n");
+ }
+
+ memPort.resize(VSZ);
+
+ // resize the tlbPort vector
+ int tlbPort_width = perLaneTLB ? VSZ : 1;
+ tlbPort.resize(tlbPort_width);
+
+ cuExitCallback = new CUExitCallback(this);
+ registerExitCallback(cuExitCallback);
+
+ xactCasLoadMap.clear();
+ lastExecCycle.resize(numSIMDs, 0);
+
+ for (int i = 0; i < vrf.size(); ++i) {
+ vrf[i]->setParent(this);
+ }
+
+ numVecRegsPerSimd = vrf[0]->numRegs();
+}
+
+ComputeUnit::~ComputeUnit()
+{
+ // Delete wavefront slots
+
+ for (int j = 0; j < numSIMDs; ++j)
+ for (int i = 0; i < shader->n_wf; ++i) {
+ delete wfList[j][i];
+ }
+
+ readyList.clear();
+ waveStatusList.clear();
+ dispatchList.clear();
+ vectorAluInstAvail.clear();
+ delete cuExitCallback;
+ delete ldsPort;
+}
+
+void
+ComputeUnit::FillKernelState(Wavefront *w, NDRange *ndr)
+{
+ w->resizeRegFiles(ndr->q.cRegCount, ndr->q.sRegCount, ndr->q.dRegCount);
+
+ w->workgroupsz[0] = ndr->q.wgSize[0];
+ w->workgroupsz[1] = ndr->q.wgSize[1];
+ w->workgroupsz[2] = ndr->q.wgSize[2];
+ w->wg_sz = w->workgroupsz[0] * w->workgroupsz[1] * w->workgroupsz[2];
+ w->gridsz[0] = ndr->q.gdSize[0];
+ w->gridsz[1] = ndr->q.gdSize[1];
+ w->gridsz[2] = ndr->q.gdSize[2];
+ w->kernelArgs = ndr->q.args;
+ w->privSizePerItem = ndr->q.privMemPerItem;
+ w->spillSizePerItem = ndr->q.spillMemPerItem;
+ w->roBase = ndr->q.roMemStart;
+ w->roSize = ndr->q.roMemTotal;
+}
+
+void
+ComputeUnit::InitializeWFContext(WFContext *wfCtx, NDRange *ndr, int cnt,
+ int trueWgSize[], int trueWgSizeTotal,
+ LdsChunk *ldsChunk, uint64_t origSpillMemStart)
+{
+ wfCtx->cnt = cnt;
+
+ VectorMask init_mask;
+ init_mask.reset();
+
+ for (int k = 0; k < VSZ; ++k) {
+ if (k + cnt * VSZ < trueWgSizeTotal)
+ init_mask[k] = 1;
+ }
+
+ wfCtx->init_mask = init_mask.to_ullong();
+ wfCtx->exec_mask = init_mask.to_ullong();
+
+ for (int i = 0; i < VSZ; ++i) {
+ wfCtx->bar_cnt[i] = 0;
+ }
+
+ wfCtx->max_bar_cnt = 0;
+ wfCtx->old_barrier_cnt = 0;
+ wfCtx->barrier_cnt = 0;
+
+ wfCtx->privBase = ndr->q.privMemStart;
+ ndr->q.privMemStart += ndr->q.privMemPerItem * VSZ;
+
+ wfCtx->spillBase = ndr->q.spillMemStart;
+ ndr->q.spillMemStart += ndr->q.spillMemPerItem * VSZ;
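+ // each wavefront claims a contiguous VSZ-work-item slice of the
+ // queue's private and spill segments; bumping the running *MemStart
+ // pointers above makes the next wavefront start right after this slice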
+
+ wfCtx->pc = 0;
+ wfCtx->rpc = UINT32_MAX;
+
+ // set the wavefront context to have a pointer to this section of the LDS
+ wfCtx->ldsChunk = ldsChunk;
+
+ // WG state
+ wfCtx->wg_id = ndr->globalWgId;
+ wfCtx->barrier_id = barrier_id;
+
+ // Kernel wide state
+ wfCtx->ndr = ndr;
+}
+
+void
+ComputeUnit::updateEvents() {
+
+ if (!timestampVec.empty()) {
+ uint32_t vecSize = timestampVec.size();
+ uint32_t i = 0;
+ while (i < vecSize) {
+ if (timestampVec[i] <= shader->tick_cnt) {
+ std::pair<uint32_t, uint32_t> regInfo = regIdxVec[i];
+ vrf[regInfo.first]->markReg(regInfo.second, sizeof(uint32_t),
+ statusVec[i]);
+ timestampVec.erase(timestampVec.begin() + i);
+ regIdxVec.erase(regIdxVec.begin() + i);
+ statusVec.erase(statusVec.begin() + i);
+ --vecSize;
+ --i;
+ }
+ ++i;
+ }
+ }
+
+ for (int i = 0; i< numSIMDs; ++i) {
+ vrf[i]->updateEvents();
+ }
+}
+
+
+void
+ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
+ int trueWgSizeTotal)
+{
+ static int _n_wave = 0;
+ int cnt = wfCtx->cnt;
+ NDRange *ndr = wfCtx->ndr;
+
+ // Fill in Kernel state
+ FillKernelState(w, ndr);
+
+ w->kern_id = ndr->dispatchId;
+ w->dynwaveid = cnt;
+ w->init_mask = wfCtx->init_mask;
+
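+ // map each lane's flat position within the workgroup (k + cnt * VSZ)
+ // back to 3-D work-item IDs; e.g., for a 16x4x1 workgroup, flat
+ // position 21 becomes (x, y, z) = (5, 1, 0)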
+ for (int k = 0; k < VSZ; ++k) {
+ w->workitemid[0][k] = (k+cnt*VSZ) % trueWgSize[0];
+ w->workitemid[1][k] = ((k + cnt * VSZ) / trueWgSize[0]) % trueWgSize[1];
+ w->workitemid[2][k] = (k + cnt * VSZ) / (trueWgSize[0] * trueWgSize[1]);
+
+ w->workitemFlatId[k] = w->workitemid[2][k] * trueWgSize[0] *
+ trueWgSize[1] + w->workitemid[1][k] * trueWgSize[0] +
+ w->workitemid[0][k];
+ }
+
+ w->old_barrier_cnt = wfCtx->old_barrier_cnt;
+ w->barrier_cnt = wfCtx->barrier_cnt;
+ w->barrier_slots = divCeil(trueWgSizeTotal, VSZ);
+
+ for (int i = 0; i < VSZ; ++i) {
+ w->bar_cnt[i] = wfCtx->bar_cnt[i];
+ }
+
+ w->max_bar_cnt = wfCtx->max_bar_cnt;
+ w->privBase = wfCtx->privBase;
+ w->spillBase = wfCtx->spillBase;
+
+ w->pushToReconvergenceStack(wfCtx->pc, wfCtx->rpc, wfCtx->exec_mask);
+
+ // WG state
+ w->wg_id = wfCtx->wg_id;
+ w->dispatchid = wfCtx->ndr->dispatchId;
+ w->workgroupid[0] = w->wg_id % ndr->numWg[0];
+ w->workgroupid[1] = (w->wg_id / ndr->numWg[0]) % ndr->numWg[1];
+ w->workgroupid[2] = w->wg_id / (ndr->numWg[0] * ndr->numWg[1]);
+
+ w->barrier_id = wfCtx->barrier_id;
+ w->stalledAtBarrier = false;
+
+ // move this from the context into the actual wavefront
+ w->ldsChunk = wfCtx->ldsChunk;
+
+ int32_t refCount M5_VAR_USED =
+ lds.increaseRefCounter(w->dispatchid, w->wg_id);
+ DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n",
+ cu_id, w->wg_id, refCount);
+
+ w->instructionBuffer.clear();
+
+ if (w->pendingFetch)
+ w->dropFetch = true;
+
+ // if this is the last wavefront in the workgroup, set spillWidth to
+ // the number of remaining work-items so that the vector access is
+ // correct
+ if ((cnt + 1) * VSZ >= trueWgSizeTotal) {
+ w->spillWidth = trueWgSizeTotal - (cnt * VSZ);
+ } else {
+ w->spillWidth = VSZ;
+ }
+
+ DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
+ "WF[%d][%d]\n", _n_wave, barrier_id, cu_id, w->simdId, w->wfSlotId);
+
+ w->start(++_n_wave, ndr->q.code_ptr);
+}
+
+void
+ComputeUnit::StartWorkgroup(NDRange *ndr)
+{
+ // reserve the LDS capacity allocated to the work group
+ // disambiguated by the dispatch ID and workgroup ID, which should be
+ // globally unique
+ LdsChunk *ldsChunk = lds.reserveSpace(ndr->dispatchId, ndr->globalWgId,
+ ndr->q.ldsSize);
+
+ // Send L1 cache acquire
+ // isKernel + isAcquire = Kernel Begin
+ if (shader->impl_kern_boundary_sync) {
+ GPUDynInstPtr gpuDynInst = std::make_shared<GPUDynInst>(nullptr,
+ nullptr,
+ nullptr, 0);
+
+ gpuDynInst->useContinuation = false;
+ gpuDynInst->memoryOrder = Enums::MEMORY_ORDER_SC_ACQUIRE;
+ gpuDynInst->scope = Enums::MEMORY_SCOPE_SYSTEM;
+ injectGlobalMemFence(gpuDynInst, true);
+ }
+
+ // Get true size of workgroup (after clamping to grid size)
+ int trueWgSize[3];
+ int trueWgSizeTotal = 1;
+
+ for (int d = 0; d < 3; ++d) {
+ trueWgSize[d] = std::min(ndr->q.wgSize[d], ndr->q.gdSize[d] -
+ ndr->wgId[d] * ndr->q.wgSize[d]);
+
+ trueWgSizeTotal *= trueWgSize[d];
+ }
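+ // e.g., a 64-wide WG dimension launched over a 100-wide grid leaves
+ // the last WG in that dimension with only 100 - 64 = 36 valid
+ // work-items in that dimension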
+
+ uint64_t origSpillMemStart = ndr->q.spillMemStart;
+ // calculate the number of 32-bit vector registers required by wavefront
+ int vregDemand = ndr->q.sRegCount + (2 * ndr->q.dRegCount);
+ int cnt = 0;
+
+ // Assign WFs by spreading them across SIMDs, 1 WF per SIMD at a time
+ for (int m = 0; m < shader->n_wf * numSIMDs; ++m) {
+ Wavefront *w = wfList[m % numSIMDs][m / numSIMDs];
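+ // with e.g. 4 SIMDs, m = 0..3 pick WF slot 0 of SIMDs 0..3,
+ // m = 4..7 pick slot 1, and so on, spreading the WG's waves
+ // round-robin across the SIMD units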
+ // Check if this wavefront slot is available:
+ // it must be stopped and not waiting for a release
+ // to complete (i.e., not in S_RETURNING)
+ if (w->status == Wavefront::S_STOPPED) {
+ // if we have scheduled all work items then stop
+ // scheduling wavefronts
+ if (cnt * VSZ >= trueWgSizeTotal)
+ break;
+
+ // reserve vector registers for the scheduled wavefront
+ assert(vectorRegsReserved[m % numSIMDs] <= numVecRegsPerSimd);
+ uint32_t normSize = 0;
+
+ w->startVgprIndex = vrf[m % numSIMDs]->manager->
+ allocateRegion(vregDemand, &normSize);
+
+ w->reservedVectorRegs = normSize;
+ vectorRegsReserved[m % numSIMDs] += w->reservedVectorRegs;
+
+ WFContext wfCtx;
+
+ InitializeWFContext(&wfCtx, ndr, cnt, trueWgSize, trueWgSizeTotal,
+ ldsChunk, origSpillMemStart);
+
+ StartWF(w, &wfCtx, trueWgSize, trueWgSizeTotal);
+ ++cnt;
+ }
+ }
+ ++barrier_id;
+}
+
+int
+ComputeUnit::ReadyWorkgroup(NDRange *ndr)
+{
+ // Get true size of workgroup (after clamping to grid size)
+ int trueWgSize[3];
+ int trueWgSizeTotal = 1;
+
+ for (int d = 0; d < 3; ++d) {
+ trueWgSize[d] = std::min(ndr->q.wgSize[d], ndr->q.gdSize[d] -
+ ndr->wgId[d] * ndr->q.wgSize[d]);
+
+ trueWgSizeTotal *= trueWgSize[d];
+ DPRINTF(GPUDisp, "trueWgSize[%d] = %d\n", d, trueWgSize[d]);
+ }
+
+ DPRINTF(GPUDisp, "trueWgSizeTotal = %d\n", trueWgSizeTotal);
+
+ // calculate the number of 32-bit vector registers required by each
+ // work item of the work group
+ int vregDemandPerWI = ndr->q.sRegCount + (2 * ndr->q.dRegCount);
+ bool vregAvail = true;
+ int numWfs = (trueWgSizeTotal + VSZ - 1) / VSZ;
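+ // e.g., a 100 work-item WG with a 64-lane wavefront needs
+ // ceil(100/64) = 2 WFs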
+ int freeWfSlots = 0;
+ // check if the total number of VGPRs required by all WFs of the WG
+ // fit in the VRFs of all SIMD units
+ assert((numWfs * vregDemandPerWI) <= (numSIMDs * numVecRegsPerSimd));
+ int numMappedWfs = 0;
+ std::vector<int> numWfsPerSimd;
+ numWfsPerSimd.resize(numSIMDs, 0);
+ // find how many free WF slots we have across all SIMDs
+ for (int j = 0; j < shader->n_wf; ++j) {
+ for (int i = 0; i < numSIMDs; ++i) {
+ if (wfList[i][j]->status == Wavefront::S_STOPPED) {
+ // count the number of free WF slots
+ ++freeWfSlots;
+ if (numMappedWfs < numWfs) {
+ // count the WFs to be assigned per SIMD
+ numWfsPerSimd[i]++;
+ }
+ numMappedWfs++;
+ }
+ }
+ }
+
+ // if there are enough free WF slots then find if there are enough
+ // free VGPRs per SIMD based on the WF->SIMD mapping
+ if (freeWfSlots >= numWfs) {
+ for (int j = 0; j < numSIMDs; ++j) {
+ // find if there are enough free VGPR regions in the SIMD's VRF
+ // to accommodate the WFs of the new WG that would be mapped to
+ // this SIMD unit
+ vregAvail = vrf[j]->manager->canAllocate(numWfsPerSimd[j],
+ vregDemandPerWI);
+
+ // stop searching if there is at least one SIMD
+ // whose VRF does not have enough free VGPR pools.
+ // This is because a WG is scheduled only if ALL
+ // of its WFs can be scheduled
+ if (!vregAvail)
+ break;
+ }
+ }
+
+ DPRINTF(GPUDisp, "Free WF slots = %d, VGPR Availability = %d\n",
+ freeWfSlots, vregAvail);
+
+ if (!vregAvail) {
+ ++numTimesWgBlockedDueVgprAlloc;
+ }
+
+ // count how often a WG could not be dispatched because the LDS
+ // does not have enough free space
+ if (!lds.canReserve(ndr->q.ldsSize)) {
+ wgBlockedDueLdsAllocation++;
+ }
+
+ // Return true if (a) there are enough free WF slots to submit
+ // workgroup and (b) if there are enough VGPRs to schedule all WFs to their
+ // SIMD units and (c) if there is enough space in LDS
+ return freeWfSlots >= numWfs && vregAvail && lds.canReserve(ndr->q.ldsSize);
+}
+
+int
+ComputeUnit::AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots)
+{
+ DPRINTF(GPUSync, "CU%d: Checking for All At Barrier\n", cu_id);
+ int ccnt = 0;
+
+ for (int i_simd = 0; i_simd < numSIMDs; ++i_simd) {
+ for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf) {
+ Wavefront *w = wfList[i_simd][i_wf];
+
+ if (w->status == Wavefront::S_RUNNING) {
+ DPRINTF(GPUSync, "Checking WF[%d][%d]\n", i_simd, i_wf);
+
+ DPRINTF(GPUSync, "wf->barrier_id = %d, _barrier_id = %d\n",
+ w->barrier_id, _barrier_id);
+
+ DPRINTF(GPUSync, "wf->barrier_cnt %d, bcnt = %d\n",
+ w->barrier_cnt, bcnt);
+ }
+
+ if (w->status == Wavefront::S_RUNNING &&
+ w->barrier_id == _barrier_id && w->barrier_cnt == bcnt &&
+ !w->outstanding_reqs) {
+ ++ccnt;
+
+ DPRINTF(GPUSync, "WF[%d][%d] at barrier, increment ccnt to "
+ "%d\n", i_simd, i_wf, ccnt);
+ }
+ }
+ }
+
+ DPRINTF(GPUSync, "CU%d: returning allAtBarrier ccnt = %d, bslots = %d\n",
+ cu_id, ccnt, bslots);
+
+ return ccnt == bslots;
+}
+
+// Check if the current wavefront is blocked on additional resources.
+bool
+ComputeUnit::cedeSIMD(int simdId, int wfSlotId)
+{
+ bool cede = false;
+
+ // If --xact-cas-mode option is enabled in run.py, then xact_cas_ld
+ // magic instructions will impact the scheduling of wavefronts
+ if (xact_cas_mode) {
+ /*
+ * When a wavefront calls xact_cas_ld, it adds itself to a per address
+ * queue. All per address queues are managed by the xactCasLoadMap.
+ *
+ * A wavefront is not blocked if: it is not in ANY per address queue or
+ * if it is at the head of a per address queue.
+ */
+ for (auto itMap : xactCasLoadMap) {
+ std::list<waveIdentifier> curWaveIDQueue = itMap.second.waveIDQueue;
+
+ if (!curWaveIDQueue.empty()) {
+ for (auto it : curWaveIDQueue) {
+ waveIdentifier cur_wave = it;
+
+ if (cur_wave.simdId == simdId &&
+ cur_wave.wfSlotId == wfSlotId) {
+ // 2 possibilities
+ // 1: this WF has a green light
+ // 2: another WF has a green light
+ waveIdentifier owner_wave = curWaveIDQueue.front();
+
+ if (owner_wave.simdId != cur_wave.simdId ||
+ owner_wave.wfSlotId != cur_wave.wfSlotId) {
+ // possibility 2
+ cede = true;
+ break;
+ } else {
+ // possibility 1
+ break;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ return cede;
+}
+
+// Execute one clock worth of work on the ComputeUnit.
+void
+ComputeUnit::exec()
+{
+ updateEvents();
+ // Execute pipeline stages in reverse order to simulate
+ // the pipeline latency
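+ // (running the later stages first means each stage consumes state its
+ // producer wrote in an earlier cycle, so a wavefront advances at most
+ // one stage per exec() call)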
+ globalMemoryPipe.exec();
+ localMemoryPipe.exec();
+ execStage.exec();
+ scheduleStage.exec();
+ scoreboardCheckStage.exec();
+ fetchStage.exec();
+
+ totalCycles++;
+}
+
+void
+ComputeUnit::init()
+{
+ // Initialize CU Bus models
+ glbMemToVrfBus.init(&shader->tick_cnt, 1);
+ locMemToVrfBus.init(&shader->tick_cnt, 1);
+ nextGlbMemBus = 0;
+ nextLocMemBus = 0;
+ fatal_if(numGlbMemUnits > 1,
+ "No support for multiple Global Memory Pipelines exists!!!");
+ vrfToGlobalMemPipeBus.resize(numGlbMemUnits);
+ for (int j = 0; j < numGlbMemUnits; ++j) {
+ vrfToGlobalMemPipeBus[j] = WaitClass();
+ vrfToGlobalMemPipeBus[j].init(&shader->tick_cnt, 1);
+ }
+
+ fatal_if(numLocMemUnits > 1,
+ "No support for multiple Local Memory Pipelines exists!!!");
+ vrfToLocalMemPipeBus.resize(numLocMemUnits);
+ for (int j = 0; j < numLocMemUnits; ++j) {
+ vrfToLocalMemPipeBus[j] = WaitClass();
+ vrfToLocalMemPipeBus[j].init(&shader->tick_cnt, 1);
+ }
+ vectorRegsReserved.resize(numSIMDs, 0);
+ aluPipe.resize(numSIMDs);
+ wfWait.resize(numSIMDs + numLocMemUnits + numGlbMemUnits);
+
+ for (int i = 0; i < numSIMDs + numLocMemUnits + numGlbMemUnits; ++i) {
+ wfWait[i] = WaitClass();
+ wfWait[i].init(&shader->tick_cnt, 1);
+ }
+
+ for (int i = 0; i < numSIMDs; ++i) {
+ aluPipe[i] = WaitClass();
+ aluPipe[i].init(&shader->tick_cnt, 1);
+ }
+
+ // Setup space for call args
+ for (int j = 0; j < numSIMDs; ++j) {
+ for (int i = 0; i < shader->n_wf; ++i) {
+ wfList[j][i]->initCallArgMem(shader->funcargs_size);
+ }
+ }
+
+ // Initializing pipeline resources
+ readyList.resize(numSIMDs + numGlbMemUnits + numLocMemUnits);
+ waveStatusList.resize(numSIMDs);
+
+ for (int j = 0; j < numSIMDs; ++j) {
+ for (int i = 0; i < shader->n_wf; ++i) {
+ waveStatusList[j].push_back(
+ std::make_pair(wfList[j][i], BLOCKED));
+ }
+ }
+
+ for (int j = 0; j < (numSIMDs + numGlbMemUnits + numLocMemUnits); ++j) {
+ dispatchList.push_back(std::make_pair((Wavefront*)nullptr, EMPTY));
+ }
+
+ fetchStage.init(this);
+ scoreboardCheckStage.init(this);
+ scheduleStage.init(this);
+ execStage.init(this);
+ globalMemoryPipe.init(this);
+ localMemoryPipe.init(this);
+ // initialize state for statistics calculation
+ vectorAluInstAvail.resize(numSIMDs, false);
+ shrMemInstAvail = 0;
+ glbMemInstAvail = 0;
+}
+
+bool
+ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt)
+{
+ // Ruby has completed the memory op. Schedule the mem_resp_event at the
+ // appropriate cycle to process the timing memory response
+ // This delay represents the pipeline delay
+ SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
+ int index = sender_state->port_index;
+ GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
+
+ // Is the returned packet for a kernel end or a barrier?
+ if (pkt->req->isKernel() && pkt->req->isRelease()) {
+ Wavefront *w =
+ computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId];
+
+ // Check if we are waiting on Kernel End Release
+ if (w->status == Wavefront::S_RETURNING) {
+ DPRINTF(GPUDisp, "CU%d: WF[%d][%d][wv=%d]: WG id completed %d\n",
+ computeUnit->cu_id, w->simdId, w->wfSlotId,
+ w->wfDynId, w->kern_id);
+
+ computeUnit->shader->dispatcher->notifyWgCompl(w);
+ w->status = Wavefront::S_STOPPED;
+ } else {
+ w->outstanding_reqs--;
+ }
+
+ DPRINTF(GPUSync, "CU%d: WF[%d][%d]: barrier_cnt = %d\n",
+ computeUnit->cu_id, gpuDynInst->simdId,
+ gpuDynInst->wfSlotId, w->barrier_cnt);
+
+ if (gpuDynInst->useContinuation) {
+ assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE);
+ gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
+ gpuDynInst);
+ }
+
+ delete pkt->senderState;
+ delete pkt->req;
+ delete pkt;
+ return true;
+ } else if (pkt->req->isKernel() && pkt->req->isAcquire()) {
+ if (gpuDynInst->useContinuation) {
+ assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE);
+ gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
+ gpuDynInst);
+ }
+
+ delete pkt->senderState;
+ delete pkt->req;
+ delete pkt;
+ return true;
+ }
+
+ ComputeUnit::DataPort::MemRespEvent *mem_resp_event =
+ new ComputeUnit::DataPort::MemRespEvent(computeUnit->memPort[index],
+ pkt);
+
+ DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x received!\n",
+ computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
+ index, pkt->req->getPaddr());
+
+ computeUnit->schedule(mem_resp_event,
+ curTick() + computeUnit->resp_tick_latency);
+ return true;
+}
+
+void
+ComputeUnit::DataPort::recvReqRetry()
+{
+ int len = retries.size();
+
+ assert(len > 0);
+
+ for (int i = 0; i < len; ++i) {
+ PacketPtr pkt = retries.front().first;
+ GPUDynInstPtr gpuDynInst M5_VAR_USED = retries.front().second;
+ DPRINTF(GPUMem, "CU%d: WF[%d][%d]: retry mem inst addr %#x\n",
+ computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
+ pkt->req->getPaddr());
+
+ /** Currently Ruby can return false due to conflicts for the particular
+ * cache block or address. Thus other requests should be allowed to
+ * pass and the data port should expect multiple retries. */
+ if (!sendTimingReq(pkt)) {
+ DPRINTF(GPUMem, "failed again!\n");
+ break;
+ } else {
+ DPRINTF(GPUMem, "successful!\n");
+ retries.pop_front();
+ }
+ }
+}
+
+bool
+ComputeUnit::SQCPort::recvTimingResp(PacketPtr pkt)
+{
+ computeUnit->fetchStage.processFetchReturn(pkt);
+
+ return true;
+}
+
+void
+ComputeUnit::SQCPort::recvReqRetry()
+{
+ int len = retries.size();
+
+ assert(len > 0);
+
+ for (int i = 0; i < len; ++i) {
+ PacketPtr pkt = retries.front().first;
+ Wavefront *wavefront M5_VAR_USED = retries.front().second;
+ DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: retrying FETCH addr %#x\n",
+ computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
+ pkt->req->getPaddr());
+ if (!sendTimingReq(pkt)) {
+ DPRINTF(GPUFetch, "failed again!\n");
+ break;
+ } else {
+ DPRINTF(GPUFetch, "successful!\n");
+ retries.pop_front();
+ }
+ }
+}
+
+void
+ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt)
+{
+ // There must be a way around this check to do the globalMemStart...
+ Addr tmp_vaddr = pkt->req->getVaddr();
+
+ updatePageDivergenceDist(tmp_vaddr);
+
+ pkt->req->setVirt(pkt->req->getAsid(), tmp_vaddr, pkt->req->getSize(),
+ pkt->req->getFlags(), pkt->req->masterId(),
+ pkt->req->getPC());
+
+ // figure out the type of the request to set read/write
+ BaseTLB::Mode TLB_mode;
+ assert(pkt->isRead() || pkt->isWrite());
+
+ // Check write before read for atomic operations
+ // since atomic operations should use BaseTLB::Write
+ if (pkt->isWrite()){
+ TLB_mode = BaseTLB::Write;
+ } else if (pkt->isRead()) {
+ TLB_mode = BaseTLB::Read;
+ } else {
+ fatal("pkt is not a read nor a write\n");
+ }
+
+ tlbCycles -= curTick();
+ ++tlbRequests;
+
+ int tlbPort_index = perLaneTLB ? index : 0;
+
+ if (shader->timingSim) {
+ if (debugSegFault) {
+ Process *p = shader->gpuTc->getProcessPtr();
+ Addr vaddr = pkt->req->getVaddr();
+ unsigned size = pkt->getSize();
+
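+ // the check below flags accesses that spill over a 64-byte boundary:
+ // if the last byte's offset within its 64 B chunk is smaller than the
+ // first byte's, the access wrapped into the next chunk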
+ if ((vaddr + size - 1) % 64 < vaddr % 64) {
+ panic("CU%d: WF[%d][%d]: Access to addr %#x is unaligned!\n",
+ cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, vaddr);
+ }
+
+ Addr paddr;
+
+ if (!p->pTable->translate(vaddr, paddr)) {
+ if (!p->fixupStackFault(vaddr)) {
+ panic("CU%d: WF[%d][%d]: Fault on addr %#x!\n",
+ cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
+ vaddr);
+ }
+ }
+ }
+
+ // This is the SenderState needed upon return
+ pkt->senderState = new DTLBPort::SenderState(gpuDynInst, index);
+
+ // This is the senderState needed by the TLB hierarchy to function
+ TheISA::GpuTLB::TranslationState *translation_state =
+ new TheISA::GpuTLB::TranslationState(TLB_mode, shader->gpuTc, false,
+ pkt->senderState);
+
+ pkt->senderState = translation_state;
+
+ if (functionalTLB) {
+ tlbPort[tlbPort_index]->sendFunctional(pkt);
+
+ // update the hitLevel distribution
+ int hit_level = translation_state->hitLevel;
+ assert(hit_level != -1);
+ hitsPerTLBLevel[hit_level]++;
+
+ // New SenderState for the memory access
+ X86ISA::GpuTLB::TranslationState *sender_state =
+ safe_cast<X86ISA::GpuTLB::TranslationState*>(pkt->senderState);
+
+ delete sender_state->tlbEntry;
+ delete sender_state->saved;
+ delete sender_state;
+
+ assert(pkt->req->hasPaddr());
+ assert(pkt->req->hasSize());
+
+ uint8_t *tmpData = pkt->getPtr<uint8_t>();
+
+ // this is necessary because the GPU TLB receives packets instead
+ // of requests. When the translation is complete, all relevant
+ // fields in the request will be populated, but not in the packet.
+ // Here we create the new packet so we can set the size, addr,
+ // and proper flags.
+ PacketPtr oldPkt = pkt;
+ pkt = new Packet(oldPkt->req, oldPkt->cmd);
+ delete oldPkt;
+ pkt->dataStatic(tmpData);
+
+
+ // New SenderState for the memory access
+ pkt->senderState = new ComputeUnit::DataPort::SenderState(gpuDynInst,
+ index, nullptr);
+
+ gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index);
+ gpuDynInst->tlbHitLevel[index] = hit_level;
+
+
+ // translation is done. Schedule the mem_req_event at the
+ // appropriate cycle to send the timing memory request to ruby
+ ComputeUnit::DataPort::MemReqEvent *mem_req_event =
+ new ComputeUnit::DataPort::MemReqEvent(memPort[index], pkt);
+
+ DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data "
+ "scheduled\n", cu_id, gpuDynInst->simdId,
+ gpuDynInst->wfSlotId, index, pkt->req->getPaddr());
+
+ schedule(mem_req_event, curTick() + req_tick_latency);
+ } else if (tlbPort[tlbPort_index]->isStalled()) {
+ assert(tlbPort[tlbPort_index]->retries.size() > 0);
+
+ DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
+ "failed!\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
+ tmp_vaddr);
+
+ tlbPort[tlbPort_index]->retries.push_back(pkt);
+ } else if (!tlbPort[tlbPort_index]->sendTimingReq(pkt)) {
+ // Stall the data port;
+ // no more packets will be issued until
+ // Ruby indicates resources are freed by
+ // a recvReqRetry() callback on this port.
+ tlbPort[tlbPort_index]->stallPort();
+
+ DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
+ "failed!\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
+ tmp_vaddr);
+
+ tlbPort[tlbPort_index]->retries.push_back(pkt);
+ } else {
+ DPRINTF(GPUTLB,
+ "CU%d: WF[%d][%d]: Translation for addr %#x sent!\n",
+ cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, tmp_vaddr);
+ }
+ } else {
+ if (pkt->cmd == MemCmd::MemFenceReq) {
+ gpuDynInst->statusBitVector = VectorMask(0);
+ } else {
+ gpuDynInst->statusBitVector &= (~(1ll << index));
+ }
+
+ // New SenderState for the memory access
+ delete pkt->senderState;
+
+ // Because it's an atomic operation, we only need the TLB translation state
+ pkt->senderState = new TheISA::GpuTLB::TranslationState(TLB_mode,
+ shader->gpuTc);
+
+ tlbPort[tlbPort_index]->sendFunctional(pkt);
+
+ // the addr of the packet is not modified, so we need to create a new
+ // packet; otherwise the memory access would use the old virtual
+ // address sent in the translation packet, instead of the physical
+ // address returned by the translation.
+ PacketPtr new_pkt = new Packet(pkt->req, pkt->cmd);
+ new_pkt->dataStatic(pkt->getPtr<uint8_t>());
+
+ // Translation is done. It is safe to send the packet to memory.
+ memPort[0]->sendFunctional(new_pkt);
+
+ DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index %d: addr %#x\n", cu_id,
+ gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
+ new_pkt->req->getPaddr());
+
+ // safe_cast the senderState
+ TheISA::GpuTLB::TranslationState *sender_state =
+ safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
+
+ delete sender_state->tlbEntry;
+ delete new_pkt;
+ delete pkt->senderState;
+ delete pkt->req;
+ delete pkt;
+ }
+}
+
+void
+ComputeUnit::sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt)
+{
+ ComputeUnit::DataPort::MemReqEvent *mem_req_event =
+ new ComputeUnit::DataPort::MemReqEvent(memPort[index], pkt);
+
+
+ // New SenderState for the memory access
+ pkt->senderState = new ComputeUnit::DataPort::SenderState(gpuDynInst, index,
+ nullptr);
+
+ DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n",
+ cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
+ pkt->req->getPaddr());
+
+ schedule(mem_req_event, curTick() + req_tick_latency);
+}
+
+void
+ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst, bool kernelLaunch,
+ Request* req)
+{
+ if (!req) {
+ req = new Request(0, 0, 0, 0, masterId(), 0, gpuDynInst->wfDynId, -1);
+ }
+ req->setPaddr(0);
+ if (kernelLaunch) {
+ req->setFlags(Request::KERNEL);
+ }
+
+ gpuDynInst->s_type = SEG_GLOBAL;
+
+ // for non-kernel MemFence operations, memorder flags are set depending
+ // on which type of request is currently being sent, so this
+ // should be set by the caller (e.g. if an inst has acq-rel
+ // semantics, it will send one acquire req and one release req)
+ gpuDynInst->setRequestFlags(req, kernelLaunch);
+
+ // a mem fence must correspond to an acquire/release request
+ assert(req->isAcquire() || req->isRelease());
+
+ // create packet
+ PacketPtr pkt = new Packet(req, MemCmd::MemFenceReq);
+
+ // set packet's sender state
+ pkt->senderState =
+ new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr);
+
+ // send the packet
+ sendSyncRequest(gpuDynInst, 0, pkt);
+}
+
+const char*
+ComputeUnit::DataPort::MemRespEvent::description() const
+{
+ return "ComputeUnit memory response event";
+}
+
+void
+ComputeUnit::DataPort::MemRespEvent::process()
+{
+ DataPort::SenderState *sender_state =
+ safe_cast<DataPort::SenderState*>(pkt->senderState);
+
+ GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
+ ComputeUnit *compute_unit = dataPort->computeUnit;
+
+ assert(gpuDynInst);
+
+ DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Response for addr %#x, index %d\n",
+ compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
+ pkt->req->getPaddr(), dataPort->index);
+
+ Addr paddr = pkt->req->getPaddr();
+
+ if (pkt->cmd != MemCmd::MemFenceResp) {
+ int index = gpuDynInst->memStatusVector[paddr].back();
+
+ DPRINTF(GPUMem, "Response for addr %#x, index %d\n",
+ pkt->req->getPaddr(), index);
+
+ gpuDynInst->memStatusVector[paddr].pop_back();
+ gpuDynInst->pAddr = pkt->req->getPaddr();
+
+ if (pkt->isRead() || pkt->isWrite()) {
+
+ if (gpuDynInst->n_reg <= MAX_REGS_FOR_NON_VEC_MEM_INST) {
+ gpuDynInst->statusBitVector &= (~(1ULL << index));
+ } else {
+ assert(gpuDynInst->statusVector[index] > 0);
+ gpuDynInst->statusVector[index]--;
+
+ if (!gpuDynInst->statusVector[index])
+ gpuDynInst->statusBitVector &= (~(1ULL << index));
+ }
+
+ DPRINTF(GPUMem, "bitvector is now %#x\n",
+ gpuDynInst->statusBitVector);
+
+ if (gpuDynInst->statusBitVector == VectorMask(0)) {
+ auto iter = gpuDynInst->memStatusVector.begin();
+ auto end = gpuDynInst->memStatusVector.end();
+
+ while (iter != end) {
+ assert(iter->second.empty());
+ ++iter;
+ }
+
+ gpuDynInst->memStatusVector.clear();
+
+ if (gpuDynInst->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST)
+ gpuDynInst->statusVector.clear();
+
+ if (gpuDynInst->m_op == Enums::MO_LD || MO_A(gpuDynInst->m_op)
+ || MO_ANR(gpuDynInst->m_op)) {
+ assert(compute_unit->globalMemoryPipe.isGMLdRespFIFOWrRdy());
+
+ compute_unit->globalMemoryPipe.getGMLdRespFIFO()
+ .push(gpuDynInst);
+ } else {
+ assert(compute_unit->globalMemoryPipe.isGMStRespFIFOWrRdy());
+
+ compute_unit->globalMemoryPipe.getGMStRespFIFO()
+ .push(gpuDynInst);
+ }
+
+ DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n",
+ compute_unit->cu_id, gpuDynInst->simdId,
+ gpuDynInst->wfSlotId);
+
+ // after clearing the status vectors,
+ // see if there is a continuation to perform
+ // the continuation may generate more work for
+ // this memory request
+ if (gpuDynInst->useContinuation) {
+ assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE);
+ gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
+ gpuDynInst);
+ }
+ }
+ }
+ } else {
+ gpuDynInst->statusBitVector = VectorMask(0);
+
+ if (gpuDynInst->useContinuation) {
+ assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE);
+ gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
+ gpuDynInst);
+ }
+ }
+
+ delete pkt->senderState;
+ delete pkt->req;
+ delete pkt;
+}
+
+ComputeUnit*
+ComputeUnitParams::create()
+{
+ return new ComputeUnit(this);
+}
+
+bool
+ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt)
+{
+ Addr line = pkt->req->getPaddr();
+
+ DPRINTF(GPUTLB, "CU%d: DTLBPort received %#x->%#x\n", computeUnit->cu_id,
+ pkt->req->getVaddr(), line);
+
+ assert(pkt->senderState);
+ computeUnit->tlbCycles += curTick();
+
+ // pop off the TLB translation state
+ TheISA::GpuTLB::TranslationState *translation_state =
+ safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
+
+ // no PageFaults are permitted for data accesses
+ if (!translation_state->tlbEntry->valid) {
+ DTLBPort::SenderState *sender_state =
+ safe_cast<DTLBPort::SenderState*>(translation_state->saved);
+
+ Wavefront *w M5_VAR_USED =
+ computeUnit->wfList[sender_state->_gpuDynInst->simdId]
+ [sender_state->_gpuDynInst->wfSlotId];
+
+ DPRINTFN("Wave %d couldn't translate vaddr %#x\n", w->wfDynId,
+ pkt->req->getVaddr());
+ }
+
+ assert(translation_state->tlbEntry->valid);
+
+ // update the hitLevel distribution
+ int hit_level = translation_state->hitLevel;
+ computeUnit->hitsPerTLBLevel[hit_level]++;
+
+ delete translation_state->tlbEntry;
+ assert(!translation_state->ports.size());
+ pkt->senderState = translation_state->saved;
+
+ // for prefetch pkt
+ BaseTLB::Mode TLB_mode = translation_state->tlbMode;
+
+ delete translation_state;
+
+ // use the original sender state to know how to close this transaction
+ DTLBPort::SenderState *sender_state =
+ safe_cast<DTLBPort::SenderState*>(pkt->senderState);
+
+ GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
+ int mp_index = sender_state->portIndex;
+ Addr vaddr = pkt->req->getVaddr();
+ gpuDynInst->memStatusVector[line].push_back(mp_index);
+ gpuDynInst->tlbHitLevel[mp_index] = hit_level;
+
+ MemCmd requestCmd;
+
+ if (pkt->cmd == MemCmd::ReadResp) {
+ requestCmd = MemCmd::ReadReq;
+ } else if (pkt->cmd == MemCmd::WriteResp) {
+ requestCmd = MemCmd::WriteReq;
+ } else if (pkt->cmd == MemCmd::SwapResp) {
+ requestCmd = MemCmd::SwapReq;
+ } else {
+ panic("unsupported response to request conversion %s\n",
+ pkt->cmd.toString());
+ }
+
+ if (computeUnit->prefetchDepth) {
+ int simdId = gpuDynInst->simdId;
+ int wfSlotId = gpuDynInst->wfSlotId;
+ Addr last = 0;
+
+ switch(computeUnit->prefetchType) {
+ case Enums::PF_CU:
+ last = computeUnit->lastVaddrCU[mp_index];
+ break;
+ case Enums::PF_PHASE:
+ last = computeUnit->lastVaddrPhase[simdId][mp_index];
+ break;
+ case Enums::PF_WF:
+ last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];
+ break;
+ default:
+ break;
+ }
+
+ DPRINTF(GPUPrefetch, "CU[%d][%d][%d][%d]: %#x was last\n",
+ computeUnit->cu_id, simdId, wfSlotId, mp_index, last);
+
+ int stride = last ? (roundDown(vaddr, TheISA::PageBytes) -
+ roundDown(last, TheISA::PageBytes)) >> TheISA::PageShift
+ : 0;
+
+ DPRINTF(GPUPrefetch, "Stride is %d\n", stride);
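+ // e.g., with 4 KB x86 pages, if the previous access fell on page N
+ // and this one on page N + 2, the stride is 2 and (for a prefetch
+ // depth of 3) the loop below would prefetch pages N+4, N+6 and N+8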
+
+ computeUnit->lastVaddrCU[mp_index] = vaddr;
+ computeUnit->lastVaddrPhase[simdId][mp_index] = vaddr;
+ computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr;
+
+ stride = (computeUnit->prefetchType == Enums::PF_STRIDE) ?
+ computeUnit->prefetchStride: stride;
+
+ DPRINTF(GPUPrefetch, "%#x to: CU[%d][%d][%d][%d]\n", vaddr,
+ computeUnit->cu_id, simdId, wfSlotId, mp_index);
+
+ DPRINTF(GPUPrefetch, "Prefetching from %#x:", vaddr);
+
+ // Prefetch Next few pages atomically
+ for (int pf = 1; pf <= computeUnit->prefetchDepth; ++pf) {
+ DPRINTF(GPUPrefetch, "%d * %d: %#x\n", pf, stride,
+ vaddr+stride*pf*TheISA::PageBytes);
+
+ if (!stride)
+ break;
+
+ Request *prefetch_req = new Request(0, vaddr + stride * pf *
+ TheISA::PageBytes,
+ sizeof(uint8_t), 0,
+ computeUnit->masterId(),
+ 0, 0, 0);
+
+ PacketPtr prefetch_pkt = new Packet(prefetch_req, requestCmd);
+ uint8_t foo = 0;
+ prefetch_pkt->dataStatic(&foo);
+
+ // Because it's an atomic operation, we only need the TLB translation state
+ prefetch_pkt->senderState =
+ new TheISA::GpuTLB::TranslationState(TLB_mode,
+ computeUnit->shader->gpuTc,
+ true);
+
+ // Currently prefetches are zero-latency, hence the sendFunctional
+ sendFunctional(prefetch_pkt);
+
+ /* safe_cast the senderState */
+ TheISA::GpuTLB::TranslationState *tlb_state =
+ safe_cast<TheISA::GpuTLB::TranslationState*>(
+ prefetch_pkt->senderState);
+
+
+ delete tlb_state->tlbEntry;
+ delete tlb_state;
+ delete prefetch_pkt->req;
+ delete prefetch_pkt;
+ }
+ }
+
+ // First we must convert the response cmd back to a request cmd so that
+ // the request can be sent through the cu's master port
+ PacketPtr new_pkt = new Packet(pkt->req, requestCmd);
+ new_pkt->dataStatic(pkt->getPtr<uint8_t>());
+ delete pkt->senderState;
+ delete pkt;
+
+ // New SenderState for the memory access
+ new_pkt->senderState =
+ new ComputeUnit::DataPort::SenderState(gpuDynInst, mp_index,
+ nullptr);
+
+ // translation is done. Schedule the mem_req_event at the appropriate
+ // cycle to send the timing memory request to ruby
+ ComputeUnit::DataPort::MemReqEvent *mem_req_event =
+ new ComputeUnit::DataPort::MemReqEvent(computeUnit->memPort[mp_index],
+ new_pkt);
+
+ DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data scheduled\n",
+ computeUnit->cu_id, gpuDynInst->simdId,
+ gpuDynInst->wfSlotId, mp_index, new_pkt->req->getPaddr());
+
+ computeUnit->schedule(mem_req_event, curTick() +
+ computeUnit->req_tick_latency);
+
+ return true;
+}
+
+const char*
+ComputeUnit::DataPort::MemReqEvent::description() const
+{
+ return "ComputeUnit memory request event";
+}
+
+void
+ComputeUnit::DataPort::MemReqEvent::process()
+{
+ SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
+ GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
+ ComputeUnit *compute_unit M5_VAR_USED = dataPort->computeUnit;
+
+ if (!(dataPort->sendTimingReq(pkt))) {
+ dataPort->retries.push_back(std::make_pair(pkt, gpuDynInst));
+
+ DPRINTF(GPUPort,
+ "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n",
+ compute_unit->cu_id, gpuDynInst->simdId,
+ gpuDynInst->wfSlotId, dataPort->index,
+ pkt->req->getPaddr());
+ } else {
+ DPRINTF(GPUPort,
+ "CU%d: WF[%d][%d]: index %d, addr %#x data req sent!\n",
+ compute_unit->cu_id, gpuDynInst->simdId,
+ gpuDynInst->wfSlotId, dataPort->index,
+ pkt->req->getPaddr());
+ }
+}
+
+/*
+ * The initial translation request could have been rejected, if
+ * <retries> queue is not empty. Retry sending the translation
+ * request. sendRetry() is called from the peer port whenever
+ * a translation completes.
+ */
+void
+ComputeUnit::DTLBPort::recvReqRetry()
+{
+ int len = retries.size();
+
+ DPRINTF(GPUTLB, "CU%d: DTLB recvReqRetry - %d pending requests\n",
+ computeUnit->cu_id, len);
+
+ assert(len > 0);
+ assert(isStalled());
+ // recvReqRetry is an indication that the resource on which this
+ // port was stalling on is freed. So, remove the stall first
+ unstallPort();
+
+ for (int i = 0; i < len; ++i) {
+ PacketPtr pkt = retries.front();
+ Addr vaddr M5_VAR_USED = pkt->req->getVaddr();
+ DPRINTF(GPUTLB, "CU%d: retrying D-translation for address %#x",
+ computeUnit->cu_id, vaddr);
+
+ if (!sendTimingReq(pkt)) {
+ // Stall port
+ stallPort();
+ DPRINTF(GPUTLB, ": failed again\n");
+ break;
+ } else {
+ DPRINTF(GPUTLB, ": successful\n");
+ retries.pop_front();
+ }
+ }
+}
+
+bool
+ComputeUnit::ITLBPort::recvTimingResp(PacketPtr pkt)
+{
+ Addr line M5_VAR_USED = pkt->req->getPaddr();
+ DPRINTF(GPUTLB, "CU%d: ITLBPort received %#x->%#x\n",
+ computeUnit->cu_id, pkt->req->getVaddr(), line);
+
+ assert(pkt->senderState);
+
+ // pop off the TLB translation state
+ TheISA::GpuTLB::TranslationState *translation_state =
+ safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
+
+ bool success = translation_state->tlbEntry->valid;
+ delete translation_state->tlbEntry;
+ assert(!translation_state->ports.size());
+ pkt->senderState = translation_state->saved;
+ delete translation_state;
+
+ // use the original sender state to know how to close this transaction
+ ITLBPort::SenderState *sender_state =
+ safe_cast<ITLBPort::SenderState*>(pkt->senderState);
+
+ // get the wavefront associated with this translation request
+ Wavefront *wavefront = sender_state->wavefront;
+ delete pkt->senderState;
+
+ if (success) {
+ // pkt is reused in fetch(), don't delete it here. However, we must
+ // reset the command to be a request so that it can be sent through
+ // the cu's master port
+ assert(pkt->cmd == MemCmd::ReadResp);
+ pkt->cmd = MemCmd::ReadReq;
+
+ computeUnit->fetchStage.fetch(pkt, wavefront);
+ } else {
+ if (wavefront->dropFetch) {
+ assert(wavefront->instructionBuffer.empty());
+ wavefront->dropFetch = false;
+ }
+
+ wavefront->pendingFetch = 0;
+ }
+
+ return true;
+}
+
+/*
+ * The initial translation request could have been rejected, if
+ * <retries> queue is not empty. Retry sending the translation
+ * request. sendRetry() is called from the peer port whenever
+ * a translation completes.
+ */
+void
+ComputeUnit::ITLBPort::recvReqRetry()
+{
+
+ int len = retries.size();
+ DPRINTF(GPUTLB, "CU%d: ITLB recvReqRetry - %d pending requests\n",
+ computeUnit->cu_id, len);
+
+ assert(len > 0);
+ assert(isStalled());
+
+ // recvReqRetry is an indication that the resource on which this
+ // port was stalling on is freed. So, remove the stall first
+ unstallPort();
+
+ for (int i = 0; i < len; ++i) {
+ PacketPtr pkt = retries.front();
+ Addr vaddr M5_VAR_USED = pkt->req->getVaddr();
+ DPRINTF(GPUTLB, "CU%d: retrying I-translation for address %#x",
+ computeUnit->cu_id, vaddr);
+
+ if (!sendTimingReq(pkt)) {
+ stallPort(); // Stall port
+ DPRINTF(GPUTLB, ": failed again\n");
+ break;
+ } else {
+ DPRINTF(GPUTLB, ": successful\n");
+ retries.pop_front();
+ }
+ }
+}
+
+void
+ComputeUnit::regStats()
+{
+ tlbCycles
+ .name(name() + ".tlb_cycles")
+ .desc("total number of cycles for all uncoalesced requests")
+ ;
+
+ tlbRequests
+ .name(name() + ".tlb_requests")
+ .desc("number of uncoalesced requests")
+ ;
+
+ tlbLatency
+ .name(name() + ".avg_translation_latency")
+ .desc("Avg. translation latency for data translations")
+ ;
+
+ tlbLatency = tlbCycles / tlbRequests;
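+ // tlbCycles accumulates (completion tick - issue tick): curTick() is
+ // subtracted when a translation is sent (sendRequest) and added back
+ // when its response arrives (DTLBPort::recvTimingResp), so the ratio
+ // above is the average per-request translation latency in ticks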
+
+ hitsPerTLBLevel
+ .init(4)
+ .name(name() + ".TLB_hits_distribution")
+ .desc("TLB hits distribution (0 for page table, x for Lx-TLB)")
+ ;
+
+ // fixed number of TLB levels
+ for (int i = 0; i < 4; ++i) {
+ if (!i)
+ hitsPerTLBLevel.subname(i,"page_table");
+ else
+ hitsPerTLBLevel.subname(i, csprintf("L%d_TLB",i));
+ }
+
+ execRateDist
+ .init(0, 10, 2)
+ .name(name() + ".inst_exec_rate")
+ .desc("Instruction Execution Rate: Number of executed vector "
+ "instructions per cycle")
+ ;
+
+ ldsBankConflictDist
+ .init(0, VSZ, 2)
+ .name(name() + ".lds_bank_conflicts")
+ .desc("Number of bank conflicts per LDS memory packet")
+ ;
+
+ ldsBankAccesses
+ .name(name() + ".lds_bank_access_cnt")
+ .desc("Total number of LDS bank accesses")
+ ;
+
+ pageDivergenceDist
+ // A wavefront can touch 1 to VSZ pages per memory instruction.
+ // The number of pages per bin can be configured (here it's 4).
+ .init(1, VSZ, 4)
+ .name(name() + ".page_divergence_dist")
+ .desc("pages touched per wf (over all mem. instr.)")
+ ;
+
+ controlFlowDivergenceDist
+ .init(1, VSZ, 4)
+ .name(name() + ".warp_execution_dist")
+ .desc("number of lanes active per instruction (over all instructions)")
+ ;
+
+ activeLanesPerGMemInstrDist
+ .init(1, VSZ, 4)
+ .name(name() + ".gmem_lanes_execution_dist")
+ .desc("number of active lanes per global memory instruction")
+ ;
+
+ activeLanesPerLMemInstrDist
+ .init(1, VSZ, 4)
+ .name(name() + ".lmem_lanes_execution_dist")
+ .desc("number of active lanes per local memory instruction")
+ ;
+
+ numInstrExecuted
+ .name(name() + ".num_instr_executed")
+ .desc("number of instructions executed")
+ ;
+
+ numVecOpsExecuted
+ .name(name() + ".num_vec_ops_executed")
+ .desc("number of vec ops executed (e.g. VSZ/inst)")
+ ;
+
+ totalCycles
+ .name(name() + ".num_total_cycles")
+ .desc("number of cycles the CU ran for")
+ ;
+
+ ipc
+ .name(name() + ".ipc")
+ .desc("Instructions per cycle (this CU only)")
+ ;
+
+ vpc
+ .name(name() + ".vpc")
+ .desc("Vector Operations per cycle (this CU only)")
+ ;
+
+ numALUInstsExecuted
+ .name(name() + ".num_alu_insts_executed")
+ .desc("Number of dynamic non-GM memory insts executed")
+ ;
+
+ wgBlockedDueLdsAllocation
+ .name(name() + ".wg_blocked_due_lds_alloc")
+ .desc("Workgroup blocked due to LDS capacity")
+ ;
+
+ ipc = numInstrExecuted / totalCycles;
+ vpc = numVecOpsExecuted / totalCycles;
+
+ numTimesWgBlockedDueVgprAlloc
+ .name(name() + ".times_wg_blocked_due_vgpr_alloc")
+ .desc("Number of times WGs are blocked due to VGPR allocation per SIMD")
+ ;
+
+ dynamicGMemInstrCnt
+ .name(name() + ".global_mem_instr_cnt")
+ .desc("dynamic global memory instructions count")
+ ;
+
+ dynamicLMemInstrCnt
+ .name(name() + ".local_mem_instr_cnt")
+ .desc("dynamic local memory instruction count")
+ ;
+
+ numALUInstsExecuted = numInstrExecuted - dynamicGMemInstrCnt -
+ dynamicLMemInstrCnt;
+
+ completedWfs
+ .name(name() + ".num_completed_wfs")
+ .desc("number of completed wavefronts")
+ ;
+
+ numCASOps
+ .name(name() + ".num_CAS_ops")
+ .desc("number of compare and swap operations")
+ ;
+
+ numFailedCASOps
+ .name(name() + ".num_failed_CAS_ops")
+ .desc("number of compare and swap operations that failed")
+ ;
+
+ // register stats of pipeline stages
+ fetchStage.regStats();
+ scoreboardCheckStage.regStats();
+ scheduleStage.regStats();
+ execStage.regStats();
+
+ // register stats of memory pipeline
+ globalMemoryPipe.regStats();
+ localMemoryPipe.regStats();
+}
+
+void
+ComputeUnit::updatePageDivergenceDist(Addr addr)
+{
+ Addr virt_page_addr = roundDown(addr, TheISA::PageBytes);
+
+ if (!pagesTouched.count(virt_page_addr))
+ pagesTouched[virt_page_addr] = 1;
+ else
+ pagesTouched[virt_page_addr]++;
+}
+
+void
+ComputeUnit::CUExitCallback::process()
+{
+ if (computeUnit->countPages) {
+ std::ostream *page_stat_file =
+ simout.create(computeUnit->name().c_str());
+
+ *page_stat_file << "page, wavefront accesses, workitem accesses" <<
+ std::endl;
+
+ for (auto iter : computeUnit->pageAccesses) {
+ *page_stat_file << std::hex << iter.first << ",";
+ *page_stat_file << std::dec << iter.second.first << ",";
+ *page_stat_file << std::dec << iter.second.second << std::endl;
+ }
+ }
+}
+
+bool
+ComputeUnit::isDone() const
+{
+ for (int i = 0; i < numSIMDs; ++i) {
+ if (!isSimdDone(i)) {
+ return false;
+ }
+ }
+
+ bool glbMemBusRdy = true;
+ for (int j = 0; j < numGlbMemUnits; ++j) {
+ glbMemBusRdy &= vrfToGlobalMemPipeBus[j].rdy();
+ }
+ bool locMemBusRdy = true;
+ for (int j = 0; j < numLocMemUnits; ++j) {
+ locMemBusRdy &= vrfToLocalMemPipeBus[j].rdy();
+ }
+
+ if (!globalMemoryPipe.isGMLdRespFIFOWrRdy() ||
+ !globalMemoryPipe.isGMStRespFIFOWrRdy() ||
+ !globalMemoryPipe.isGMReqFIFOWrRdy() || !localMemoryPipe.isLMReqFIFOWrRdy()
+ || !localMemoryPipe.isLMRespFIFOWrRdy() || !locMemToVrfBus.rdy() ||
+ !glbMemToVrfBus.rdy() || !locMemBusRdy || !glbMemBusRdy) {
+ return false;
+ }
+
+ return true;
+}
+
+int32_t
+ComputeUnit::getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const
+{
+ return lds.getRefCounter(dispatchId, wgId);
+}
+
+bool
+ComputeUnit::isSimdDone(uint32_t simdId) const
+{
+ assert(simdId < numSIMDs);
+
+ for (int i=0; i < numGlbMemUnits; ++i) {
+ if (!vrfToGlobalMemPipeBus[i].rdy())
+ return false;
+ }
+ for (int i=0; i < numLocMemUnits; ++i) {
+ if (!vrfToLocalMemPipeBus[i].rdy())
+ return false;
+ }
+ if (!aluPipe[simdId].rdy()) {
+ return false;
+ }
+
+ for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf){
+ if (wfList[simdId][i_wf]->status != Wavefront::S_STOPPED) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/**
+ * send a general request to the LDS
+ * make sure to check the return value: the request might be NACK'd,
+ * and a return of false means the caller needs a backup plan
+ */
+bool
+ComputeUnit::sendToLds(GPUDynInstPtr gpuDynInst)
+{
+ // this is just a request to carry the GPUDynInstPtr
+ // back and forth
+ Request *newRequest = new Request();
+ newRequest->setPaddr(0x0);
+
+ // ReadReq is not evaluted by the LDS but the Packet ctor requires this
+ PacketPtr newPacket = new Packet(newRequest, MemCmd::ReadReq);
+
+ // This is the SenderState needed upon return
+ newPacket->senderState = new LDSPort::SenderState(gpuDynInst);
+
+ return ldsPort->sendTimingReq(newPacket);
+}
+
+/**
+ * get the result of packets sent to the LDS when they return
+ */
+bool
+ComputeUnit::LDSPort::recvTimingResp(PacketPtr packet)
+{
+ const ComputeUnit::LDSPort::SenderState *senderState =
+ dynamic_cast<ComputeUnit::LDSPort::SenderState *>(packet->senderState);
+
+ fatal_if(!senderState, "did not get the right sort of sender state");
+
+ GPUDynInstPtr gpuDynInst = senderState->getMemInst();
+
+ delete packet->senderState;
+ delete packet->req;
+ delete packet;
+
+ computeUnit->localMemoryPipe.getLMRespFIFO().push(gpuDynInst);
+ return true;
+}
+
+/**
+ * attempt to send this packet, either the port is already stalled, the request
+ * is nack'd and must stall or the request goes through
+ * when a request cannot be sent, add it to the retries queue
+ */
+bool
+ComputeUnit::LDSPort::sendTimingReq(PacketPtr pkt)
+{
+ ComputeUnit::LDSPort::SenderState *sender_state =
+ dynamic_cast<ComputeUnit::LDSPort::SenderState*>(pkt->senderState);
+ fatal_if(!sender_state, "packet without a valid sender state");
+
+ GPUDynInstPtr gpuDynInst M5_VAR_USED = sender_state->getMemInst();
+
+ if (isStalled()) {
+ fatal_if(retries.empty(), "must have retries waiting to be stalled");
+
+ retries.push(pkt);
+
+ DPRINTF(GPUPort, "CU%d: WF[%d][%d]: LDS send failed!\n",
+ computeUnit->cu_id, gpuDynInst->simdId,
+ gpuDynInst->wfSlotId);
+ return false;
+ } else if (!MasterPort::sendTimingReq(pkt)) {
+ // need to stall the LDS port until a recvReqRetry() is received
+ // this indicates that there is more space
+ stallPort();
+ retries.push(pkt);
+
+ DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req failed!\n",
+ computeUnit->cu_id, gpuDynInst->simdId,
+ gpuDynInst->wfSlotId, pkt->req->getPaddr());
+ return false;
+ } else {
+ DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req sent!\n",
+ computeUnit->cu_id, gpuDynInst->simdId,
+ gpuDynInst->wfSlotId, pkt->req->getPaddr());
+ return true;
+ }
+}
+
+/**
+ * the bus is telling the port that there is now space so retrying stalled
+ * requests should work now
+ * this allows the port to have a request be nack'd and then have the receiver
+ * say when there is space, rather than simply retrying the send every cycle
+ */
+void
+ComputeUnit::LDSPort::recvReqRetry()
+{
+ auto queueSize = retries.size();
+
+ DPRINTF(GPUPort, "CU%d: LDSPort recvReqRetry - %d pending requests\n",
+ computeUnit->cu_id, queueSize);
+
+ fatal_if(queueSize < 1,
+ "why was there a recvReqRetry() with no pending reqs?");
+ fatal_if(!isStalled(),
+ "recvReqRetry() happened when the port was not stalled");
+
+ unstallPort();
+
+ while (!retries.empty()) {
+ PacketPtr packet = retries.front();
+
+ DPRINTF(GPUPort, "CU%d: retrying LDS send\n", computeUnit->cu_id);
+
+ if (!MasterPort::sendTimingReq(packet)) {
+ // Stall port
+ stallPort();
+ DPRINTF(GPUPort, ": LDS send failed again\n");
+ break;
+ } else {
+ DPRINTF(GPUPort, ": LDS send successful\n");
+ retries.pop();
+ }
+ }
+}
diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh
new file mode 100644
index 000000000..f47c27a0a
--- /dev/null
+++ b/src/gpu-compute/compute_unit.hh
@@ -0,0 +1,767 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos, Anthony Gutierrez
+ */
+
+#ifndef __COMPUTE_UNIT_HH__
+#define __COMPUTE_UNIT_HH__
+
+#include <deque>
+#include <map>
+#include <unordered_map>
+#include <vector>
+
+#include "base/callback.hh"
+#include "base/statistics.hh"
+#include "base/types.hh"
+#include "enums/PrefetchType.hh"
+#include "gpu-compute/exec_stage.hh"
+#include "gpu-compute/fetch_stage.hh"
+#include "gpu-compute/global_memory_pipeline.hh"
+#include "gpu-compute/local_memory_pipeline.hh"
+#include "gpu-compute/qstruct.hh"
+#include "gpu-compute/schedule_stage.hh"
+#include "gpu-compute/scoreboard_check_stage.hh"
+#include "mem/mem_object.hh"
+#include "mem/port.hh"
+
+static const int MAX_REGS_FOR_NON_VEC_MEM_INST = 1;
+static const int MAX_WIDTH_FOR_MEM_INST = 32;
+
+class NDRange;
+class Shader;
+class VectorRegisterFile;
+
+struct ComputeUnitParams;
+
+enum EXEC_POLICY
+{
+ OLDEST = 0,
+ RR
+};
+
+// List of execution units
+enum EXEC_UNIT
+{
+ SIMD0 = 0,
+ SIMD1,
+ SIMD2,
+ SIMD3,
+ GLBMEM_PIPE,
+ LDSMEM_PIPE,
+ NUM_UNITS
+};
+
+enum TLB_CACHE
+{
+ TLB_MISS_CACHE_MISS = 0,
+ TLB_MISS_CACHE_HIT,
+ TLB_HIT_CACHE_MISS,
+ TLB_HIT_CACHE_HIT
+};
+
+class ComputeUnit : public MemObject
+{
+ public:
+ FetchStage fetchStage;
+ ScoreboardCheckStage scoreboardCheckStage;
+ ScheduleStage scheduleStage;
+ ExecStage execStage;
+ GlobalMemPipeline globalMemoryPipe;
+ LocalMemPipeline localMemoryPipe;
+
+ // Buffers used to communicate between various pipeline stages
+
+ // List of waves which are ready to be scheduled.
+ // Each execution resource has a ready list. readyList is
+ // used to communicate between scoreboardCheck stage and
+ // schedule stage
+ // TODO: make enum to index readyList
+ std::vector<std::vector<Wavefront*>> readyList;
+
+ // Stores the status of waves. A READY implies the
+ // wave is ready to be scheduled this cycle and
+ // is already present in the readyList. waveStatusList is
+ // used to communicate between scoreboardCheck stage and
+ // schedule stage
+ // TODO: convert std::pair to a class to increase readability
+ std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>> waveStatusList;
+
+ // List of waves which will be dispatched to
+ // each execution resource. A FILLED implies
+ // dispatch list is non-empty and
+ // execution unit has something to execute
+ // this cycle. Currently, the dispatch list of
+ // an execution resource can hold only one wave because
+ // an execution resource can execute only one wave in a cycle.
+ // dispatchList is used to communicate between schedule
+ // and exec stage
+ // TODO: convert std::pair to a class to increase readability
+ std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> dispatchList;
+
+ int rrNextMemID; // used by the RR WF exec policy to cycle through WFs
+ int rrNextALUWp;
+ typedef ComputeUnitParams Params;
+ std::vector<std::vector<Wavefront*>> wfList;
+ int cu_id;
+
+ // array of vector register files, one per SIMD
+ std::vector<VectorRegisterFile*> vrf;
+ // Number of vector ALU units (SIMDs) in CU
+ int numSIMDs;
+ // number of pipe stages for bypassing data to next dependent single
+ // precision vector instruction inside the vector ALU pipeline
+ int spBypassPipeLength;
+ // number of pipe stages for bypassing data to next dependent double
+ // precision vector instruction inside the vector ALU pipeline
+ int dpBypassPipeLength;
+ // number of cycles per issue period
+ int issuePeriod;
+
+ // Number of global and local memory execution resources in CU
+ int numGlbMemUnits;
+ int numLocMemUnits;
+ // tracks the last cycle a vector instruction was executed on a SIMD
+ std::vector<uint64_t> lastExecCycle;
+
+ // true if we allow a separate TLB per lane
+ bool perLaneTLB;
+ // if 0, TLB prefetching is off.
+ int prefetchDepth;
+ // if fixed-stride prefetching, this is the stride.
+ int prefetchStride;
+
+ class LastVaddrWave
+ {
+ public:
+ Addr vaddrs[VSZ];
+ Addr& operator[](int idx) {
+ return vaddrs[idx];
+ }
+
+ LastVaddrWave() {
+ for (int i = 0; i < VSZ; ++i)
+ vaddrs[i] = 0;
+ }
+ };
+
+ LastVaddrWave lastVaddrCU;
+ std::vector<LastVaddrWave> lastVaddrPhase;
+ std::vector<std::vector<std::vector<Addr>>> lastVaddrWF;
+ Enums::PrefetchType prefetchType;
+ EXEC_POLICY exec_policy;
+
+ bool xact_cas_mode;
+ bool debugSegFault;
+ bool functionalTLB;
+ bool localMemBarrier;
+
+ /*
+ * for counting page accesses
+ *
+ * cuExitCallback inherits from Callback. When a callback is registered
+ * as an exit callback, it is added to an exit callback queue, and on
+ * simulation exit every callback in that queue has its process()
+ * function called.
+ */
+ bool countPages;
+
+ Shader *shader;
+ uint32_t barrier_id;
+ // vector of Vector ALU (MACC) pipelines
+ std::vector<WaitClass> aluPipe;
+ // minimum issue period per SIMD unit (in cycles)
+ std::vector<WaitClass> wfWait;
+
+ // Resource control for Vector Register File->Global Memory pipe buses
+ std::vector<WaitClass> vrfToGlobalMemPipeBus;
+ // Resource control for Vector Register File->Local Memory pipe buses
+ std::vector<WaitClass> vrfToLocalMemPipeBus;
+ int nextGlbMemBus;
+ int nextLocMemBus;
+ // Resource control for global memory to VRF data/address bus
+ WaitClass glbMemToVrfBus;
+ // Resource control for local memory to VRF data/address bus
+ WaitClass locMemToVrfBus;
+
+ uint32_t vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes
+ uint32_t coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes
+ uint32_t numCyclesPerStoreTransfer; // number of cycles per vector store
+ uint32_t numCyclesPerLoadTransfer; // number of cycles per vector load
+
+ Tick req_tick_latency;
+ Tick resp_tick_latency;
+
+ // number of vector registers being reserved for each SIMD unit
+ std::vector<int> vectorRegsReserved;
+ // number of vector registers per SIMD unit
+ uint32_t numVecRegsPerSimd;
+ // Support for scheduling VGPR status update events
+ std::vector<std::pair<uint32_t, uint32_t> > regIdxVec;
+ std::vector<uint64_t> timestampVec;
+ std::vector<uint8_t> statusVec;
+
+ void
+ registerEvent(uint32_t simdId,
+ uint32_t regIdx,
+ uint32_t operandSize,
+ uint64_t when,
+ uint8_t newStatus) {
+ regIdxVec.push_back(std::make_pair(simdId, regIdx));
+ timestampVec.push_back(when);
+ statusVec.push_back(newStatus);
+ if (operandSize > 4) {
+ regIdxVec.push_back(std::make_pair(simdId,
+ ((regIdx + 1) %
+ numVecRegsPerSimd)));
+ timestampVec.push_back(when);
+ statusVec.push_back(newStatus);
+ }
+ }
+
+ void updateEvents();
+
+ // this hash map will keep track of page divergence
+ // per memory instruction per wavefront. The hash map
+ // is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc.
+ std::map<Addr, int> pagesTouched;
+
+ ComputeUnit(const Params *p);
+ ~ComputeUnit();
+ int spBypassLength() { return spBypassPipeLength; };
+ int dpBypassLength() { return dpBypassPipeLength; };
+ int storeBusLength() { return numCyclesPerStoreTransfer; };
+ int loadBusLength() { return numCyclesPerLoadTransfer; };
+ int wfSize() const { return wavefrontSize; };
+
+ void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
+ void exec();
+ void initiateFetch(Wavefront *wavefront);
+ void fetch(PacketPtr pkt, Wavefront *wavefront);
+ void FillKernelState(Wavefront *w, NDRange *ndr);
+
+ void StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
+ int trueWgSizeTotal);
+
+ void InitializeWFContext(WFContext *wfCtx, NDRange *ndr, int cnt,
+ int trueWgSize[], int trueWgSizeTotal,
+ LdsChunk *ldsChunk, uint64_t origSpillMemStart);
+
+ void StartWorkgroup(NDRange *ndr);
+ int ReadyWorkgroup(NDRange *ndr);
+
+ bool isVecAlu(int unitId) { return unitId >= SIMD0 && unitId <= SIMD3; }
+ bool isGlbMem(int unitId) { return unitId == GLBMEM_PIPE; }
+ bool isShrMem(int unitId) { return unitId == LDSMEM_PIPE; }
+ int GlbMemUnitId() { return GLBMEM_PIPE; }
+ int ShrMemUnitId() { return LDSMEM_PIPE; }
+ int nextGlbRdBus() { return (++nextGlbMemBus) % numGlbMemUnits; }
+ int nextLocRdBus() { return (++nextLocMemBus) % numLocMemUnits; }
+ /* This function cycles through all the wavefronts in all the phases to
+ * check whether the wavefronts associated with one barrier (denoted by
+ * _barrier_id) have all reached the same barrier in the program (denoted
+ * by bcnt). It returns true when the number of wavefronts at the barrier
+ * matches bslots.
+ */
+ int AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots);
+ bool cedeSIMD(int simdId, int wfSlotId);
+
+ template<typename c0, typename c1> void doSmReturn(GPUDynInstPtr gpuDynInst);
+ virtual void init();
+ void sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
+ void sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
+ void injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
+ bool kernelLaunch=true,
+ RequestPtr req=nullptr);
+ void handleMemPacket(PacketPtr pkt, int memport_index);
+ bool processTimingPacket(PacketPtr pkt);
+ void processFetchReturn(PacketPtr pkt);
+ void updatePageDivergenceDist(Addr addr);
+
+ MasterID masterId() { return _masterId; }
+
+ bool isDone() const;
+ bool isSimdDone(uint32_t) const;
+
+ protected:
+ MasterID _masterId;
+
+ LdsState &lds;
+
+ public:
+ // the following stats compute the avg. TLB access latency per
+ // uncoalesced request (only for data)
+ Stats::Scalar tlbRequests;
+ Stats::Scalar tlbCycles;
+ Stats::Formula tlbLatency;
+ // hitsPerTLBLevel[x] are the hits in Level x TLB. x = 0 is the page table.
+ Stats::Vector hitsPerTLBLevel;
+
+ Stats::Scalar ldsBankAccesses;
+ Stats::Distribution ldsBankConflictDist;
+
+ // over all memory instructions executed over all wavefronts
+ // how many touched 0-4 pages, 4-8, ..., 60-64 pages
+ Stats::Distribution pageDivergenceDist;
+ Stats::Scalar dynamicGMemInstrCnt;
+ Stats::Scalar dynamicLMemInstrCnt;
+
+ Stats::Scalar wgBlockedDueLdsAllocation;
+ // Number of instructions executed, i.e., this count is incremented by 1
+ // per committed instruction no matter how many lanes (64, 32, or 7) are
+ // active when it commits
+ Stats::Scalar numInstrExecuted;
+ // Number of cycles between successive instruction executions across all
+ // wavefronts of the same CU
+ Stats::Distribution execRateDist;
+ // number of individual vector operations executed
+ Stats::Scalar numVecOpsExecuted;
+ // Total cycles that something is running on the GPU
+ Stats::Scalar totalCycles;
+ Stats::Formula vpc; // vector ops per cycle
+ Stats::Formula ipc; // vector instructions per cycle
+ Stats::Distribution controlFlowDivergenceDist;
+ Stats::Distribution activeLanesPerGMemInstrDist;
+ Stats::Distribution activeLanesPerLMemInstrDist;
+ // number of vector ALU instructions received
+ Stats::Formula numALUInstsExecuted;
+ // number of times a WG can not start due to lack of free VGPRs in SIMDs
+ Stats::Scalar numTimesWgBlockedDueVgprAlloc;
+ Stats::Scalar numCASOps;
+ Stats::Scalar numFailedCASOps;
+ Stats::Scalar completedWfs;
+ // flag per vector SIMD unit that is set when there is at least one
+ // WV that has a vector ALU instruction as the oldest in its
+ // Instruction Buffer: Defined in the Scoreboard stage, consumed
+ // by the Execute stage.
+ std::vector<bool> vectorAluInstAvail;
+ // number of available (oldest) LDS instructions that could have
+ // been issued to the LDS at a specific issue slot
+ int shrMemInstAvail;
+ // number of available Global memory instructions that could have
+ // been issued to TCP at a specific issue slot
+ int glbMemInstAvail;
+
+ void
+ regStats();
+
+ LdsState &
+ getLds() const
+ {
+ return lds;
+ }
+
+ int32_t
+ getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const;
+
+ bool
+ sendToLds(GPUDynInstPtr gpuDynInst) __attribute__((warn_unused_result));
+
+ typedef std::unordered_map<Addr, std::pair<int, int>> pageDataStruct;
+ pageDataStruct pageAccesses;
+
+ class CUExitCallback : public Callback
+ {
+ private:
+ ComputeUnit *computeUnit;
+
+ public:
+ virtual ~CUExitCallback() { }
+
+ CUExitCallback(ComputeUnit *_cu)
+ {
+ computeUnit = _cu;
+ }
+
+ virtual void
+ process();
+ };
+
+ CUExitCallback *cuExitCallback;
+
+ /** Data access Port **/
+ class DataPort : public MasterPort
+ {
+ public:
+ DataPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
+ : MasterPort(_name, _cu), computeUnit(_cu),
+ index(_index) { }
+
+ bool snoopRangeSent;
+
+ struct SenderState : public Packet::SenderState
+ {
+ GPUDynInstPtr _gpuDynInst;
+ int port_index;
+ Packet::SenderState *saved;
+
+ SenderState(GPUDynInstPtr gpuDynInst, PortID _port_index,
+ Packet::SenderState *sender_state=nullptr)
+ : _gpuDynInst(gpuDynInst),
+ port_index(_port_index),
+ saved(sender_state) { }
+ };
+
+ class MemReqEvent : public Event
+ {
+ private:
+ DataPort *dataPort;
+ PacketPtr pkt;
+
+ public:
+ MemReqEvent(DataPort *_data_port, PacketPtr _pkt)
+ : Event(), dataPort(_data_port), pkt(_pkt)
+ {
+ setFlags(Event::AutoDelete);
+ }
+
+ void process();
+ const char *description() const;
+ };
+
+ class MemRespEvent : public Event
+ {
+ private:
+ DataPort *dataPort;
+ PacketPtr pkt;
+
+ public:
+ MemRespEvent(DataPort *_data_port, PacketPtr _pkt)
+ : Event(), dataPort(_data_port), pkt(_pkt)
+ {
+ setFlags(Event::AutoDelete);
+ }
+
+ void process();
+ const char *description() const;
+ };
+
+ std::deque<std::pair<PacketPtr, GPUDynInstPtr>> retries;
+
+ protected:
+ ComputeUnit *computeUnit;
+ int index;
+
+ virtual bool recvTimingResp(PacketPtr pkt);
+ virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
+ virtual void recvFunctional(PacketPtr pkt) { }
+ virtual void recvRangeChange() { }
+ virtual void recvReqRetry();
+
+ virtual void
+ getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
+ {
+ resp.clear();
+ snoop = true;
+ }
+
+ };
+
+ // Instruction cache access port
+ class SQCPort : public MasterPort
+ {
+ public:
+ SQCPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
+ : MasterPort(_name, _cu), computeUnit(_cu),
+ index(_index) { }
+
+ bool snoopRangeSent;
+
+ struct SenderState : public Packet::SenderState
+ {
+ Wavefront *wavefront;
+ Packet::SenderState *saved;
+
+ SenderState(Wavefront *_wavefront, Packet::SenderState
+ *sender_state=nullptr)
+ : wavefront(_wavefront), saved(sender_state) { }
+ };
+
+ std::deque<std::pair<PacketPtr, Wavefront*>> retries;
+
+ protected:
+ ComputeUnit *computeUnit;
+ int index;
+
+ virtual bool recvTimingResp(PacketPtr pkt);
+ virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
+ virtual void recvFunctional(PacketPtr pkt) { }
+ virtual void recvRangeChange() { }
+ virtual void recvReqRetry();
+
+ virtual void
+ getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
+ {
+ resp.clear();
+ snoop = true;
+ }
+ };
+
+ /** Data TLB port **/
+ class DTLBPort : public MasterPort
+ {
+ public:
+ DTLBPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
+ : MasterPort(_name, _cu), computeUnit(_cu),
+ index(_index), stalled(false)
+ { }
+
+ bool isStalled() { return stalled; }
+ void stallPort() { stalled = true; }
+ void unstallPort() { stalled = false; }
+
+ /**
+ * here we queue all the translation requests that were
+ * not successfully sent.
+ */
+ std::deque<PacketPtr> retries;
+
+ /** SenderState is information carried along with the packet
+ * throughout the TLB hierarchy
+ */
+ struct SenderState: public Packet::SenderState
+ {
+ // the memInst that this is associated with
+ GPUDynInstPtr _gpuDynInst;
+
+ // the lane in the memInst this is associated with, so we send
+ // the memory request down the right port
+ int portIndex;
+
+ // constructor used for packets involved in timing accesses
+ SenderState(GPUDynInstPtr gpuDynInst, PortID port_index)
+ : _gpuDynInst(gpuDynInst), portIndex(port_index) { }
+
+ };
+
+ protected:
+ ComputeUnit *computeUnit;
+ int index;
+ bool stalled;
+
+ virtual bool recvTimingResp(PacketPtr pkt);
+ virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
+ virtual void recvFunctional(PacketPtr pkt) { }
+ virtual void recvRangeChange() { }
+ virtual void recvReqRetry();
+ };
+
+ class ITLBPort : public MasterPort
+ {
+ public:
+ ITLBPort(const std::string &_name, ComputeUnit *_cu)
+ : MasterPort(_name, _cu), computeUnit(_cu), stalled(false) { }
+
+
+ bool isStalled() { return stalled; }
+ void stallPort() { stalled = true; }
+ void unstallPort() { stalled = false; }
+
+ /**
+ * here we queue all the translation requests that were
+ * not successfully sent.
+ */
+ std::deque<PacketPtr> retries;
+
+ /** SenderState is information carried along with the packet
+ * throughout the TLB hierarchy
+ */
+ struct SenderState: public Packet::SenderState
+ {
+ // The wavefront associated with this request
+ Wavefront *wavefront;
+
+ SenderState(Wavefront *_wavefront) : wavefront(_wavefront) { }
+ };
+
+ protected:
+ ComputeUnit *computeUnit;
+ bool stalled;
+
+ virtual bool recvTimingResp(PacketPtr pkt);
+ virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
+ virtual void recvFunctional(PacketPtr pkt) { }
+ virtual void recvRangeChange() { }
+ virtual void recvReqRetry();
+ };
+
+ /**
+ * the port intended to communicate between the CU and its LDS
+ */
+ class LDSPort : public MasterPort
+ {
+ public:
+ LDSPort(const std::string &_name, ComputeUnit *_cu, PortID _id)
+ : MasterPort(_name, _cu, _id), computeUnit(_cu)
+ {
+ }
+
+ bool isStalled() const { return stalled; }
+ void stallPort() { stalled = true; }
+ void unstallPort() { stalled = false; }
+
+ /**
+ * here we queue all the requests that were
+ * not successfully sent.
+ */
+ std::queue<PacketPtr> retries;
+
+ /**
+ * SenderState is information carried along with the packet, esp. the
+ * GPUDynInstPtr
+ */
+ class SenderState: public Packet::SenderState
+ {
+ protected:
+ // The actual read/write/atomic request that goes with this command
+ GPUDynInstPtr _gpuDynInst = nullptr;
+
+ public:
+ SenderState(GPUDynInstPtr gpuDynInst):
+ _gpuDynInst(gpuDynInst)
+ {
+ }
+
+ GPUDynInstPtr
+ getMemInst() const
+ {
+ return _gpuDynInst;
+ }
+ };
+
+ virtual bool
+ sendTimingReq(PacketPtr pkt);
+
+ protected:
+
+ bool stalled = false; ///< whether or not it is stalled
+
+ ComputeUnit *computeUnit;
+
+ virtual bool
+ recvTimingResp(PacketPtr pkt);
+
+ virtual Tick
+ recvAtomic(PacketPtr pkt) { return 0; }
+
+ virtual void
+ recvFunctional(PacketPtr pkt)
+ {
+ }
+
+ virtual void
+ recvRangeChange()
+ {
+ }
+
+ virtual void
+ recvReqRetry();
+ };
+
+ /** The port to access the Local Data Store
+ * Can be connected to a LDS object
+ */
+ LDSPort *ldsPort = nullptr;
+
+ LDSPort *
+ getLdsPort() const
+ {
+ return ldsPort;
+ }
+
+ /** The memory port for SIMD data accesses.
+ * Can be connected to PhysMem for Ruby for timing simulations
+ */
+ std::vector<DataPort*> memPort;
+ // port to the TLB hierarchy (i.e., the L1 TLB)
+ std::vector<DTLBPort*> tlbPort;
+ // port to the SQC (i.e. the I-cache)
+ SQCPort *sqcPort;
+ // port to the SQC TLB (there's a separate TLB for each I-cache)
+ ITLBPort *sqcTLBPort;
+
+ virtual BaseMasterPort&
+ getMasterPort(const std::string &if_name, PortID idx)
+ {
+ if (if_name == "memory_port") {
+ memPort[idx] = new DataPort(csprintf("%s-port%d", name(), idx),
+ this, idx);
+ return *memPort[idx];
+ } else if (if_name == "translation_port") {
+ tlbPort[idx] = new DTLBPort(csprintf("%s-port%d", name(), idx),
+ this, idx);
+ return *tlbPort[idx];
+ } else if (if_name == "sqc_port") {
+ sqcPort = new SQCPort(csprintf("%s-port%d", name(), idx),
+ this, idx);
+ return *sqcPort;
+ } else if (if_name == "sqc_tlb_port") {
+ sqcTLBPort = new ITLBPort(csprintf("%s-port", name()), this);
+ return *sqcTLBPort;
+ } else if (if_name == "ldsPort") {
+ if (ldsPort) {
+ fatal("an LDS port was already allocated");
+ }
+ ldsPort = new LDSPort(csprintf("%s-port", name()), this, idx);
+ return *ldsPort;
+ } else {
+ panic("incorrect port name");
+ }
+ }
+
+ // xact_cas_load()
+ class waveIdentifier
+ {
+ public:
+ waveIdentifier() { }
+ waveIdentifier(int _simdId, int _wfSlotId)
+ : simdId(_simdId), wfSlotId(_wfSlotId) { }
+
+ int simdId;
+ int wfSlotId;
+ };
+
+ class waveQueue
+ {
+ public:
+ std::list<waveIdentifier> waveIDQueue;
+ };
+ std::map<unsigned, waveQueue> xactCasLoadMap;
+
+ uint64_t getAndIncSeqNum() { return globalSeqNum++; }
+
+ private:
+ uint64_t globalSeqNum;
+ int wavefrontSize;
+};
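+
+// A minimal illustrative sketch (not used anywhere in the model): the
+// registerEvent() member above books one VGPR status update per 32-bit
+// slot of an operand, so a >4-byte (e.g. double-precision) operand
+// reserves two consecutive registers, with the second index wrapping at
+// the per-SIMD register file size. The helper name below is hypothetical
+// and only restates that indexing rule.
+inline std::vector<uint32_t>
+vgprSlotsForOperand(uint32_t regIdx, uint32_t operandSize,
+                    uint32_t numVecRegsPerSimd)
+{
+    std::vector<uint32_t> slots{regIdx};
+    if (operandSize > 4)
+        slots.push_back((regIdx + 1) % numVecRegsPerSimd);
+    return slots;
+}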
+
+#endif // __COMPUTE_UNIT_HH__
diff --git a/src/gpu-compute/condition_register_state.cc b/src/gpu-compute/condition_register_state.cc
new file mode 100644
index 000000000..f3f2d2927
--- /dev/null
+++ b/src/gpu-compute/condition_register_state.cc
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos
+ */
+
+#include "gpu-compute/condition_register_state.hh"
+
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/gpu_static_inst.hh"
+#include "gpu-compute/shader.hh"
+#include "gpu-compute/wavefront.hh"
+
+ConditionRegisterState::ConditionRegisterState()
+{
+ computeUnit = nullptr;
+ c_reg.clear();
+ busy.clear();
+}
+
+void
+ConditionRegisterState::setParent(ComputeUnit *_computeUnit)
+{
+ computeUnit = _computeUnit;
+ _name = computeUnit->name() + ".CondRegState";
+}
+
+void
+ConditionRegisterState::init(uint32_t _size)
+{
+ c_reg.resize(_size);
+ busy.resize(_size, 0);
+}
+
+void
+ConditionRegisterState::exec(GPUStaticInst *ii, Wavefront *w)
+{
+ // iterate over all operands
+ for (auto i = 0; i < ii->getNumOperands(); ++i) {
+ // is this a condition register destination operand?
+ if (ii->isCondRegister(i) && ii->isDstOperand(i)) {
+ // mark the register as busy
+ markReg(ii->getRegisterIndex(i), 1);
+ uint32_t pipeLen = w->computeUnit->spBypassLength();
+
+ // schedule an event for marking the register as ready
+ w->computeUnit->
+ registerEvent(w->simdId, ii->getRegisterIndex(i),
+ ii->getOperandSize(i),
+ w->computeUnit->shader->tick_cnt +
+ w->computeUnit->shader->ticks(pipeLen), 0);
+ }
+ }
+}
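+
+// Worked example of the timing above (the numbers are illustrative only):
+// with spBypassLength() == 4 and a 1ns shader clock, a condition-register
+// destination is marked busy here and registerEvent() schedules it to be
+// marked ready again 4ns after the current shader tick_cnt.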
diff --git a/src/gpu-compute/condition_register_state.hh b/src/gpu-compute/condition_register_state.hh
new file mode 100644
index 000000000..139874a66
--- /dev/null
+++ b/src/gpu-compute/condition_register_state.hh
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos
+ */
+
+#ifndef __CONDITION_REGISTER_STATE_HH__
+#define __CONDITION_REGISTER_STATE_HH__
+
+#include <string>
+#include <vector>
+
+#include "gpu-compute/misc.hh"
+
+class ComputeUnit;
+class GPUStaticInst;
+class Shader;
+class Wavefront;
+
+// Condition Register State (used only when executing HSAIL)
+class ConditionRegisterState
+{
+ public:
+ ConditionRegisterState();
+ void init(uint32_t _size);
+ const std::string name() const { return _name; }
+ void setParent(ComputeUnit *_computeUnit);
+ void regStats() { }
+
+ template<typename T>
+ T
+ read(int regIdx, int threadId)
+ {
+ bool tmp = c_reg[regIdx][threadId];
+ T *p0 = (T*)(&tmp);
+
+ return *p0;
+ }
+
+ template<typename T>
+ void
+ write(int regIdx, int threadId, T value)
+ {
+ c_reg[regIdx][threadId] = (bool)(value & 0x01);
+ }
+
+ void
+ markReg(int regIdx, uint8_t value)
+ {
+ busy.at(regIdx) = value;
+ }
+
+ uint8_t
+ regBusy(int idx)
+ {
+ uint8_t status = busy.at(idx);
+ return status;
+ }
+
+ int numRegs() { return c_reg.size(); }
+ void exec(GPUStaticInst *ii, Wavefront *w);
+
+ private:
+ ComputeUnit* computeUnit;
+ std::string _name;
+ // Condition Register state
+ std::vector<VectorMask> c_reg;
+ // flag indicating if a register is busy
+ std::vector<uint8_t> busy;
+};
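+
+// A minimal usage sketch, purely illustrative (the register index, lane id
+// and sizes below are made up): a producer writes a per-lane predicate and
+// a consumer checks the busy flag before reading it back.
+//
+//     ConditionRegisterState crs;
+//     crs.init(8);                        // 8 condition registers
+//     crs.write<uint32_t>(2, lane, 1);    // set $c2 for this lane
+//     if (!crs.regBusy(2)) {
+//         bool taken = crs.read<bool>(2, lane);
+//     }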
+
+#endif
diff --git a/src/gpu-compute/dispatcher.cc b/src/gpu-compute/dispatcher.cc
new file mode 100644
index 000000000..55e4be72a
--- /dev/null
+++ b/src/gpu-compute/dispatcher.cc
@@ -0,0 +1,394 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Brad Beckmann, Marc Orr
+ */
+
+
+#include "gpu-compute/dispatcher.hh"
+
+#include "cpu/base.hh"
+#include "debug/GPUDisp.hh"
+#include "gpu-compute/cl_driver.hh"
+#include "gpu-compute/cl_event.hh"
+#include "gpu-compute/shader.hh"
+#include "gpu-compute/wavefront.hh"
+#include "mem/packet_access.hh"
+
+GpuDispatcher *GpuDispatcher::instance = nullptr;
+
+GpuDispatcher::GpuDispatcher(const Params *p)
+ : DmaDevice(p), _masterId(p->system->getMasterId(name() + ".disp")),
+ pioAddr(p->pio_addr), pioSize(4096), pioDelay(p->pio_latency),
+ dispatchCount(0), dispatchActive(false), cpu(p->cpu),
+ shader(p->shader_pointer), driver(p->cl_driver), tickEvent(this)
+{
+ shader->handshake(this);
+ driver->handshake(this);
+
+ ndRange.wg_disp_rem = false;
+ ndRange.globalWgId = 0;
+
+ schedule(&tickEvent, 0);
+
+ // translation port for the dispatcher
+ tlbPort = new TLBPort(csprintf("%s-port", name()), this);
+
+ num_kernelLaunched
+ .name(name() + ".num_kernel_launched")
+ .desc("number of kernel launched")
+ ;
+}
+
+GpuDispatcher *GpuDispatcherParams::create()
+{
+ GpuDispatcher *dispatcher = new GpuDispatcher(this);
+ GpuDispatcher::setInstance(dispatcher);
+
+ return GpuDispatcher::getInstance();
+}
+
+void
+GpuDispatcher::serialize(CheckpointOut &cp) const
+{
+ Tick event_tick = 0;
+
+ if (ndRange.wg_disp_rem)
+ fatal("Checkpointing not supported during active workgroup execution");
+
+ if (tickEvent.scheduled())
+ event_tick = tickEvent.when();
+
+ SERIALIZE_SCALAR(event_tick);
+
+}
+
+void
+GpuDispatcher::unserialize(CheckpointIn &cp)
+{
+ Tick event_tick;
+
+ if (tickEvent.scheduled())
+ deschedule(&tickEvent);
+
+ UNSERIALIZE_SCALAR(event_tick);
+
+ if (event_tick)
+ schedule(&tickEvent, event_tick);
+}
+
+AddrRangeList
+GpuDispatcher::getAddrRanges() const
+{
+ AddrRangeList ranges;
+
+ DPRINTF(GPUDisp, "dispatcher registering addr range at %#x size %#x\n",
+ pioAddr, pioSize);
+
+ ranges.push_back(RangeSize(pioAddr, pioSize));
+
+ return ranges;
+}
+
+Tick
+GpuDispatcher::read(PacketPtr pkt)
+{
+ assert(pkt->getAddr() >= pioAddr);
+ assert(pkt->getAddr() < pioAddr + pioSize);
+
+ int offset = pkt->getAddr() - pioAddr;
+ pkt->allocate();
+
+ DPRINTF(GPUDisp, " read register %#x size=%d\n", offset, pkt->getSize());
+
+ if (offset < 8) {
+ assert(!offset);
+ assert(pkt->getSize() == 8);
+
+ uint64_t retval = dispatchActive;
+ pkt->set(retval);
+ } else {
+ offset -= 8;
+ assert(offset + pkt->getSize() < sizeof(HsaQueueEntry));
+ char *curTaskPtr = (char*)&curTask;
+
+ memcpy(pkt->getPtr<const void*>(), curTaskPtr + offset, pkt->getSize());
+ }
+
+ pkt->makeAtomicResponse();
+
+ return pioDelay;
+}
+
+Tick
+GpuDispatcher::write(PacketPtr pkt)
+{
+ assert(pkt->getAddr() >= pioAddr);
+ assert(pkt->getAddr() < pioAddr + pioSize);
+
+ int offset = pkt->getAddr() - pioAddr;
+
+#if TRACING_ON
+ uint64_t data_val = 0;
+
+ switch (pkt->getSize()) {
+ case 1:
+ data_val = pkt->get<uint8_t>();
+ break;
+ case 2:
+ data_val = pkt->get<uint16_t>();
+ break;
+ case 4:
+ data_val = pkt->get<uint32_t>();
+ break;
+ case 8:
+ data_val = pkt->get<uint64_t>();
+ break;
+ default:
+ DPRINTF(GPUDisp, "bad size %d\n", pkt->getSize());
+ }
+
+ DPRINTF(GPUDisp, "write register %#x value %#x size=%d\n", offset, data_val,
+ pkt->getSize());
+#endif
+ if (!offset) {
+ static int nextId = 0;
+
+ // The depends field of the qstruct, which was previously unused, is
+ // used to communicate with the simulated application.
+ if (curTask.depends) {
+ HostState hs;
+ shader->ReadMem((uint64_t)(curTask.depends), &hs,
+ sizeof(HostState), 0);
+
+ // update event start time (in nano-seconds)
+ uint64_t start = curTick() / 1000;
+
+ shader->WriteMem((uint64_t)(&((_cl_event*)hs.event)->start),
+ &start, sizeof(uint64_t), 0);
+ }
+
+ // launch kernel
+ ++num_kernelLaunched;
+
+ NDRange *ndr = &(ndRangeMap[nextId]);
+ // copy dispatch info
+ ndr->q = curTask;
+
+ // update the outstanding-dispatch count (numDispLeft) polled by the runtime
+ accessUserVar(cpu, (uint64_t)(curTask.numDispLeft), 0, 1);
+
+ ndr->numWgTotal = 1;
+
+ for (int i = 0; i < 3; ++i) {
+ ndr->wgId[i] = 0;
+ ndr->numWg[i] = divCeil(curTask.gdSize[i], curTask.wgSize[i]);
+ ndr->numWgTotal *= ndr->numWg[i];
+ }
+
+ ndr->numWgCompleted = 0;
+ ndr->globalWgId = 0;
+ ndr->wg_disp_rem = true;
+ ndr->execDone = false;
+ ndr->addrToNotify = (volatile bool*)curTask.addrToNotify;
+ ndr->numDispLeft = (volatile uint32_t*)curTask.numDispLeft;
+ ndr->dispatchId = nextId;
+ ndr->curTid = pkt->req->threadId();
+ DPRINTF(GPUDisp, "launching kernel %d\n",nextId);
+ execIds.push(nextId);
+ ++nextId;
+
+ dispatchActive = true;
+
+ if (!tickEvent.scheduled()) {
+ schedule(&tickEvent, curTick() + shader->ticks(1));
+ }
+ } else {
+ // populate current task struct
+ // first 64 bits are launch reg
+ offset -= 8;
+ assert(offset < sizeof(HsaQueueEntry));
+ char *curTaskPtr = (char*)&curTask;
+ memcpy(curTaskPtr + offset, pkt->getPtr<const void*>(), pkt->getSize());
+ }
+
+ pkt->makeAtomicResponse();
+
+ return pioDelay;
+}
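+
+// Sketch of the PIO protocol implemented by read()/write() above, as seen
+// from the runtime side (a hypothetical pseudo-sequence; mmio_write and
+// mmio_read64 are not real interfaces here): the task descriptor is
+// streamed into offsets [8, 8 + sizeof(HsaQueueEntry)), then a write to
+// offset 0 launches it, and a read of offset 0 returns dispatchActive.
+//
+//     mmio_write(disp_base + 8, &qstruct, sizeof(HsaQueueEntry));
+//     mmio_write(disp_base + 0, &launch, sizeof(uint64_t));
+//     while (mmio_read64(disp_base + 0)) { /* dispatch still active */ }
+//
+// For the launch itself, numWg[i] = divCeil(gdSize[i], wgSize[i]); e.g. a
+// 1024x1x1 grid with 256-wide workgroups gives numWg = {4, 1, 1} and
+// numWgTotal = 4.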
+
+
+BaseMasterPort&
+GpuDispatcher::getMasterPort(const std::string &if_name, PortID idx)
+{
+ if (if_name == "translation_port") {
+ return *tlbPort;
+ }
+
+ return DmaDevice::getMasterPort(if_name, idx);
+}
+
+void
+GpuDispatcher::exec()
+{
+ int fail_count = 0;
+
+ // There are potentially multiple outstanding kernel launches.
+ // It is possible that the workgroups in a different kernel
+ // can fit on the GPU even if another kernel's workgroups cannot
+ DPRINTF(GPUDisp, "Launching %d Kernels\n", execIds.size());
+
+ while (execIds.size() > fail_count) {
+ int execId = execIds.front();
+
+ while (ndRangeMap[execId].wg_disp_rem) {
+ //update the thread context
+ shader->updateThreadContext(ndRangeMap[execId].curTid);
+
+ // attempt to dispatch_workgroup
+ if (!shader->dispatch_workgroups(&ndRangeMap[execId])) {
+ // if we failed, try the next kernel;
+ // it may have smaller workgroups.
+ // put this one back on the queue to retry later
+ DPRINTF(GPUDisp, "kernel %d failed to launch\n", execId);
+ execIds.push(execId);
+ ++fail_count;
+ break;
+ }
+ }
+ // let's try the next kernel_id
+ execIds.pop();
+ }
+
+ DPRINTF(GPUDisp, "Returning %d Kernels\n", doneIds.size());
+
+ if (doneIds.size() && cpu) {
+ shader->hostWakeUp(cpu);
+ }
+
+ while (doneIds.size()) {
+ // wakeup the CPU if any Kernels completed this cycle
+ DPRINTF(GPUDisp, "WorkGroup %d completed\n", doneIds.front());
+ doneIds.pop();
+ }
+}
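+
+// Example of the retry policy above (the kernel ids are hypothetical): with
+// execIds = {K0, K1} and K0's next workgroup unable to fit, K0 is pushed to
+// the back of the queue, fail_count becomes 1, and K1 is still attempted in
+// the same cycle; the outer loop stops once every kernel remaining in the
+// queue has failed once this cycle.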
+
+void
+GpuDispatcher::notifyWgCompl(Wavefront *w)
+{
+ int kern_id = w->kern_id;
+ DPRINTF(GPUDisp, "notify WgCompl %d\n",kern_id);
+ assert(ndRangeMap[kern_id].dispatchId == kern_id);
+ ndRangeMap[kern_id].numWgCompleted++;
+
+ if (ndRangeMap[kern_id].numWgCompleted == ndRangeMap[kern_id].numWgTotal) {
+ ndRangeMap[kern_id].execDone = true;
+ doneIds.push(kern_id);
+
+ if (ndRangeMap[kern_id].addrToNotify) {
+ accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].addrToNotify), 1,
+ 0);
+ }
+
+ accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].numDispLeft), 0, -1);
+
+ // update event end time (in nano-seconds)
+ if (ndRangeMap[kern_id].q.depends) {
+ HostState *host_state = (HostState*)ndRangeMap[kern_id].q.depends;
+ uint64_t event;
+ shader->ReadMem((uint64_t)(&host_state->event), &event,
+ sizeof(uint64_t), 0);
+
+ uint64_t end = curTick() / 1000;
+
+ shader->WriteMem((uint64_t)(&((_cl_event*)event)->end), &end,
+ sizeof(uint64_t), 0);
+ }
+ }
+
+ if (!tickEvent.scheduled()) {
+ schedule(&tickEvent, curTick() + shader->ticks(1));
+ }
+}
+
+void
+GpuDispatcher::scheduleDispatch()
+{
+ if (!tickEvent.scheduled())
+ schedule(&tickEvent, curTick() + shader->ticks(1));
+}
+
+void
+GpuDispatcher::accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off)
+{
+ if (cpu) {
+ if (off) {
+ shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::ReadReq,
+ true);
+ val += off;
+ }
+
+ shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::WriteReq, true);
+ } else {
+ panic("Cannot find host");
+ }
+}
+
+GpuDispatcher::TickEvent::TickEvent(GpuDispatcher *_dispatcher)
+ : Event(CPU_Tick_Pri), dispatcher(_dispatcher)
+{
+}
+
+void
+GpuDispatcher::TickEvent::process()
+{
+ dispatcher->exec();
+}
+
+const char*
+GpuDispatcher::TickEvent::description() const
+{
+ return "GPU Dispatcher tick";
+}
+
+// helper functions for driver to retrieve GPU attributes
+int
+GpuDispatcher::getNumCUs()
+{
+ return shader->cuList.size();
+}
+
+void
+GpuDispatcher::setFuncargsSize(int funcargs_size)
+{
+ shader->funcargs_size = funcargs_size;
+}
diff --git a/src/gpu-compute/dispatcher.hh b/src/gpu-compute/dispatcher.hh
new file mode 100644
index 000000000..76f932655
--- /dev/null
+++ b/src/gpu-compute/dispatcher.hh
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Brad Beckmann, Marc Orr
+ */
+
+#ifndef __GPU_DISPATCHER_HH__
+#define __GPU_DISPATCHER_HH__
+
+#include <queue>
+#include <vector>
+
+#include "base/statistics.hh"
+#include "dev/dma_device.hh"
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/ndrange.hh"
+#include "gpu-compute/qstruct.hh"
+#include "mem/port.hh"
+#include "params/GpuDispatcher.hh"
+
+class BaseCPU;
+class Shader;
+
+class GpuDispatcher : public DmaDevice
+{
+ public:
+ typedef GpuDispatcherParams Params;
+
+ class TickEvent : public Event
+ {
+ private:
+ GpuDispatcher *dispatcher;
+
+ public:
+ TickEvent(GpuDispatcher *);
+ void process();
+ const char *description() const;
+ };
+
+ MasterID masterId() { return _masterId; }
+
+ protected:
+ MasterID _masterId;
+
+ // Base and length of PIO register space
+ Addr pioAddr;
+ Addr pioSize;
+ Tick pioDelay;
+
+ HsaQueueEntry curTask;
+
+ std::unordered_map<int, NDRange> ndRangeMap;
+ NDRange ndRange;
+
+ // list of kernel_ids to launch
+ std::queue<int> execIds;
+ // list of kernel_ids that have finished
+ std::queue<int> doneIds;
+
+ uint64_t dispatchCount;
+ // is there a kernel in execution?
+ bool dispatchActive;
+
+ BaseCPU *cpu;
+ Shader *shader;
+ ClDriver *driver;
+ TickEvent tickEvent;
+
+ static GpuDispatcher *instance;
+
+ // syscall emulation mode can have only 1 application running(?)
+ // otherwise we have to do some pid-based tagging
+ // unused
+ typedef std::unordered_map<uint64_t, uint64_t> TranslationBuffer;
+ TranslationBuffer tlb;
+
+ public:
+ /*statistics*/
+ Stats::Scalar num_kernelLaunched;
+ GpuDispatcher(const Params *p);
+
+ ~GpuDispatcher() { }
+
+ void exec();
+ virtual void serialize(CheckpointOut &cp) const;
+ virtual void unserialize(CheckpointIn &cp);
+ void notifyWgCompl(Wavefront *w);
+ void scheduleDispatch();
+ void accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off);
+
+ // using a singleton so that glue code can pass pointer locations
+ // to the dispatcher. When there are multiple dispatchers, we can
+ // call something like getInstance(index)
+ static void
+ setInstance(GpuDispatcher *_instance)
+ {
+ instance = _instance;
+ }
+
+ static GpuDispatcher* getInstance() { return instance; }
+
+ class TLBPort : public MasterPort
+ {
+ public:
+
+ TLBPort(const std::string &_name, GpuDispatcher *_dispatcher)
+ : MasterPort(_name, _dispatcher), dispatcher(_dispatcher) { }
+
+ protected:
+ GpuDispatcher *dispatcher;
+
+ virtual bool recvTimingResp(PacketPtr pkt) { return true; }
+ virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
+ virtual void recvFunctional(PacketPtr pkt) { }
+ virtual void recvRangeChange() { }
+ virtual void recvReqRetry() { }
+
+ };
+
+ TLBPort *tlbPort;
+
+ virtual BaseMasterPort& getMasterPort(const std::string &if_name,
+ PortID idx);
+
+ AddrRangeList getAddrRanges() const;
+ Tick read(PacketPtr pkt);
+ Tick write(PacketPtr pkt);
+
+ // helper functions to retrieve/set GPU attributes
+ int getNumCUs();
+ void setFuncargsSize(int funcargs_size);
+};
+
+#endif // __GPU_DISPATCHER_HH__
diff --git a/src/gpu-compute/exec_stage.cc b/src/gpu-compute/exec_stage.cc
new file mode 100644
index 000000000..c2b95f85e
--- /dev/null
+++ b/src/gpu-compute/exec_stage.cc
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos, Sooraj Puthoor
+ */
+
+#include "gpu-compute/exec_stage.hh"
+
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/wavefront.hh"
+
+ExecStage::ExecStage(const ComputeUnitParams *p) : numSIMDs(p->num_SIMDs),
+ numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes),
+ vectorAluInstAvail(nullptr), glbMemInstAvail(nullptr),
+ shrMemInstAvail(nullptr), lastTimeInstExecuted(false),
+ thisTimeInstExecuted(false), instrExecuted (false),
+ executionResourcesUsed(0)
+{
+ numTransActiveIdle = 0;
+ idle_dur = 0;
+}
+
+void
+ExecStage::init(ComputeUnit *cu)
+{
+ computeUnit = cu;
+ _name = computeUnit->name() + ".ExecStage";
+ dispatchList = &computeUnit->dispatchList;
+ vectorAluInstAvail = &(computeUnit->vectorAluInstAvail);
+ glbMemInstAvail = &(computeUnit->glbMemInstAvail);
+ shrMemInstAvail = &(computeUnit->shrMemInstAvail);
+ idle_dur = 0;
+}
+
+void
+ExecStage::collectStatistics(enum STAT_STATUS stage, int unitId)
+{
+ if (stage == IdleExec) {
+ // count cycles of no vector ALU instruction executed
+ // even if one was the oldest in a WV of that vector SIMD unit
+ if (computeUnit->isVecAlu(unitId) && vectorAluInstAvail->at(unitId)) {
+ numCyclesWithNoInstrTypeIssued[unitId]++;
+ }
+
+ // count cycles of no global memory (vector) instruction executed
+ // even if one was the oldest in a WV of that vector SIMD unit
+ if (computeUnit->isGlbMem(unitId) && *glbMemInstAvail > 0) {
+ numCyclesWithNoInstrTypeIssued[unitId]++;
+ (*glbMemInstAvail)--;
+ }
+
+ // count cycles of no shared memory (vector) instruction executed
+ // even if one was the oldest in a WV of that vector SIMD unit
+ if (computeUnit->isShrMem(unitId) && *shrMemInstAvail > 0) {
+ numCyclesWithNoInstrTypeIssued[unitId]++;
+ (*shrMemInstAvail)--;
+ }
+ } else if (stage == BusyExec) {
+ // count the number of cycles an instruction to a specific unit
+ // was issued
+ numCyclesWithInstrTypeIssued[unitId]++;
+ thisTimeInstExecuted = true;
+ instrExecuted = true;
+ ++executionResourcesUsed;
+ } else if (stage == PostExec) {
+ // count the number of transitions from active to idle
+ if (lastTimeInstExecuted && !thisTimeInstExecuted) {
+ ++numTransActiveIdle;
+ }
+
+ if (!lastTimeInstExecuted && thisTimeInstExecuted) {
+ idleDur.sample(idle_dur);
+ idle_dur = 0;
+ } else if (!thisTimeInstExecuted) {
+ idle_dur++;
+ }
+
+ lastTimeInstExecuted = thisTimeInstExecuted;
+ // track the number of cycles we either issued one vector instruction
+ // or issued no instructions at all
+ if (instrExecuted) {
+ numCyclesWithInstrIssued++;
+ } else {
+ numCyclesWithNoIssue++;
+ }
+
+ spc.sample(executionResourcesUsed);
+ }
+}
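+
+// Worked example of the active/idle accounting above (cycle numbers are
+// illustrative): if instructions issue in cycles 1-3, nothing issues in
+// cycles 4-6, and issue resumes in cycle 7, then numTransActiveIdle is
+// incremented once at cycle 4, idle_dur counts up through cycles 4-6, and
+// idleDur samples the value 3 when the cycle-7 issue ends the idle period.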
+
+void
+ExecStage::initStatistics()
+{
+ instrExecuted = false;
+ executionResourcesUsed = 0;
+ thisTimeInstExecuted = false;
+}
+
+void
+ExecStage::exec()
+{
+ initStatistics();
+
+ for (int unitId = 0; unitId < (numSIMDs + numMemUnits); ++unitId) {
+ // if dispatch list for this execution resource is empty,
+ // skip this execution resource this cycle
+ if (dispatchList->at(unitId).second == EMPTY) {
+ collectStatistics(IdleExec, unitId);
+ continue;
+ }
+
+ collectStatistics(BusyExec, unitId);
+ // execute an instruction for the WF
+ dispatchList->at(unitId).first->exec();
+ // clear the dispatch list entry
+ dispatchList->at(unitId).second = EMPTY;
+ dispatchList->at(unitId).first = (Wavefront*)nullptr;
+ }
+
+ collectStatistics(PostExec, 0);
+}
+
+void
+ExecStage::regStats()
+{
+ numTransActiveIdle
+ .name(name() + ".num_transitions_active_to_idle")
+ .desc("number of CU transitions from active to idle")
+ ;
+
+ numCyclesWithNoIssue
+ .name(name() + ".num_cycles_with_no_issue")
+ .desc("number of cycles the CU issues nothing")
+ ;
+
+ numCyclesWithInstrIssued
+ .name(name() + ".num_cycles_with_instr_issued")
+ .desc("number of cycles the CU issued at least one instruction")
+ ;
+
+ spc
+ .init(0, numSIMDs + numMemUnits, 1)
+ .name(name() + ".spc")
+ .desc("Execution units active per cycle (Exec unit=SIMD,MemPipe)")
+ ;
+
+ idleDur
+ .init(0,75,5)
+ .name(name() + ".idle_duration_in_cycles")
+ .desc("duration of idle periods in cycles")
+ ;
+
+ numCyclesWithInstrTypeIssued
+ .init(numSIMDs + numMemUnits)
+ .name(name() + ".num_cycles_with_instrtype_issue")
+ .desc("Number of cycles at least one instruction of specific type "
+ "issued")
+ ;
+
+ numCyclesWithNoInstrTypeIssued
+ .init(numSIMDs + numMemUnits)
+ .name(name() + ".num_cycles_with_instr_type_no_issue")
+ .desc("Number of cycles no instruction of specific type issued")
+ ;
+
+ for (int i = 0; i < numSIMDs; ++i) {
+ numCyclesWithInstrTypeIssued.subname(i, csprintf("ALU%d",i));
+ numCyclesWithNoInstrTypeIssued.subname(i, csprintf("ALU%d",i));
+ }
+
+ numCyclesWithInstrTypeIssued.subname(numSIMDs, csprintf("GM"));
+ numCyclesWithNoInstrTypeIssued.subname(numSIMDs, csprintf("GM"));
+ numCyclesWithInstrTypeIssued.subname(numSIMDs + 1, csprintf("LM"));
+ numCyclesWithNoInstrTypeIssued.subname(numSIMDs + 1, csprintf("LM"));
+}
diff --git a/src/gpu-compute/exec_stage.hh b/src/gpu-compute/exec_stage.hh
new file mode 100644
index 000000000..2de74366b
--- /dev/null
+++ b/src/gpu-compute/exec_stage.hh
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos, Sooraj Puthoor
+ */
+
+#ifndef __EXEC_STAGE_HH__
+#define __EXEC_STAGE_HH__
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "sim/stats.hh"
+
+class ComputeUnit;
+class Wavefront;
+struct ComputeUnitParams;
+
+enum STAT_STATUS
+{
+ IdleExec,
+ BusyExec,
+ PostExec
+};
+
+enum DISPATCH_STATUS
+{
+ EMPTY = 0,
+ FILLED
+};
+
+// Execution stage.
+// Each execution resource executes the
+// wave which is in its dispatch list.
+// The schedule stage is responsible for
+// adding a wave into each execution resource's
+// dispatch list.
+
+class ExecStage
+{
+ public:
+ ExecStage(const ComputeUnitParams* params);
+ ~ExecStage() { }
+ void init(ComputeUnit *cu);
+ void exec();
+
+ std::string name() { return _name; }
+ void regStats();
+ // number of idle cycles
+ Stats::Scalar numCyclesWithNoIssue;
+ // number of busy cycles
+ Stats::Scalar numCyclesWithInstrIssued;
+ // number of cycles (per execution unit) during which at least one
+ // instruction was issued to that unit
+ Stats::Vector numCyclesWithInstrTypeIssued;
+ // number of idle cycles (per execution unit) during which no instruction
+ // was issued to that unit, even though at least one wavefront had such
+ // an instruction as the oldest in its Instruction Buffer
+ Stats::Vector numCyclesWithNoInstrTypeIssued;
+ // SIMDs active per cycle
+ Stats::Distribution spc;
+
+ private:
+ void collectStatistics(enum STAT_STATUS stage, int unitId);
+ void initStatistics();
+ ComputeUnit *computeUnit;
+ uint32_t numSIMDs;
+
+ // Number of memory execution resources,
+ // i.e., both global and local memory pipelines, in the CU
+ uint32_t numMemUnits;
+
+ // List of waves which will be dispatched to
+ // each execution resource. A FILLED implies
+ // dispatch list is non-empty and
+ // execution unit has something to execute
+ // this cycle. Currently, the dispatch list of
+ // an execution resource can hold only one wave because
+ // an execution resource can execute only one wave in a cycle.
+ // dispatchList is used to communicate between schedule
+ // and exec stage
+ std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> *dispatchList;
+ // flag per vector SIMD unit that is set when there is at least one
+ // WV that has a vector ALU instruction as the oldest in its
+ // Instruction Buffer
+ std::vector<bool> *vectorAluInstAvail;
+ int *glbMemInstAvail;
+ int *shrMemInstAvail;
+ bool lastTimeInstExecuted;
+ bool thisTimeInstExecuted;
+ bool instrExecuted;
+ Stats::Scalar numTransActiveIdle;
+ Stats::Distribution idleDur;
+ uint32_t executionResourcesUsed;
+ uint64_t idle_dur;
+ std::string _name;
+};
+
+#endif // __EXEC_STAGE_HH__
diff --git a/src/gpu-compute/fetch_stage.cc b/src/gpu-compute/fetch_stage.cc
new file mode 100644
index 000000000..1f5e6ded3
--- /dev/null
+++ b/src/gpu-compute/fetch_stage.cc
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez, Sooraj Puthoor
+ */
+
+#include "gpu-compute/fetch_stage.hh"
+
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/wavefront.hh"
+
+FetchStage::FetchStage(const ComputeUnitParams* p) : numSIMDs(p->num_SIMDs),
+ computeUnit(nullptr)
+{
+ for (int j = 0; j < numSIMDs; ++j) {
+ FetchUnit newFetchUnit(p);
+ fetchUnit.push_back(newFetchUnit);
+ }
+}
+
+FetchStage::~FetchStage()
+{
+ fetchUnit.clear();
+}
+
+void
+FetchStage::init(ComputeUnit *cu)
+{
+ computeUnit = cu;
+ _name = computeUnit->name() + ".FetchStage";
+
+ for (int j = 0; j < numSIMDs; ++j) {
+ fetchUnit[j].bindWaveList(&computeUnit->wfList[j]);
+ fetchUnit[j].init(computeUnit);
+ }
+}
+
+void
+FetchStage::exec()
+{
+ for (int j = 0; j < numSIMDs; ++j) {
+ fetchUnit[j].exec();
+ }
+}
+
+void
+FetchStage::processFetchReturn(PacketPtr pkt)
+{
+ ComputeUnit::SQCPort::SenderState *sender_state =
+ safe_cast<ComputeUnit::SQCPort::SenderState*>(pkt->senderState);
+
+ Wavefront *wavefront = sender_state->wavefront;
+
+ const unsigned num_instructions = pkt->req->getSize() /
+ sizeof(TheGpuISA::RawMachInst);
+
+ instFetchInstReturned.sample(num_instructions);
+ uint32_t simdId = wavefront->simdId;
+ fetchUnit[simdId].processFetchReturn(pkt);
+}
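+
+// For example (the instruction size here is an assumption made purely for
+// illustration): a 64-byte fetch response carrying 4-byte RawMachInsts
+// would sample num_instructions = 16 into instFetchInstReturned before the
+// packet is handed to the per-SIMD fetch unit.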
+
+void
+FetchStage::fetch(PacketPtr pkt, Wavefront *wavefront)
+{
+ fetchUnit[wavefront->simdId].fetch(pkt, wavefront);
+}
+
+void
+FetchStage::regStats()
+{
+ instFetchInstReturned
+ .init(1, 32, 1)
+ .name(name() + ".inst_fetch_instr_returned")
+ .desc("For each instruction fetch request recieved record how many "
+ "instructions you got from it")
+ ;
+}
diff --git a/src/gpu-compute/fetch_stage.hh b/src/gpu-compute/fetch_stage.hh
new file mode 100644
index 000000000..ce7faa8ac
--- /dev/null
+++ b/src/gpu-compute/fetch_stage.hh
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez, Sooraj Puthoor
+ */
+
+#ifndef __FETCH_STAGE_HH__
+#define __FETCH_STAGE_HH__
+
+#include <string>
+#include <vector>
+
+#include "gpu-compute/fetch_unit.hh"
+
+// Instruction fetch stage.
+// All dispatched wavefronts on all SIMDs are examined to determine
+// whether they need to fetch instructions. From the fetch-eligible
+// waves, one wave per SIMD is selected and a fetch is initiated for
+// each selected wave.
+
+class ComputeUnit;
+class Wavefront;
+
+class FetchStage
+{
+ public:
+ FetchStage(const ComputeUnitParams* params);
+ ~FetchStage();
+ void init(ComputeUnit *cu);
+ void exec();
+ void processFetchReturn(PacketPtr pkt);
+ void fetch(PacketPtr pkt, Wavefront *wave);
+
+ // Stats related variables and methods
+ std::string name() { return _name; }
+ void regStats();
+ Stats::Distribution instFetchInstReturned;
+
+ private:
+ uint32_t numSIMDs;
+ ComputeUnit *computeUnit;
+
+ // List of fetch units. A fetch unit is
+ // instantiated per SIMD
+ std::vector<FetchUnit> fetchUnit;
+ std::string _name;
+};
+
+#endif // __FETCH_STAGE_HH__
diff --git a/src/gpu-compute/fetch_unit.cc b/src/gpu-compute/fetch_unit.cc
new file mode 100644
index 000000000..1f0a7d78e
--- /dev/null
+++ b/src/gpu-compute/fetch_unit.cc
@@ -0,0 +1,293 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Brad Beckmann, Sooraj Puthoor
+ */
+
+#include "gpu-compute/fetch_unit.hh"
+
+#include "debug/GPUFetch.hh"
+#include "debug/GPUPort.hh"
+#include "debug/GPUTLB.hh"
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/gpu_static_inst.hh"
+#include "gpu-compute/shader.hh"
+#include "gpu-compute/wavefront.hh"
+#include "mem/ruby/system/RubySystem.hh"
+
+uint32_t FetchUnit::globalFetchUnitID;
+
+FetchUnit::FetchUnit(const ComputeUnitParams* params) :
+ timingSim(true),
+ computeUnit(nullptr),
+ fetchScheduler(params),
+ waveList(nullptr)
+{
+}
+
+FetchUnit::~FetchUnit()
+{
+ fetchQueue.clear();
+ fetchStatusQueue.clear();
+}
+
+void
+FetchUnit::init(ComputeUnit *cu)
+{
+ computeUnit = cu;
+ timingSim = computeUnit->shader->timingSim;
+ fetchQueue.clear();
+ fetchStatusQueue.resize(computeUnit->shader->n_wf);
+
+ for (int j = 0; j < computeUnit->shader->n_wf; ++j) {
+ fetchStatusQueue[j] = std::make_pair(waveList->at(j), false);
+ }
+
+ fetchScheduler.bindList(&fetchQueue);
+}
+
+void
+FetchUnit::exec()
+{
+ // re-evaluate waves which are marked as not ready for fetch
+ for (int j = 0; j < computeUnit->shader->n_wf; ++j) {
+        // The following code assumes 64-bit operation and that all insts
+        // are represented by 64-bit pointers to inst objects.
+ Wavefront *curWave = fetchStatusQueue[j].first;
+        assert(curWave);
+
+        // The wavefront has to be active, the instruction buffer must hold
+        // 4 or fewer instructions, and it cannot contain any branches
+        // (to prevent speculative instruction fetches)
+ if (!fetchStatusQueue[j].second) {
+ if (curWave->status == Wavefront::S_RUNNING &&
+ curWave->instructionBuffer.size() <= 4 &&
+ !curWave->instructionBufferHasBranch() &&
+ !curWave->pendingFetch) {
+ fetchQueue.push_back(curWave);
+ fetchStatusQueue[j].second = true;
+ }
+ }
+ }
+
+    // Fetch only if there is some wave ready to be fetched;
+    // an empty fetchQueue will cause the scheduler to panic
+ if (fetchQueue.size()) {
+ Wavefront *waveToBeFetched = fetchScheduler.chooseWave();
+ waveToBeFetched->pendingFetch = true;
+ fetchStatusQueue[waveToBeFetched->wfSlotId].second = false;
+ initiateFetch(waveToBeFetched);
+ }
+}
+
+void
+FetchUnit::initiateFetch(Wavefront *wavefront)
+{
+ // calculate the virtual address to fetch from the SQC
+ Addr vaddr = wavefront->pc() + wavefront->instructionBuffer.size();
+ vaddr = wavefront->base_ptr + vaddr * sizeof(GPUStaticInst*);
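+    // Worked example (sketch): with pc() == 5, 3 buffered insts, and 64-bit
+    // inst pointers, the fetch targets base_ptr + 8 * 8 = base_ptr + 64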
+
+ DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Initiating fetch translation: %#x\n",
+ computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, vaddr);
+
+ // Since this is an instruction prefetch, if you're split then just finish
+ // out the current line.
+ unsigned block_size = RubySystem::getBlockSizeBytes();
+ // check for split accesses
+ Addr split_addr = roundDown(vaddr + block_size - 1, block_size);
+ unsigned size = block_size;
+
+ if (split_addr > vaddr) {
+ // misaligned access, just grab the rest of the line
+ size = split_addr - vaddr;
+ }
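+    // Example (assuming 64-byte lines): vaddr = 0x1038 gives
+    // split_addr = roundDown(0x1077, 64) = 0x1040, so only the remaining
+    // 8 bytes of the current line are fetched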
+
+ // set up virtual request
+ Request *req = new Request(0, vaddr, size, Request::INST_FETCH,
+ computeUnit->masterId(), 0, 0, 0);
+
+ PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
+    // This fetchBlock is only a placeholder for now, because the translation
+    // requests do not actually return data
+ uint64_t fetchBlock;
+ pkt->dataStatic(&fetchBlock);
+
+ if (timingSim) {
+ // SenderState needed on Return
+ pkt->senderState = new ComputeUnit::ITLBPort::SenderState(wavefront);
+
+ // Sender State needed by TLB hierarchy
+ pkt->senderState =
+ new TheISA::GpuTLB::TranslationState(BaseTLB::Execute,
+ computeUnit->shader->gpuTc,
+ false, pkt->senderState);
+
+ if (computeUnit->sqcTLBPort->isStalled()) {
+ assert(computeUnit->sqcTLBPort->retries.size() > 0);
+
+ DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n",
+ vaddr);
+
+ computeUnit->sqcTLBPort->retries.push_back(pkt);
+ } else if (!computeUnit->sqcTLBPort->sendTimingReq(pkt)) {
+            // Stall the data port; no more packets are issued until
+            // Ruby indicates that resources have been freed by a
+            // recvReqRetry() callback on this port.
+ computeUnit->sqcTLBPort->stallPort();
+
+ DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n",
+ vaddr);
+
+ computeUnit->sqcTLBPort->retries.push_back(pkt);
+ } else {
+ DPRINTF(GPUTLB, "sent FETCH translation request for %#x\n", vaddr);
+ }
+ } else {
+ pkt->senderState =
+ new TheISA::GpuTLB::TranslationState(BaseTLB::Execute,
+ computeUnit->shader->gpuTc);
+
+ computeUnit->sqcTLBPort->sendFunctional(pkt);
+
+ TheISA::GpuTLB::TranslationState *sender_state =
+ safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
+
+ delete sender_state->tlbEntry;
+ delete sender_state;
+ // fetch the instructions from the SQC when we operate in
+ // functional mode only
+ fetch(pkt, wavefront);
+ }
+}
+
+void
+FetchUnit::fetch(PacketPtr pkt, Wavefront *wavefront)
+{
+ assert(pkt->req->hasPaddr());
+ assert(pkt->req->hasSize());
+
+ DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch Access: %#x\n",
+ computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
+ pkt->req->getPaddr());
+
+    // This is necessary because the GPU TLB receives packets instead of
+    // requests. When the translation is complete, all relevant fields in the
+    // request will be populated, but not in the packet. Here we create a
+    // new packet so we can set the size, addr, and proper flags.
+ PacketPtr oldPkt = pkt;
+ pkt = new Packet(oldPkt->req, oldPkt->cmd);
+ delete oldPkt;
+
+ TheGpuISA::RawMachInst *data =
+ new TheGpuISA::RawMachInst[pkt->req->getSize() /
+ sizeof(TheGpuISA::RawMachInst)];
+
+ pkt->dataDynamic<TheGpuISA::RawMachInst>(data);
+
+ // New SenderState for the memory access
+ pkt->senderState = new ComputeUnit::SQCPort::SenderState(wavefront);
+
+ if (timingSim) {
+ // translation is done. Send the appropriate timing memory request.
+
+ if (!computeUnit->sqcPort->sendTimingReq(pkt)) {
+ computeUnit->sqcPort->retries.push_back(std::make_pair(pkt,
+ wavefront));
+
+ DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Fetch addr %#x failed!\n",
+ computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
+ pkt->req->getPaddr());
+ } else {
+ DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Fetch addr %#x sent!\n",
+ computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
+ pkt->req->getPaddr());
+ }
+ } else {
+ computeUnit->sqcPort->sendFunctional(pkt);
+ processFetchReturn(pkt);
+ }
+}
+
+void
+FetchUnit::processFetchReturn(PacketPtr pkt)
+{
+ ComputeUnit::SQCPort::SenderState *sender_state =
+ safe_cast<ComputeUnit::SQCPort::SenderState*>(pkt->senderState);
+
+ Wavefront *wavefront = sender_state->wavefront;
+
+ DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch addr %#x returned "
+ "%d bytes, %d instructions!\n", computeUnit->cu_id,
+ wavefront->simdId, wavefront->wfSlotId, pkt->req->getPaddr(),
+ pkt->req->getSize(), pkt->req->getSize() /
+ sizeof(TheGpuISA::RawMachInst));
+
+ if (wavefront->dropFetch) {
+ assert(wavefront->instructionBuffer.empty());
+ wavefront->dropFetch = false;
+ } else {
+ TheGpuISA::RawMachInst *inst_index_ptr =
+ (TheGpuISA::RawMachInst*)pkt->getPtr<uint8_t>();
+
+ assert(wavefront->instructionBuffer.size() <= 4);
+
+ for (int i = 0; i < pkt->req->getSize() /
+ sizeof(TheGpuISA::RawMachInst); ++i) {
+ GPUStaticInst *inst_ptr = decoder.decode(inst_index_ptr[i]);
+
+ assert(inst_ptr);
+ DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: added %s\n",
+ computeUnit->cu_id, wavefront->simdId,
+ wavefront->wfSlotId, inst_ptr->disassemble());
+
+ GPUDynInstPtr gpuDynInst =
+ std::make_shared<GPUDynInst>(computeUnit, wavefront, inst_ptr,
+ computeUnit->getAndIncSeqNum());
+
+ wavefront->instructionBuffer.push_back(gpuDynInst);
+ }
+ }
+
+ wavefront->pendingFetch = false;
+
+ delete pkt->senderState;
+ delete pkt->req;
+ delete pkt;
+}
+
+void
+FetchUnit::bindWaveList(std::vector<Wavefront*> *wave_list)
+{
+ waveList = wave_list;
+}
diff --git a/src/gpu-compute/fetch_unit.hh b/src/gpu-compute/fetch_unit.hh
new file mode 100644
index 000000000..c7c6afb3c
--- /dev/null
+++ b/src/gpu-compute/fetch_unit.hh
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Brad Beckmann, Sooraj Puthoor
+ */
+
+#ifndef __FETCH_UNIT_HH__
+#define __FETCH_UNIT_HH__
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arch/gpu_decoder.hh"
+#include "base/statistics.hh"
+#include "config/the_gpu_isa.hh"
+#include "gpu-compute/scheduler.hh"
+#include "mem/packet.hh"
+
+class ComputeUnit;
+class Wavefront;
+
+class FetchUnit
+{
+ public:
+ FetchUnit(const ComputeUnitParams* params);
+ ~FetchUnit();
+ void init(ComputeUnit *cu);
+ void exec();
+ void bindWaveList(std::vector<Wavefront*> *list);
+ void initiateFetch(Wavefront *wavefront);
+ void fetch(PacketPtr pkt, Wavefront *wavefront);
+ void processFetchReturn(PacketPtr pkt);
+ static uint32_t globalFetchUnitID;
+
+ private:
+ bool timingSim;
+ ComputeUnit *computeUnit;
+ TheGpuISA::Decoder decoder;
+
+    // Fetch scheduler; selects one wave from
+    // the fetch queue for instruction fetching.
+ // The selection is made according to
+ // a scheduling policy
+ Scheduler fetchScheduler;
+
+ // Stores the list of waves that are
+ // ready to be fetched this cycle
+ std::vector<Wavefront*> fetchQueue;
+
+    // Stores the fetch status of all waves dispatched to this SIMD.
+    // TRUE implies the wave is ready to fetch and has already been
+    // moved to the fetchQueue
+ std::vector<std::pair<Wavefront*, bool>> fetchStatusQueue;
+
+ // Pointer to list of waves dispatched on to this SIMD unit
+ std::vector<Wavefront*> *waveList;
+};
+
+#endif // __FETCH_UNIT_HH__
diff --git a/src/gpu-compute/global_memory_pipeline.cc b/src/gpu-compute/global_memory_pipeline.cc
new file mode 100644
index 000000000..913327412
--- /dev/null
+++ b/src/gpu-compute/global_memory_pipeline.cc
@@ -0,0 +1,242 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos, Sooraj Puthoor
+ */
+
+#include "gpu-compute/global_memory_pipeline.hh"
+
+#include "debug/GPUMem.hh"
+#include "debug/GPUReg.hh"
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/shader.hh"
+#include "gpu-compute/vector_register_file.hh"
+#include "gpu-compute/wavefront.hh"
+
+GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams* p) :
+ computeUnit(nullptr), gmQueueSize(p->global_mem_queue_size),
+ inflightStores(0), inflightLoads(0)
+{
+}
+
+void
+GlobalMemPipeline::init(ComputeUnit *cu)
+{
+ computeUnit = cu;
+ globalMemSize = computeUnit->shader->globalMemSize;
+ _name = computeUnit->name() + ".GlobalMemPipeline";
+}
+
+void
+GlobalMemPipeline::exec()
+{
+ // apply any returned global memory operations
+ GPUDynInstPtr m = !gmReturnedLoads.empty() ? gmReturnedLoads.front() :
+ !gmReturnedStores.empty() ? gmReturnedStores.front() : nullptr;
+
+ bool accessVrf = true;
+ // check the VRF to see if the operands of a load (or load component
+ // of an atomic) are accessible
+    if (m && (m->m_op == Enums::MO_LD || MO_A(m->m_op))) {
+ Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId];
+
+ accessVrf =
+ w->computeUnit->vrf[m->simdId]->
+ vrfOperandAccessReady(m->seqNum(), w, m,
+ VrfAccessType::WRITE);
+ }
+
+ if ((!gmReturnedStores.empty() || !gmReturnedLoads.empty()) &&
+ m->latency.rdy() && computeUnit->glbMemToVrfBus.rdy() &&
+ accessVrf && m->statusBitVector == VectorMask(0) &&
+ (computeUnit->shader->coissue_return ||
+ computeUnit->wfWait.at(m->pipeId).rdy())) {
+
+ if (m->v_type == VT_32 && m->m_type == Enums::M_U8)
+ doGmReturn<uint32_t, uint8_t>(m);
+ else if (m->v_type == VT_32 && m->m_type == Enums::M_U16)
+ doGmReturn<uint32_t, uint16_t>(m);
+ else if (m->v_type == VT_32 && m->m_type == Enums::M_U32)
+ doGmReturn<uint32_t, uint32_t>(m);
+ else if (m->v_type == VT_32 && m->m_type == Enums::M_S8)
+ doGmReturn<int32_t, int8_t>(m);
+ else if (m->v_type == VT_32 && m->m_type == Enums::M_S16)
+ doGmReturn<int32_t, int16_t>(m);
+ else if (m->v_type == VT_32 && m->m_type == Enums::M_S32)
+ doGmReturn<int32_t, int32_t>(m);
+ else if (m->v_type == VT_32 && m->m_type == Enums::M_F16)
+ doGmReturn<float, Float16>(m);
+ else if (m->v_type == VT_32 && m->m_type == Enums::M_F32)
+ doGmReturn<float, float>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_U8)
+ doGmReturn<uint64_t, uint8_t>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_U16)
+ doGmReturn<uint64_t, uint16_t>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_U32)
+ doGmReturn<uint64_t, uint32_t>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_U64)
+ doGmReturn<uint64_t, uint64_t>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_S8)
+ doGmReturn<int64_t, int8_t>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_S16)
+ doGmReturn<int64_t, int16_t>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_S32)
+ doGmReturn<int64_t, int32_t>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_S64)
+ doGmReturn<int64_t, int64_t>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_F16)
+ doGmReturn<double, Float16>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_F32)
+ doGmReturn<double, float>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_F64)
+ doGmReturn<double, double>(m);
+ }
+
+    // If the pipeline has been issued a global memory instruction,
+    // initiate its memory access and send the resulting packets
+    // to the DTLB
+ if (!gmIssuedRequests.empty()) {
+ GPUDynInstPtr mp = gmIssuedRequests.front();
+ if (mp->m_op == Enums::MO_LD ||
+ (mp->m_op >= Enums::MO_AAND && mp->m_op <= Enums::MO_AMIN) ||
+ (mp->m_op >= Enums::MO_ANRAND && mp->m_op <= Enums::MO_ANRMIN)) {
+
+ if (inflightLoads >= gmQueueSize) {
+ return;
+ } else {
+ ++inflightLoads;
+ }
+ } else {
+ if (inflightStores >= gmQueueSize) {
+ return;
+ } else {
+ ++inflightStores;
+ }
+ }
+
+ mp->initiateAcc(mp);
+ gmIssuedRequests.pop();
+
+ DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = %s\n",
+ computeUnit->cu_id, mp->simdId, mp->wfSlotId,
+ Enums::MemOpTypeStrings[mp->m_op]);
+ }
+}
+
+template<typename c0, typename c1>
+void
+GlobalMemPipeline::doGmReturn(GPUDynInstPtr m)
+{
+ Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId];
+
+ // Return data to registers
+ if (m->m_op == Enums::MO_LD || MO_A(m->m_op) || MO_ANR(m->m_op)) {
+ gmReturnedLoads.pop();
+ assert(inflightLoads > 0);
+ --inflightLoads;
+
+ if (m->m_op == Enums::MO_LD || MO_A(m->m_op)) {
+ std::vector<uint32_t> regVec;
+ // iterate over number of destination register operands since
+ // this is a load or atomic operation
+ for (int k = 0; k < m->n_reg; ++k) {
+ assert((sizeof(c1) * m->n_reg) <= MAX_WIDTH_FOR_MEM_INST);
+ int dst = m->dst_reg + k;
+
+ if (m->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST)
+ dst = m->dst_reg_vec[k];
+ // virtual->physical VGPR mapping
+ int physVgpr = w->remap(dst, sizeof(c0), 1);
+ // save the physical VGPR index
+ regVec.push_back(physVgpr);
+ c1 *p1 = &((c1*)m->d_data)[k * VSZ];
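+                // d_data packs VSZ lane values for each destination
+                // register; p1 walks the lanes belonging to register k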
+
+ for (int i = 0; i < VSZ; ++i) {
+ if (m->exec_mask[i]) {
+ DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: "
+ "$%s%d <- %d global ld done (src = wavefront "
+ "ld inst)\n", w->computeUnit->cu_id, w->simdId,
+ w->wfSlotId, i, sizeof(c0) == 4 ? "s" : "d",
+ dst, *p1);
+ // write the value into the physical VGPR. This is a
+ // purely functional operation. No timing is modeled.
+ w->computeUnit->vrf[w->simdId]->write<c0>(physVgpr,
+ *p1, i);
+ }
+ ++p1;
+ }
+ }
+
+ // Schedule the write operation of the load data on the VRF.
+ // This simply models the timing aspect of the VRF write operation.
+ // It does not modify the physical VGPR.
+ loadVrfBankConflictCycles +=
+ w->computeUnit->vrf[w->simdId]->exec(m->seqNum(),
+ w, regVec, sizeof(c0),
+ m->time);
+ }
+ } else {
+ gmReturnedStores.pop();
+ assert(inflightStores > 0);
+ --inflightStores;
+ }
+
+ // Decrement outstanding register count
+ computeUnit->shader->ScheduleAdd(&w->outstanding_reqs, m->time, -1);
+
+ if (m->m_op == Enums::MO_ST || MO_A(m->m_op) || MO_ANR(m->m_op) ||
+ MO_H(m->m_op)) {
+ computeUnit->shader->ScheduleAdd(&w->outstanding_reqs_wr_gm, m->time,
+ -1);
+ }
+
+ if (m->m_op == Enums::MO_LD || MO_A(m->m_op) || MO_ANR(m->m_op)) {
+ computeUnit->shader->ScheduleAdd(&w->outstanding_reqs_rd_gm, m->time,
+ -1);
+ }
+
+ // Mark write bus busy for appropriate amount of time
+ computeUnit->glbMemToVrfBus.set(m->time);
+ if (!computeUnit->shader->coissue_return)
+ w->computeUnit->wfWait.at(m->pipeId).set(m->time);
+}
+
+void
+GlobalMemPipeline::regStats()
+{
+ loadVrfBankConflictCycles
+ .name(name() + ".load_vrf_bank_conflict_cycles")
+ .desc("total number of cycles GM data are delayed before updating "
+ "the VRF")
+ ;
+}
diff --git a/src/gpu-compute/global_memory_pipeline.hh b/src/gpu-compute/global_memory_pipeline.hh
new file mode 100644
index 000000000..ed49f6f6b
--- /dev/null
+++ b/src/gpu-compute/global_memory_pipeline.hh
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos, Sooraj Puthoor
+ */
+
+#ifndef __GLOBAL_MEMORY_PIPELINE_HH__
+#define __GLOBAL_MEMORY_PIPELINE_HH__
+
+#include <queue>
+#include <string>
+
+#include "gpu-compute/misc.hh"
+#include "params/ComputeUnit.hh"
+#include "sim/stats.hh"
+
+/*
+ * @file global_memory_pipeline.hh
+ *
+ * The global memory pipeline issues newly created global memory packets
+ * from the pipeline to the DTLB. The exec() method of this stage issues
+ * a packet to the DTLB only if there is space available in the return FIFO.
+ * This stage also retires previously issued loads and stores that have
+ * returned from the memory sub-system.
+ */
+
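+// Rough flow (for orientation): the memory pipelines push requests into
+// getGMReqFIFO(); exec() pops gmIssuedRequests and calls initiateAcc() on
+// each one; load/store responses arrive via getGMLdRespFIFO() /
+// getGMStRespFIFO() and are retired by doGmReturn().
+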
+class ComputeUnit;
+
+class GlobalMemPipeline
+{
+ public:
+ GlobalMemPipeline(const ComputeUnitParams *params);
+ void init(ComputeUnit *cu);
+ void exec();
+
+ template<typename c0, typename c1> void doGmReturn(GPUDynInstPtr m);
+
+ std::queue<GPUDynInstPtr> &getGMReqFIFO() { return gmIssuedRequests; }
+ std::queue<GPUDynInstPtr> &getGMStRespFIFO() { return gmReturnedStores; }
+ std::queue<GPUDynInstPtr> &getGMLdRespFIFO() { return gmReturnedLoads; }
+
+ bool
+ isGMLdRespFIFOWrRdy() const
+ {
+ return gmReturnedLoads.size() < gmQueueSize;
+ }
+
+ bool
+ isGMStRespFIFOWrRdy() const
+ {
+ return gmReturnedStores.size() < gmQueueSize;
+ }
+
+ bool
+ isGMReqFIFOWrRdy(uint32_t pendReqs=0) const
+ {
+ return (gmIssuedRequests.size() + pendReqs) < gmQueueSize;
+ }
+
+ const std::string &name() const { return _name; }
+ void regStats();
+
+ private:
+ ComputeUnit *computeUnit;
+ std::string _name;
+ int gmQueueSize;
+
+    // Number of cycles the update of a VGPR that is the target of a load
+    // instruction (or the load component of an atomic) is delayed.
+    // The delay is due to VRF bank conflicts.
+ Stats::Scalar loadVrfBankConflictCycles;
+ // Counters to track the inflight loads and stores
+ // so that we can provide the proper backpressure
+ // on the number of inflight memory operations.
+ int inflightStores;
+ int inflightLoads;
+
+ // The size of global memory.
+ int globalMemSize;
+
+ // Global Memory Request FIFO: all global memory requests
+ // are issued to this FIFO from the memory pipelines
+ std::queue<GPUDynInstPtr> gmIssuedRequests;
+
+    // Global Store Response FIFO: all responses of global memory
+ // stores are sent to this FIFO from TCP
+ std::queue<GPUDynInstPtr> gmReturnedStores;
+
+ // Global Load Response FIFO: all responses of global memory
+ // loads are sent to this FIFO from TCP
+ std::queue<GPUDynInstPtr> gmReturnedLoads;
+};
+
+#endif // __GLOBAL_MEMORY_PIPELINE_HH__
diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc
new file mode 100644
index 000000000..83e348dbe
--- /dev/null
+++ b/src/gpu-compute/gpu_dyn_inst.cc
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#include "gpu-compute/gpu_dyn_inst.hh"
+
+#include "debug/GPUMem.hh"
+#include "gpu-compute/gpu_static_inst.hh"
+#include "gpu-compute/shader.hh"
+#include "gpu-compute/wavefront.hh"
+
+GPUDynInst::GPUDynInst(ComputeUnit *_cu, Wavefront *_wf,
+ GPUStaticInst *_staticInst, uint64_t instSeqNum)
+ : GPUExecContext(_cu, _wf), m_op(Enums::MO_UNDEF),
+ memoryOrder(Enums::MEMORY_ORDER_NONE), useContinuation(false),
+ statusBitVector(0), staticInst(_staticInst), _seqNum(instSeqNum)
+{
+ tlbHitLevel.assign(VSZ, -1);
+}
+
+void
+GPUDynInst::execute()
+{
+ GPUDynInstPtr gpuDynInst = std::make_shared<GPUDynInst>(cu, wf, staticInst,
+ _seqNum);
+ staticInst->execute(gpuDynInst);
+}
+
+int
+GPUDynInst::numSrcRegOperands()
+{
+ return staticInst->numSrcRegOperands();
+}
+
+int
+GPUDynInst::numDstRegOperands()
+{
+ return staticInst->numDstRegOperands();
+}
+
+int
+GPUDynInst::getNumOperands()
+{
+ return staticInst->getNumOperands();
+}
+
+bool
+GPUDynInst::isVectorRegister(int operandIdx)
+{
+ return staticInst->isVectorRegister(operandIdx);
+}
+
+bool
+GPUDynInst::isScalarRegister(int operandIdx)
+{
+    return staticInst->isScalarRegister(operandIdx);
+}
+
+int
+GPUDynInst::getRegisterIndex(int operandIdx)
+{
+ return staticInst->getRegisterIndex(operandIdx);
+}
+
+int
+GPUDynInst::getOperandSize(int operandIdx)
+{
+ return staticInst->getOperandSize(operandIdx);
+}
+
+bool
+GPUDynInst::isDstOperand(int operandIdx)
+{
+ return staticInst->isDstOperand(operandIdx);
+}
+
+bool
+GPUDynInst::isSrcOperand(int operandIdx)
+{
+ return staticInst->isSrcOperand(operandIdx);
+}
+
+bool
+GPUDynInst::isArgLoad()
+{
+ return staticInst->isArgLoad();
+}
+
+const std::string&
+GPUDynInst::disassemble() const
+{
+ return staticInst->disassemble();
+}
+
+uint64_t
+GPUDynInst::seqNum() const
+{
+ return _seqNum;
+}
+
+Enums::OpType
+GPUDynInst::opType()
+{
+ return staticInst->o_type;
+}
+
+Enums::StorageClassType
+GPUDynInst::executedAs()
+{
+ return staticInst->executed_as;
+}
+
+// Process a memory instruction and (if necessary) submit timing request
+void
+GPUDynInst::initiateAcc(GPUDynInstPtr gpuDynInst)
+{
+ DPRINTF(GPUMem, "CU%d: WF[%d][%d]: mempacket status bitvector=%#x\n",
+ cu->cu_id, simdId, wfSlotId, exec_mask);
+
+ staticInst->initiateAcc(gpuDynInst);
+ time = 0;
+}
+
+bool
+GPUDynInst::scalarOp() const
+{
+ return staticInst->scalarOp();
+}
+
+void
+GPUDynInst::updateStats()
+{
+ if (staticInst->isLocalMem()) {
+ // access to LDS (shared) memory
+ cu->dynamicLMemInstrCnt++;
+ } else {
+ // access to global memory
+
+ // update PageDivergence histogram
+ int number_pages_touched = cu->pagesTouched.size();
+ assert(number_pages_touched);
+ cu->pageDivergenceDist.sample(number_pages_touched);
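+        // e.g., a fully coalesced access touches a single page, while a
+        // divergent wavefront can touch up to VSZ distinct pages (one per
+        // lane)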
+
+ std::pair<ComputeUnit::pageDataStruct::iterator, bool> ret;
+
+ for (auto it : cu->pagesTouched) {
+ // see if this page has been touched before. if not, this also
+ // inserts the page into the table.
+ ret = cu->pageAccesses
+ .insert(ComputeUnit::pageDataStruct::value_type(it.first,
+ std::make_pair(1, it.second)));
+
+ // if yes, then update the stats
+ if (!ret.second) {
+ ret.first->second.first++;
+ ret.first->second.second += it.second;
+ }
+ }
+
+ cu->pagesTouched.clear();
+
+        // Total number of memory instructions (dynamic).
+        // Atomics are counted as a single memory instruction.
+        // This counts memory instructions per wavefront, not per work-item.
+ cu->dynamicGMemInstrCnt++;
+ }
+}
diff --git a/src/gpu-compute/gpu_dyn_inst.hh b/src/gpu-compute/gpu_dyn_inst.hh
new file mode 100644
index 000000000..e44d8f80d
--- /dev/null
+++ b/src/gpu-compute/gpu_dyn_inst.hh
@@ -0,0 +1,464 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#ifndef __GPU_DYN_INST_HH__
+#define __GPU_DYN_INST_HH__
+
+#include <cstdint>
+#include <string>
+
+#include "enums/GenericMemoryOrder.hh"
+#include "enums/GenericMemoryScope.hh"
+#include "enums/MemOpType.hh"
+#include "enums/MemType.hh"
+#include "enums/OpType.hh"
+#include "enums/StorageClassType.hh"
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/gpu_exec_context.hh"
+
+class GPUStaticInst;
+
+template<typename T>
+class AtomicOpAnd : public TypedAtomicOpFunctor<T>
+{
+ public:
+ T a;
+
+ AtomicOpAnd(T _a) : a(_a) { }
+ void execute(T *b) { *b &= a; }
+};
+
+template<typename T>
+class AtomicOpOr : public TypedAtomicOpFunctor<T>
+{
+ public:
+ T a;
+ AtomicOpOr(T _a) : a(_a) { }
+ void execute(T *b) { *b |= a; }
+};
+
+template<typename T>
+class AtomicOpXor : public TypedAtomicOpFunctor<T>
+{
+ public:
+ T a;
+ AtomicOpXor(T _a) : a(_a) {}
+ void execute(T *b) { *b ^= a; }
+};
+
+template<typename T>
+class AtomicOpCAS : public TypedAtomicOpFunctor<T>
+{
+ public:
+ T c;
+ T s;
+
+ ComputeUnit *computeUnit;
+
+ AtomicOpCAS(T _c, T _s, ComputeUnit *compute_unit)
+ : c(_c), s(_s), computeUnit(compute_unit) { }
+
+ void
+ execute(T *b)
+ {
+ computeUnit->numCASOps++;
+
+ if (*b == c) {
+ *b = s;
+ } else {
+ computeUnit->numFailedCASOps++;
+ }
+
+ if (computeUnit->xact_cas_mode) {
+ computeUnit->xactCasLoadMap.clear();
+ }
+ }
+};
+
+template<typename T>
+class AtomicOpExch : public TypedAtomicOpFunctor<T>
+{
+ public:
+ T a;
+ AtomicOpExch(T _a) : a(_a) { }
+ void execute(T *b) { *b = a; }
+};
+
+template<typename T>
+class AtomicOpAdd : public TypedAtomicOpFunctor<T>
+{
+ public:
+ T a;
+ AtomicOpAdd(T _a) : a(_a) { }
+ void execute(T *b) { *b += a; }
+};
+
+template<typename T>
+class AtomicOpSub : public TypedAtomicOpFunctor<T>
+{
+ public:
+ T a;
+ AtomicOpSub(T _a) : a(_a) { }
+ void execute(T *b) { *b -= a; }
+};
+
+template<typename T>
+class AtomicOpInc : public TypedAtomicOpFunctor<T>
+{
+ public:
+ AtomicOpInc() { }
+ void execute(T *b) { *b += 1; }
+};
+
+template<typename T>
+class AtomicOpDec : public TypedAtomicOpFunctor<T>
+{
+ public:
+ AtomicOpDec() {}
+ void execute(T *b) { *b -= 1; }
+};
+
+template<typename T>
+class AtomicOpMax : public TypedAtomicOpFunctor<T>
+{
+ public:
+ T a;
+ AtomicOpMax(T _a) : a(_a) { }
+
+ void
+ execute(T *b)
+ {
+ if (a > *b)
+ *b = a;
+ }
+};
+
+template<typename T>
+class AtomicOpMin : public TypedAtomicOpFunctor<T>
+{
+ public:
+ T a;
+ AtomicOpMin(T _a) : a(_a) {}
+
+ void
+ execute(T *b)
+ {
+ if (a < *b)
+ *b = a;
+ }
+};
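+
+// Minimal usage sketch (illustration only): each functor above applies its
+// atomic operation in place, e.g.
+//     uint32_t mem = 10;
+//     AtomicOpAdd<uint32_t> add(5);
+//     add.execute(&mem); // mem is now 15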
+
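+// Range checks over the MemOpType enum: MO_A() matches the atomic
+// (with-return) ops, MO_ANR() the atomic no-return ops, and MO_H() the
+// MO_HAND..MO_HMIN range
+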
+#define MO_A(a) ((a)>=Enums::MO_AAND && (a)<=Enums::MO_AMIN)
+#define MO_ANR(a) ((a)>=Enums::MO_ANRAND && (a)<=Enums::MO_ANRMIN)
+#define MO_H(a) ((a)>=Enums::MO_HAND && (a)<=Enums::MO_HMIN)
+
+typedef enum
+{
+ VT_32,
+ VT_64,
+} vgpr_type;
+
+typedef enum
+{
+ SEG_PRIVATE,
+ SEG_SPILL,
+ SEG_GLOBAL,
+ SEG_SHARED,
+ SEG_READONLY,
+ SEG_FLAT
+} seg_type;
+
+class GPUDynInst : public GPUExecContext
+{
+ public:
+ GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *_staticInst,
+ uint64_t instSeqNum);
+
+ void execute();
+ int numSrcRegOperands();
+ int numDstRegOperands();
+ int getNumOperands();
+ bool isVectorRegister(int operandIdx);
+ bool isScalarRegister(int operandIdx);
+ int getRegisterIndex(int operandIdx);
+ int getOperandSize(int operandIdx);
+ bool isDstOperand(int operandIdx);
+ bool isSrcOperand(int operandIdx);
+ bool isArgLoad();
+
+ const std::string &disassemble() const;
+
+ uint64_t seqNum() const;
+
+ Enums::OpType opType();
+ Enums::StorageClassType executedAs();
+
+ // The address of the memory operation
+ Addr addr[VSZ];
+ Addr pAddr;
+
+    // The data to be written
+ uint8_t d_data[VSZ * 16];
+ // Additional data (for atomics)
+ uint8_t a_data[VSZ * 8];
+ // Additional data (for atomics)
+ uint8_t x_data[VSZ * 8];
+ // The execution mask
+ VectorMask exec_mask;
+
+ // The memory type (M_U32, M_S32, ...)
+ Enums::MemType m_type;
+ // The memory operation (MO_LD, MO_ST, ...)
+ Enums::MemOpType m_op;
+ Enums::GenericMemoryOrder memoryOrder;
+
+ // Scope of the request
+ Enums::GenericMemoryScope scope;
+ // The memory segment (SEG_SHARED, SEG_GLOBAL, ...)
+ seg_type s_type;
+ // The equivalency class
+ int equiv;
+ // The return VGPR type (VT_32 or VT_64)
+ vgpr_type v_type;
+ // Number of VGPR's accessed (1, 2, or 4)
+ int n_reg;
+ // The return VGPR index
+ int dst_reg;
+    // There can be at most 4 dest regs
+ int dst_reg_vec[4];
+ // SIMD where the WF of the memory instruction has been mapped to
+ int simdId;
+ // unique id of the WF where the memory instruction belongs to
+ int wfDynId;
+ // The kernel id of the requesting wf
+ int kern_id;
+ // The CU id of the requesting wf
+ int cu_id;
+ // HW slot id where the WF is mapped to inside a SIMD unit
+ int wfSlotId;
+ // execution pipeline id where the memory instruction has been scheduled
+ int pipeId;
+ // The execution time of this operation
+ Tick time;
+ // The latency of this operation
+ WaitClass latency;
+ // A list of bank conflicts for the 4 cycles.
+ uint32_t bc[4];
+
+ // A pointer to ROM
+ uint8_t *rom;
+ // The size of the READONLY segment
+ int sz_rom;
+
+ // Initiate the specified memory operation, by creating a
+ // memory request and sending it off to the memory system.
+ void initiateAcc(GPUDynInstPtr gpuDynInst);
+
+ void updateStats();
+
+ GPUStaticInst* staticInstruction() { return staticInst; }
+
+ // Is the instruction a scalar or vector op?
+ bool scalarOp() const;
+
+ /*
+ * Loads/stores/atomics may have acquire/release semantics associated
+     * with them. Some protocols want to see the acquire/release as separate
+ * requests from the load/store/atomic. We implement that separation
+ * using continuations (i.e., a function pointer with an object associated
+ * with it). When, for example, the front-end generates a store with
+ * release semantics, we will first issue a normal store and set the
+     * continuation in the GPUDynInst to a function that generates a
+ * release request. That continuation will be called when the normal
+ * store completes (in ComputeUnit::DataPort::recvTimingResponse). The
+ * continuation will be called in the context of the same GPUDynInst
+ * that generated the initial store.
+ */
+ std::function<void(GPUStaticInst*, GPUDynInstPtr)> execContinuation;
+
+ // when true, call execContinuation when response arrives
+ bool useContinuation;
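+    // A purely hypothetical sketch of how an issuing stage might arm the
+    // continuation (illustration only):
+    //     gpuDynInst->useContinuation = true;
+    //     gpuDynInst->execContinuation =
+    //         [](GPUStaticInst *si, GPUDynInstPtr di) { /* issue release */ };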
+
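+    // Builds the atomic functor that matches a MemOpType: e.g.,
+    // makeAtomicOpFunctor<uint32_t>(&a, &x, Enums::MO_AADD) yields an
+    // AtomicOpAdd<uint32_t>(a); reg1 is only consulted for the CAS variants,
+    // where it supplies the swap value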
+ template<typename c0> AtomicOpFunctor*
+ makeAtomicOpFunctor(c0 *reg0, c0 *reg1, Enums::MemOpType op)
+ {
+ using namespace Enums;
+
+ switch(op) {
+ case MO_AAND:
+ case MO_ANRAND:
+ return new AtomicOpAnd<c0>(*reg0);
+ case MO_AOR:
+ case MO_ANROR:
+ return new AtomicOpOr<c0>(*reg0);
+ case MO_AXOR:
+ case MO_ANRXOR:
+ return new AtomicOpXor<c0>(*reg0);
+ case MO_ACAS:
+ case MO_ANRCAS:
+ return new AtomicOpCAS<c0>(*reg0, *reg1, cu);
+ case MO_AEXCH:
+ case MO_ANREXCH:
+ return new AtomicOpExch<c0>(*reg0);
+ case MO_AADD:
+ case MO_ANRADD:
+ return new AtomicOpAdd<c0>(*reg0);
+ case MO_ASUB:
+ case MO_ANRSUB:
+ return new AtomicOpSub<c0>(*reg0);
+ case MO_AINC:
+ case MO_ANRINC:
+ return new AtomicOpInc<c0>();
+ case MO_ADEC:
+ case MO_ANRDEC:
+ return new AtomicOpDec<c0>();
+ case MO_AMAX:
+ case MO_ANRMAX:
+ return new AtomicOpMax<c0>(*reg0);
+ case MO_AMIN:
+ case MO_ANRMIN:
+ return new AtomicOpMin<c0>(*reg0);
+ default:
+ panic("Unrecognized atomic operation");
+ }
+ }
+
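+    // Translates this instruction's segment, scope, memory order, and
+    // atomic kind into Request flags; e.g., a device-scope acquire load
+    // from the global segment ends up with GLOBAL_SEGMENT, SCOPE_VALID |
+    // DEVICE_SCOPE, and ACQUIRE set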
+ void
+ setRequestFlags(Request *req, bool setMemOrder=true)
+ {
+ // currently these are the easy scopes to deduce
+ switch (s_type) {
+ case SEG_PRIVATE:
+ req->setMemSpaceConfigFlags(Request::PRIVATE_SEGMENT);
+ break;
+ case SEG_SPILL:
+ req->setMemSpaceConfigFlags(Request::SPILL_SEGMENT);
+ break;
+ case SEG_GLOBAL:
+ req->setMemSpaceConfigFlags(Request::GLOBAL_SEGMENT);
+ break;
+ case SEG_READONLY:
+ req->setMemSpaceConfigFlags(Request::READONLY_SEGMENT);
+ break;
+ case SEG_SHARED:
+ req->setMemSpaceConfigFlags(Request::GROUP_SEGMENT);
+ break;
+ case SEG_FLAT:
+ // TODO: translate to correct scope
+ assert(false);
+ default:
+ panic("Bad segment type");
+ break;
+ }
+
+ switch (scope) {
+ case Enums::MEMORY_SCOPE_NONE:
+ case Enums::MEMORY_SCOPE_WORKITEM:
+ break;
+ case Enums::MEMORY_SCOPE_WAVEFRONT:
+ req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
+ Request::WAVEFRONT_SCOPE);
+ break;
+ case Enums::MEMORY_SCOPE_WORKGROUP:
+ req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
+ Request::WORKGROUP_SCOPE);
+ break;
+ case Enums::MEMORY_SCOPE_DEVICE:
+ req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
+ Request::DEVICE_SCOPE);
+ break;
+ case Enums::MEMORY_SCOPE_SYSTEM:
+ req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
+ Request::SYSTEM_SCOPE);
+ break;
+ default:
+ panic("Bad scope type");
+ break;
+ }
+
+ if (setMemOrder) {
+ // set acquire and release flags
+ switch (memoryOrder){
+ case Enums::MEMORY_ORDER_SC_ACQUIRE:
+ req->setFlags(Request::ACQUIRE);
+ break;
+ case Enums::MEMORY_ORDER_SC_RELEASE:
+ req->setFlags(Request::RELEASE);
+ break;
+ case Enums::MEMORY_ORDER_SC_ACQUIRE_RELEASE:
+ req->setFlags(Request::ACQUIRE | Request::RELEASE);
+ break;
+ default:
+ break;
+ }
+ }
+
+ // set atomic type
+        // Currently, the instruction generator only produces atomic-return
+        // ops, but a magic instruction can produce atomic no-return ops
+ if (m_op == Enums::MO_AADD || m_op == Enums::MO_ASUB ||
+ m_op == Enums::MO_AAND || m_op == Enums::MO_AOR ||
+ m_op == Enums::MO_AXOR || m_op == Enums::MO_AMAX ||
+ m_op == Enums::MO_AMIN || m_op == Enums::MO_AINC ||
+ m_op == Enums::MO_ADEC || m_op == Enums::MO_AEXCH ||
+ m_op == Enums::MO_ACAS) {
+ req->setFlags(Request::ATOMIC_RETURN_OP);
+ } else if (m_op == Enums::MO_ANRADD || m_op == Enums::MO_ANRSUB ||
+ m_op == Enums::MO_ANRAND || m_op == Enums::MO_ANROR ||
+ m_op == Enums::MO_ANRXOR || m_op == Enums::MO_ANRMAX ||
+ m_op == Enums::MO_ANRMIN || m_op == Enums::MO_ANRINC ||
+ m_op == Enums::MO_ANRDEC || m_op == Enums::MO_ANREXCH ||
+ m_op == Enums::MO_ANRCAS) {
+ req->setFlags(Request::ATOMIC_NO_RETURN_OP);
+ }
+ }
+
+    // Maps the addresses satisfied by returned packets to the lanes
+    // that requested them
+ typedef std::unordered_map<Addr, std::vector<int>> StatusVector;
+ StatusVector memStatusVector;
+
+ // Track the status of memory requests per lane, a bit per lane
+ VectorMask statusBitVector;
+ // for ld_v# or st_v#
+ std::vector<int> statusVector;
+ std::vector<int> tlbHitLevel;
+
+ private:
+ GPUStaticInst *staticInst;
+ uint64_t _seqNum;
+};
+
+#endif // __GPU_DYN_INST_HH__
diff --git a/src/gpu-compute/gpu_exec_context.cc b/src/gpu-compute/gpu_exec_context.cc
new file mode 100644
index 000000000..4af69c41e
--- /dev/null
+++ b/src/gpu-compute/gpu_exec_context.cc
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#include "gpu-compute/gpu_exec_context.hh"
+
+GPUExecContext::GPUExecContext(ComputeUnit *_cu, Wavefront *_wf)
+ : cu(_cu), wf(_wf)
+{
+}
+
+ComputeUnit*
+GPUExecContext::computeUnit()
+{
+ return cu;
+}
+
+Wavefront*
+GPUExecContext::wavefront()
+{
+ return wf;
+}
diff --git a/src/gpu-compute/gpu_exec_context.hh b/src/gpu-compute/gpu_exec_context.hh
new file mode 100644
index 000000000..a3deb9b8f
--- /dev/null
+++ b/src/gpu-compute/gpu_exec_context.hh
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#ifndef __GPU_EXEC_CONTEXT_HH__
+#define __GPU_EXEC_CONTEXT_HH__
+
+class ComputeUnit;
+class Wavefront;
+
+class GPUExecContext
+{
+ public:
+ GPUExecContext(ComputeUnit *_cu, Wavefront *_wf);
+ Wavefront* wavefront();
+ ComputeUnit* computeUnit();
+
+ protected:
+ ComputeUnit *cu;
+ Wavefront *wf;
+};
+
+#endif // __GPU_EXEC_CONTEXT_HH__
diff --git a/src/gpu-compute/gpu_static_inst.cc b/src/gpu-compute/gpu_static_inst.cc
new file mode 100644
index 000000000..bcb8a5f3d
--- /dev/null
+++ b/src/gpu-compute/gpu_static_inst.cc
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#include "gpu-compute/gpu_static_inst.hh"
+
+GPUStaticInst::GPUStaticInst(const std::string &opcode)
+ : o_type(Enums::OT_ALU), executed_as(Enums::SC_NONE), opcode(opcode),
+ _instNum(0), _scalarOp(false)
+{
+}
diff --git a/src/gpu-compute/gpu_static_inst.hh b/src/gpu-compute/gpu_static_inst.hh
new file mode 100644
index 000000000..c1de28427
--- /dev/null
+++ b/src/gpu-compute/gpu_static_inst.hh
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#ifndef __GPU_STATIC_INST_HH__
+#define __GPU_STATIC_INST_HH__
+
+/*
+ * @file gpu_static_inst.hh
+ *
+ * Defines the base class representing static instructions for the GPU. The
+ * instructions are "static" because they contain no dynamic instruction
+ * information. GPUStaticInst corresponds to the StaticInst class for the CPU
+ * models.
+ */
+
+#include <cstdint>
+#include <string>
+
+#include "enums/OpType.hh"
+#include "enums/StorageClassType.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/misc.hh"
+
+class BaseOperand;
+class BaseRegOperand;
+class Wavefront;
+
+class GPUStaticInst
+{
+ public:
+ GPUStaticInst(const std::string &opcode);
+
+ void instNum(int num) { _instNum = num; }
+
+ int instNum() { return _instNum; }
+
+ void ipdInstNum(int num) { _ipdInstNum = num; }
+
+ int ipdInstNum() const { return _ipdInstNum; }
+
+ virtual void execute(GPUDynInstPtr gpuDynInst) = 0;
+ virtual void generateDisassembly() = 0;
+ virtual const std::string &disassemble() = 0;
+ virtual int getNumOperands() = 0;
+ virtual bool isCondRegister(int operandIndex) = 0;
+ virtual bool isScalarRegister(int operandIndex) = 0;
+ virtual bool isVectorRegister(int operandIndex) = 0;
+ virtual bool isSrcOperand(int operandIndex) = 0;
+ virtual bool isDstOperand(int operandIndex) = 0;
+ virtual int getOperandSize(int operandIndex) = 0;
+ virtual int getRegisterIndex(int operandIndex) = 0;
+ virtual int numDstRegOperands() = 0;
+ virtual int numSrcRegOperands() = 0;
+
+ /*
+ * Most instructions (including all HSAIL instructions)
+ * are vector ops, so _scalarOp will be false by default.
+ * Derived instruction objects that are scalar ops must
+ * set _scalarOp to true in their constructors.
+ */
+ bool scalarOp() const { return _scalarOp; }
+
+ virtual bool isLocalMem() const
+ {
+ fatal("calling isLocalMem() on non-memory instruction.\n");
+
+ return false;
+ }
+
+ bool isArgLoad() { return false; }
+ virtual uint32_t instSize() = 0;
+
+ // only used for memory instructions
+ virtual void
+ initiateAcc(GPUDynInstPtr gpuDynInst)
+ {
+ fatal("calling initiateAcc() on a non-memory instruction.\n");
+ }
+
+ virtual uint32_t getTargetPc() { return 0; }
+
+ /**
+ * Query whether the instruction is an unconditional jump i.e., the jump
+ * is always executed because there is no condition to be evaluated.
+ *
+ * If the instruction is not of branch type, the result is always false.
+ *
+ * @return True if the instruction is an unconditional jump.
+ */
+ virtual bool unconditionalJumpInstruction() { return false; }
+
+ static uint64_t dynamic_id_count;
+
+ Enums::OpType o_type;
+ // For flat memory accesses
+ Enums::StorageClassType executed_as;
+
+ protected:
+ virtual void
+ execLdAcq(GPUDynInstPtr gpuDynInst)
+ {
+ fatal("calling execLdAcq() on a non-load instruction.\n");
+ }
+
+ virtual void
+ execSt(GPUDynInstPtr gpuDynInst)
+ {
+ fatal("calling execLdAcq() on a non-load instruction.\n");
+ }
+
+ virtual void
+ execAtomic(GPUDynInstPtr gpuDynInst)
+ {
+ fatal("calling execAtomic() on a non-atomic instruction.\n");
+ }
+
+ virtual void
+ execAtomicAcq(GPUDynInstPtr gpuDynInst)
+ {
+ fatal("calling execAtomicAcq() on a non-atomic instruction.\n");
+ }
+
+ const std::string opcode;
+ std::string disassembly;
+ int _instNum;
+ /**
+ * Identifier of the immediate post-dominator instruction.
+ */
+ int _ipdInstNum;
+
+ bool _scalarOp;
+};
+
+#endif // __GPU_STATIC_INST_HH__
diff --git a/src/gpu-compute/gpu_tlb.cc b/src/gpu-compute/gpu_tlb.cc
new file mode 100644
index 000000000..de005fd04
--- /dev/null
+++ b/src/gpu-compute/gpu_tlb.cc
@@ -0,0 +1,1801 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Lisa Hsu
+ */
+
+#include "gpu-compute/gpu_tlb.hh"
+
+#include <cmath>
+#include <cstring>
+
+#include "arch/x86/faults.hh"
+#include "arch/x86/insts/microldstop.hh"
+#include "arch/x86/pagetable.hh"
+#include "arch/x86/pagetable_walker.hh"
+#include "arch/x86/regs/misc.hh"
+#include "arch/x86/x86_traits.hh"
+#include "base/bitfield.hh"
+#include "base/output.hh"
+#include "base/trace.hh"
+#include "cpu/base.hh"
+#include "cpu/thread_context.hh"
+#include "debug/GPUPrefetch.hh"
+#include "debug/GPUTLB.hh"
+#include "mem/packet_access.hh"
+#include "mem/page_table.hh"
+#include "mem/request.hh"
+#include "sim/process.hh"
+
+namespace X86ISA
+{
+
+ GpuTLB::GpuTLB(const Params *p)
+ : MemObject(p), configAddress(0), size(p->size),
+ cleanupEvent(this, false, Event::Maximum_Pri), exitEvent(this)
+ {
+ assoc = p->assoc;
+ assert(assoc <= size);
+ numSets = size/assoc;
+ allocationPolicy = p->allocationPolicy;
+ hasMemSidePort = false;
+ accessDistance = p->accessDistance;
+ clock = p->clk_domain->clockPeriod();
+
+ tlb = new GpuTlbEntry[size];
+ std::memset(tlb, 0, sizeof(GpuTlbEntry) * size);
+
+ freeList.resize(numSets);
+ entryList.resize(numSets);
+
+ for (int set = 0; set < numSets; ++set) {
+ for (int way = 0; way < assoc; ++way) {
+ int x = set*assoc + way;
+ freeList[set].push_back(&tlb[x]);
+ }
+ }
+
+ FA = (size == assoc);
+
+ /**
+ * @warning: the set-associative version assumes you have a
+ * fixed page size of 4KB.
+ * If the page size is greater than 4KB (as defined by
+ * TheISA::PageBytes), then there are various issues w/ the current
+ * implementation (you'd have the same 8KB page being replicated in
+ * different sets etc)
+ */
+ setMask = numSets - 1;
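+
+ // Worked example (numbers are illustrative, not defaults): with
+ // size = 64 entries and assoc = 16, numSets = 4 and setMask = 0x3,
+ // so for 4KB pages the set index is (vaddr >> 12) & 0x3. A
+ // fully-associative configuration (size == assoc) has a single set
+ // and setMask = 0.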
+
+ #if 0
+ // GpuTLB doesn't yet support full system
+ walker = p->walker;
+ walker->setTLB(this);
+ #endif
+
+ maxCoalescedReqs = p->maxOutstandingReqs;
+
+ // Do not allow maxCoalescedReqs to be more than the TLB associativity
+ if (maxCoalescedReqs > assoc) {
+ maxCoalescedReqs = assoc;
+ cprintf("Forcing maxCoalescedReqs to %d (TLB assoc.) \n", assoc);
+ }
+
+ outstandingReqs = 0;
+ hitLatency = p->hitLatency;
+ missLatency1 = p->missLatency1;
+ missLatency2 = p->missLatency2;
+
+ // create the slave ports based on the number of connected ports
+ for (size_t i = 0; i < p->port_slave_connection_count; ++i) {
+ cpuSidePort.push_back(new CpuSidePort(csprintf("%s-port%d",
+ name(), i), this, i));
+ }
+
+ // create the master ports based on the number of connected ports
+ for (size_t i = 0; i < p->port_master_connection_count; ++i) {
+ memSidePort.push_back(new MemSidePort(csprintf("%s-port%d",
+ name(), i), this, i));
+ }
+ }
+
+ // fixme: this is never called?
+ GpuTLB::~GpuTLB()
+ {
+ // make sure all the hash-maps are empty
+ assert(translationReturnEvent.empty());
+
+ // delete the TLB
+ delete[] tlb;
+ }
+
+ BaseSlavePort&
+ GpuTLB::getSlavePort(const std::string &if_name, PortID idx)
+ {
+ if (if_name == "slave") {
+ if (idx >= static_cast<PortID>(cpuSidePort.size())) {
+ panic("TLBCoalescer::getSlavePort: unknown index %d\n", idx);
+ }
+
+ return *cpuSidePort[idx];
+ } else {
+ panic("TLBCoalescer::getSlavePort: unknown port %s\n", if_name);
+ }
+ }
+
+ BaseMasterPort&
+ GpuTLB::getMasterPort(const std::string &if_name, PortID idx)
+ {
+ if (if_name == "master") {
+ if (idx >= static_cast<PortID>(memSidePort.size())) {
+ panic("TLBCoalescer::getMasterPort: unknown index %d\n", idx);
+ }
+
+ hasMemSidePort = true;
+
+ return *memSidePort[idx];
+ } else {
+ panic("TLBCoalescer::getMasterPort: unknown port %s\n", if_name);
+ }
+ }
+
+ GpuTlbEntry*
+ GpuTLB::insert(Addr vpn, GpuTlbEntry &entry)
+ {
+ GpuTlbEntry *newEntry = nullptr;
+
+ /**
+ * vpn holds the virtual page address
+ * The least significant bits are simply masked
+ */
+ int set = (vpn >> TheISA::PageShift) & setMask;
+
+ if (!freeList[set].empty()) {
+ newEntry = freeList[set].front();
+ freeList[set].pop_front();
+ } else {
+ newEntry = entryList[set].back();
+ entryList[set].pop_back();
+ }
+
+ *newEntry = entry;
+ newEntry->vaddr = vpn;
+ entryList[set].push_front(newEntry);
+
+ return newEntry;
+ }
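+
+ // Replacement note: entryList is a per-set LRU stack. insert() takes
+ // its victim from the back of entryList when the set's freeList is
+ // empty, and lookupIt() moves a matching entry to the front on a hit
+ // (when update_lru is true), so the back of each list is always the
+ // least-recently-used entry of that set.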
+
+ GpuTLB::EntryList::iterator
+ GpuTLB::lookupIt(Addr va, bool update_lru)
+ {
+ int set = (va >> TheISA::PageShift) & setMask;
+
+ if (FA) {
+ assert(!set);
+ }
+
+ auto entry = entryList[set].begin();
+ for (; entry != entryList[set].end(); ++entry) {
+ int page_size = (*entry)->size();
+
+ if ((*entry)->vaddr <= va && (*entry)->vaddr + page_size > va) {
+ DPRINTF(GPUTLB, "Matched vaddr %#x to entry starting at %#x "
+ "with size %#x.\n", va, (*entry)->vaddr, page_size);
+
+ if (update_lru) {
+ entryList[set].push_front(*entry);
+ entryList[set].erase(entry);
+ entry = entryList[set].begin();
+ }
+
+ break;
+ }
+ }
+
+ return entry;
+ }
+
+ GpuTlbEntry*
+ GpuTLB::lookup(Addr va, bool update_lru)
+ {
+ int set = (va >> TheISA::PageShift) & setMask;
+
+ auto entry = lookupIt(va, update_lru);
+
+ if (entry == entryList[set].end())
+ return nullptr;
+ else
+ return *entry;
+ }
+
+ void
+ GpuTLB::invalidateAll()
+ {
+ DPRINTF(GPUTLB, "Invalidating all entries.\n");
+
+ for (int i = 0; i < numSets; ++i) {
+ while (!entryList[i].empty()) {
+ GpuTlbEntry *entry = entryList[i].front();
+ entryList[i].pop_front();
+ freeList[i].push_back(entry);
+ }
+ }
+ }
+
+ void
+ GpuTLB::setConfigAddress(uint32_t addr)
+ {
+ configAddress = addr;
+ }
+
+ void
+ GpuTLB::invalidateNonGlobal()
+ {
+ DPRINTF(GPUTLB, "Invalidating all non global entries.\n");
+
+ for (int i = 0; i < numSets; ++i) {
+ for (auto entryIt = entryList[i].begin();
+ entryIt != entryList[i].end();) {
+ if (!(*entryIt)->global) {
+ freeList[i].push_back(*entryIt);
+ entryList[i].erase(entryIt++);
+ } else {
+ ++entryIt;
+ }
+ }
+ }
+ }
+
+ void
+ GpuTLB::demapPage(Addr va, uint64_t asn)
+ {
+
+ int set = (va >> TheISA::PageShift) & setMask;
+ auto entry = lookupIt(va, false);
+
+ if (entry != entryList[set].end()) {
+ freeList[set].push_back(*entry);
+ entryList[set].erase(entry);
+ }
+ }
+
+ Fault
+ GpuTLB::translateInt(RequestPtr req, ThreadContext *tc)
+ {
+ DPRINTF(GPUTLB, "Addresses references internal memory.\n");
+ Addr vaddr = req->getVaddr();
+ Addr prefix = (vaddr >> 3) & IntAddrPrefixMask;
+
+ if (prefix == IntAddrPrefixCPUID) {
+ panic("CPUID memory space not yet implemented!\n");
+ } else if (prefix == IntAddrPrefixMSR) {
+ vaddr = vaddr >> 3;
+ req->setFlags(Request::MMAPPED_IPR);
+ Addr regNum = 0;
+
+ switch (vaddr & ~IntAddrPrefixMask) {
+ case 0x10:
+ regNum = MISCREG_TSC;
+ break;
+ case 0x1B:
+ regNum = MISCREG_APIC_BASE;
+ break;
+ case 0xFE:
+ regNum = MISCREG_MTRRCAP;
+ break;
+ case 0x174:
+ regNum = MISCREG_SYSENTER_CS;
+ break;
+ case 0x175:
+ regNum = MISCREG_SYSENTER_ESP;
+ break;
+ case 0x176:
+ regNum = MISCREG_SYSENTER_EIP;
+ break;
+ case 0x179:
+ regNum = MISCREG_MCG_CAP;
+ break;
+ case 0x17A:
+ regNum = MISCREG_MCG_STATUS;
+ break;
+ case 0x17B:
+ regNum = MISCREG_MCG_CTL;
+ break;
+ case 0x1D9:
+ regNum = MISCREG_DEBUG_CTL_MSR;
+ break;
+ case 0x1DB:
+ regNum = MISCREG_LAST_BRANCH_FROM_IP;
+ break;
+ case 0x1DC:
+ regNum = MISCREG_LAST_BRANCH_TO_IP;
+ break;
+ case 0x1DD:
+ regNum = MISCREG_LAST_EXCEPTION_FROM_IP;
+ break;
+ case 0x1DE:
+ regNum = MISCREG_LAST_EXCEPTION_TO_IP;
+ break;
+ case 0x200:
+ regNum = MISCREG_MTRR_PHYS_BASE_0;
+ break;
+ case 0x201:
+ regNum = MISCREG_MTRR_PHYS_MASK_0;
+ break;
+ case 0x202:
+ regNum = MISCREG_MTRR_PHYS_BASE_1;
+ break;
+ case 0x203:
+ regNum = MISCREG_MTRR_PHYS_MASK_1;
+ break;
+ case 0x204:
+ regNum = MISCREG_MTRR_PHYS_BASE_2;
+ break;
+ case 0x205:
+ regNum = MISCREG_MTRR_PHYS_MASK_2;
+ break;
+ case 0x206:
+ regNum = MISCREG_MTRR_PHYS_BASE_3;
+ break;
+ case 0x207:
+ regNum = MISCREG_MTRR_PHYS_MASK_3;
+ break;
+ case 0x208:
+ regNum = MISCREG_MTRR_PHYS_BASE_4;
+ break;
+ case 0x209:
+ regNum = MISCREG_MTRR_PHYS_MASK_4;
+ break;
+ case 0x20A:
+ regNum = MISCREG_MTRR_PHYS_BASE_5;
+ break;
+ case 0x20B:
+ regNum = MISCREG_MTRR_PHYS_MASK_5;
+ break;
+ case 0x20C:
+ regNum = MISCREG_MTRR_PHYS_BASE_6;
+ break;
+ case 0x20D:
+ regNum = MISCREG_MTRR_PHYS_MASK_6;
+ break;
+ case 0x20E:
+ regNum = MISCREG_MTRR_PHYS_BASE_7;
+ break;
+ case 0x20F:
+ regNum = MISCREG_MTRR_PHYS_MASK_7;
+ break;
+ case 0x250:
+ regNum = MISCREG_MTRR_FIX_64K_00000;
+ break;
+ case 0x258:
+ regNum = MISCREG_MTRR_FIX_16K_80000;
+ break;
+ case 0x259:
+ regNum = MISCREG_MTRR_FIX_16K_A0000;
+ break;
+ case 0x268:
+ regNum = MISCREG_MTRR_FIX_4K_C0000;
+ break;
+ case 0x269:
+ regNum = MISCREG_MTRR_FIX_4K_C8000;
+ break;
+ case 0x26A:
+ regNum = MISCREG_MTRR_FIX_4K_D0000;
+ break;
+ case 0x26B:
+ regNum = MISCREG_MTRR_FIX_4K_D8000;
+ break;
+ case 0x26C:
+ regNum = MISCREG_MTRR_FIX_4K_E0000;
+ break;
+ case 0x26D:
+ regNum = MISCREG_MTRR_FIX_4K_E8000;
+ break;
+ case 0x26E:
+ regNum = MISCREG_MTRR_FIX_4K_F0000;
+ break;
+ case 0x26F:
+ regNum = MISCREG_MTRR_FIX_4K_F8000;
+ break;
+ case 0x277:
+ regNum = MISCREG_PAT;
+ break;
+ case 0x2FF:
+ regNum = MISCREG_DEF_TYPE;
+ break;
+ case 0x400:
+ regNum = MISCREG_MC0_CTL;
+ break;
+ case 0x404:
+ regNum = MISCREG_MC1_CTL;
+ break;
+ case 0x408:
+ regNum = MISCREG_MC2_CTL;
+ break;
+ case 0x40C:
+ regNum = MISCREG_MC3_CTL;
+ break;
+ case 0x410:
+ regNum = MISCREG_MC4_CTL;
+ break;
+ case 0x414:
+ regNum = MISCREG_MC5_CTL;
+ break;
+ case 0x418:
+ regNum = MISCREG_MC6_CTL;
+ break;
+ case 0x41C:
+ regNum = MISCREG_MC7_CTL;
+ break;
+ case 0x401:
+ regNum = MISCREG_MC0_STATUS;
+ break;
+ case 0x405:
+ regNum = MISCREG_MC1_STATUS;
+ break;
+ case 0x409:
+ regNum = MISCREG_MC2_STATUS;
+ break;
+ case 0x40D:
+ regNum = MISCREG_MC3_STATUS;
+ break;
+ case 0x411:
+ regNum = MISCREG_MC4_STATUS;
+ break;
+ case 0x415:
+ regNum = MISCREG_MC5_STATUS;
+ break;
+ case 0x419:
+ regNum = MISCREG_MC6_STATUS;
+ break;
+ case 0x41D:
+ regNum = MISCREG_MC7_STATUS;
+ break;
+ case 0x402:
+ regNum = MISCREG_MC0_ADDR;
+ break;
+ case 0x406:
+ regNum = MISCREG_MC1_ADDR;
+ break;
+ case 0x40A:
+ regNum = MISCREG_MC2_ADDR;
+ break;
+ case 0x40E:
+ regNum = MISCREG_MC3_ADDR;
+ break;
+ case 0x412:
+ regNum = MISCREG_MC4_ADDR;
+ break;
+ case 0x416:
+ regNum = MISCREG_MC5_ADDR;
+ break;
+ case 0x41A:
+ regNum = MISCREG_MC6_ADDR;
+ break;
+ case 0x41E:
+ regNum = MISCREG_MC7_ADDR;
+ break;
+ case 0x403:
+ regNum = MISCREG_MC0_MISC;
+ break;
+ case 0x407:
+ regNum = MISCREG_MC1_MISC;
+ break;
+ case 0x40B:
+ regNum = MISCREG_MC2_MISC;
+ break;
+ case 0x40F:
+ regNum = MISCREG_MC3_MISC;
+ break;
+ case 0x413:
+ regNum = MISCREG_MC4_MISC;
+ break;
+ case 0x417:
+ regNum = MISCREG_MC5_MISC;
+ break;
+ case 0x41B:
+ regNum = MISCREG_MC6_MISC;
+ break;
+ case 0x41F:
+ regNum = MISCREG_MC7_MISC;
+ break;
+ case 0xC0000080:
+ regNum = MISCREG_EFER;
+ break;
+ case 0xC0000081:
+ regNum = MISCREG_STAR;
+ break;
+ case 0xC0000082:
+ regNum = MISCREG_LSTAR;
+ break;
+ case 0xC0000083:
+ regNum = MISCREG_CSTAR;
+ break;
+ case 0xC0000084:
+ regNum = MISCREG_SF_MASK;
+ break;
+ case 0xC0000100:
+ regNum = MISCREG_FS_BASE;
+ break;
+ case 0xC0000101:
+ regNum = MISCREG_GS_BASE;
+ break;
+ case 0xC0000102:
+ regNum = MISCREG_KERNEL_GS_BASE;
+ break;
+ case 0xC0000103:
+ regNum = MISCREG_TSC_AUX;
+ break;
+ case 0xC0010000:
+ regNum = MISCREG_PERF_EVT_SEL0;
+ break;
+ case 0xC0010001:
+ regNum = MISCREG_PERF_EVT_SEL1;
+ break;
+ case 0xC0010002:
+ regNum = MISCREG_PERF_EVT_SEL2;
+ break;
+ case 0xC0010003:
+ regNum = MISCREG_PERF_EVT_SEL3;
+ break;
+ case 0xC0010004:
+ regNum = MISCREG_PERF_EVT_CTR0;
+ break;
+ case 0xC0010005:
+ regNum = MISCREG_PERF_EVT_CTR1;
+ break;
+ case 0xC0010006:
+ regNum = MISCREG_PERF_EVT_CTR2;
+ break;
+ case 0xC0010007:
+ regNum = MISCREG_PERF_EVT_CTR3;
+ break;
+ case 0xC0010010:
+ regNum = MISCREG_SYSCFG;
+ break;
+ case 0xC0010016:
+ regNum = MISCREG_IORR_BASE0;
+ break;
+ case 0xC0010017:
+ regNum = MISCREG_IORR_BASE1;
+ break;
+ case 0xC0010018:
+ regNum = MISCREG_IORR_MASK0;
+ break;
+ case 0xC0010019:
+ regNum = MISCREG_IORR_MASK1;
+ break;
+ case 0xC001001A:
+ regNum = MISCREG_TOP_MEM;
+ break;
+ case 0xC001001D:
+ regNum = MISCREG_TOP_MEM2;
+ break;
+ case 0xC0010114:
+ regNum = MISCREG_VM_CR;
+ break;
+ case 0xC0010115:
+ regNum = MISCREG_IGNNE;
+ break;
+ case 0xC0010116:
+ regNum = MISCREG_SMM_CTL;
+ break;
+ case 0xC0010117:
+ regNum = MISCREG_VM_HSAVE_PA;
+ break;
+ default:
+ return std::make_shared<GeneralProtection>(0);
+ }
+ //The index is multiplied by the size of a MiscReg so that
+ //any memory dependence calculations will not see these as
+ //overlapping.
+ req->setPaddr(regNum * sizeof(MiscReg));
+ return NoFault;
+ } else if (prefix == IntAddrPrefixIO) {
+ // TODO If CPL > IOPL or in virtual mode, check the I/O permission
+ // bitmap in the TSS.
+
+ Addr IOPort = vaddr & ~IntAddrPrefixMask;
+ // Make sure the address fits in the expected 16 bit IO address
+ // space.
+ assert(!(IOPort & ~0xFFFF));
+
+ if (IOPort == 0xCF8 && req->getSize() == 4) {
+ req->setFlags(Request::MMAPPED_IPR);
+ req->setPaddr(MISCREG_PCI_CONFIG_ADDRESS * sizeof(MiscReg));
+ } else if ((IOPort & ~mask(2)) == 0xCFC) {
+ req->setFlags(Request::UNCACHEABLE);
+
+ Addr configAddress =
+ tc->readMiscRegNoEffect(MISCREG_PCI_CONFIG_ADDRESS);
+
+ if (bits(configAddress, 31, 31)) {
+ req->setPaddr(PhysAddrPrefixPciConfig |
+ mbits(configAddress, 30, 2) |
+ (IOPort & mask(2)));
+ } else {
+ req->setPaddr(PhysAddrPrefixIO | IOPort);
+ }
+ } else {
+ req->setFlags(Request::UNCACHEABLE);
+ req->setPaddr(PhysAddrPrefixIO | IOPort);
+ }
+ return NoFault;
+ } else {
+ panic("Access to unrecognized internal address space %#x.\n",
+ prefix);
+ }
+ }
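+
+ // Note on the I/O decoding above: port 0xCF8 is the standard PCI
+ // CONFIG_ADDRESS register and ports 0xCFC-0xCFF are CONFIG_DATA
+ // (PCI configuration mechanism #1). When bit 31 of CONFIG_ADDRESS is
+ // set, accesses to CONFIG_DATA are redirected to PCI configuration
+ // space; otherwise they are treated as ordinary I/O-space accesses.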
+
+ /**
+ * tlbLookup only performs a TLB lookup, returning true on a TLB hit
+ * and false on a TLB miss.
+ * Many of the checks about different modes have been converted to
+ * assertions, since these parts of the code are not really used.
+ * On a hit it will update the LRU stack.
+ */
+ bool
+ GpuTLB::tlbLookup(RequestPtr req, ThreadContext *tc, bool update_stats)
+ {
+ bool tlb_hit = false;
+ #ifndef NDEBUG
+ uint32_t flags = req->getFlags();
+ int seg = flags & SegmentFlagMask;
+ #endif
+
+ assert(seg != SEGMENT_REG_MS);
+ Addr vaddr = req->getVaddr();
+ DPRINTF(GPUTLB, "TLB Lookup for vaddr %#x.\n", vaddr);
+ HandyM5Reg m5Reg = tc->readMiscRegNoEffect(MISCREG_M5_REG);
+
+ if (m5Reg.prot) {
+ DPRINTF(GPUTLB, "In protected mode.\n");
+ // make sure we are in 64-bit mode
+ assert(m5Reg.mode == LongMode);
+
+ // If paging is enabled, do the translation.
+ if (m5Reg.paging) {
+ DPRINTF(GPUTLB, "Paging enabled.\n");
+ //update LRU stack on a hit
+ GpuTlbEntry *entry = lookup(vaddr, true);
+
+ if (entry)
+ tlb_hit = true;
+
+ if (!update_stats) {
+ // functional tlb access for memory initialization
+ // i.e., memory seeding or instr. seeding -> don't update
+ // TLB and stats
+ return tlb_hit;
+ }
+
+ localNumTLBAccesses++;
+
+ if (!entry) {
+ localNumTLBMisses++;
+ } else {
+ localNumTLBHits++;
+ }
+ }
+ }
+
+ return tlb_hit;
+ }
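+
+ // tlbLookup() is shared by the timing path (issueTLBLookup) and the
+ // functional path (CpuSidePort::recvFunctional); update_stats is false
+ // for prefetches and memory-seeding accesses so that they do not
+ // perturb the TLB statistics.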
+
+ Fault
+ GpuTLB::translate(RequestPtr req, ThreadContext *tc,
+ Translation *translation, Mode mode,
+ bool &delayedResponse, bool timing, int &latency)
+ {
+ uint32_t flags = req->getFlags();
+ int seg = flags & SegmentFlagMask;
+ bool storeCheck = flags & (StoreCheck << FlagShift);
+
+ // If this is true, we're dealing with a request
+ // to a non-memory address space.
+ if (seg == SEGMENT_REG_MS) {
+ return translateInt(req, tc);
+ }
+
+ delayedResponse = false;
+ Addr vaddr = req->getVaddr();
+ DPRINTF(GPUTLB, "Translating vaddr %#x.\n", vaddr);
+
+ HandyM5Reg m5Reg = tc->readMiscRegNoEffect(MISCREG_M5_REG);
+
+ // If protected mode has been enabled...
+ if (m5Reg.prot) {
+ DPRINTF(GPUTLB, "In protected mode.\n");
+ // If we're not in 64-bit mode, do protection/limit checks
+ if (m5Reg.mode != LongMode) {
+ DPRINTF(GPUTLB, "Not in long mode. Checking segment "
+ "protection.\n");
+
+ // Check for a null segment selector.
+ if (!(seg == SEGMENT_REG_TSG || seg == SYS_SEGMENT_REG_IDTR ||
+ seg == SEGMENT_REG_HS || seg == SEGMENT_REG_LS)
+ && !tc->readMiscRegNoEffect(MISCREG_SEG_SEL(seg))) {
+ return std::make_shared<GeneralProtection>(0);
+ }
+
+ bool expandDown = false;
+ SegAttr attr = tc->readMiscRegNoEffect(MISCREG_SEG_ATTR(seg));
+
+ if (seg >= SEGMENT_REG_ES && seg <= SEGMENT_REG_HS) {
+ if (!attr.writable && (mode == BaseTLB::Write ||
+ storeCheck))
+ return std::make_shared<GeneralProtection>(0);
+
+ if (!attr.readable && mode == BaseTLB::Read)
+ return std::make_shared<GeneralProtection>(0);
+
+ expandDown = attr.expandDown;
+
+ }
+
+ Addr base = tc->readMiscRegNoEffect(MISCREG_SEG_BASE(seg));
+ Addr limit = tc->readMiscRegNoEffect(MISCREG_SEG_LIMIT(seg));
+ // This assumes we're not in 64 bit mode. If we were, the
+ // default address size is 64 bits, overridable to 32.
+ int size = 32;
+ bool sizeOverride = (flags & (AddrSizeFlagBit << FlagShift));
+ SegAttr csAttr = tc->readMiscRegNoEffect(MISCREG_CS_ATTR);
+
+ if ((csAttr.defaultSize && sizeOverride) ||
+ (!csAttr.defaultSize && !sizeOverride)) {
+ size = 16;
+ }
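+
+ // In other words, the address-size override prefix toggles the
+ // effective address size relative to the code segment's default
+ // size: a 32-bit default with an override, or a 16-bit default
+ // without one, yields 16-bit addressing; otherwise 32-bit
+ // addressing is used.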
+
+ Addr offset = bits(vaddr - base, size - 1, 0);
+ Addr endOffset = offset + req->getSize() - 1;
+
+ if (expandDown) {
+ DPRINTF(GPUTLB, "Checking an expand down segment.\n");
+ warn_once("Expand down segments are untested.\n");
+
+ if (offset <= limit || endOffset <= limit)
+ return std::make_shared<GeneralProtection>(0);
+ } else {
+ if (offset > limit || endOffset > limit)
+ return std::make_shared<GeneralProtection>(0);
+ }
+ }
+
+ // If paging is enabled, do the translation.
+ if (m5Reg.paging) {
+ DPRINTF(GPUTLB, "Paging enabled.\n");
+ // The vaddr already has the segment base applied.
+ GpuTlbEntry *entry = lookup(vaddr);
+ localNumTLBAccesses++;
+
+ if (!entry) {
+ localNumTLBMisses++;
+ if (timing) {
+ latency = missLatency1;
+ }
+
+ if (FullSystem) {
+ fatal("GpuTLB doesn't support full-system mode\n");
+ } else {
+ DPRINTF(GPUTLB, "Handling a TLB miss for address %#x "
+ "at pc %#x.\n", vaddr, tc->instAddr());
+
+ Process *p = tc->getProcessPtr();
+ GpuTlbEntry newEntry;
+ bool success = p->pTable->lookup(vaddr, newEntry);
+
+ if (!success && mode != BaseTLB::Execute) {
+ // penalize a "page fault" more
+ if (timing) {
+ latency += missLatency2;
+ }
+
+ if (p->fixupStackFault(vaddr))
+ success = p->pTable->lookup(vaddr, newEntry);
+ }
+
+ if (!success) {
+ return std::make_shared<PageFault>(vaddr, true,
+ mode, true,
+ false);
+ } else {
+ newEntry.valid = success;
+ Addr alignedVaddr = p->pTable->pageAlign(vaddr);
+
+ DPRINTF(GPUTLB, "Mapping %#x to %#x\n",
+ alignedVaddr, newEntry.pageStart());
+
+ entry = insert(alignedVaddr, newEntry);
+ }
+
+ DPRINTF(GPUTLB, "Miss was serviced.\n");
+ }
+ } else {
+ localNumTLBHits++;
+
+ if (timing) {
+ latency = hitLatency;
+ }
+ }
+
+ // Do paging protection checks.
+ bool inUser = (m5Reg.cpl == 3 &&
+ !(flags & (CPL0FlagBit << FlagShift)));
+
+ CR0 cr0 = tc->readMiscRegNoEffect(MISCREG_CR0);
+ bool badWrite = (!entry->writable && (inUser || cr0.wp));
+
+ if ((inUser && !entry->user) || (mode == BaseTLB::Write &&
+ badWrite)) {
+ // The page must have been present to get into the TLB in
+ // the first place. We'll assume the reserved bits are
+ // fine even though we're not checking them.
+ return std::make_shared<PageFault>(vaddr, true, mode,
+ inUser, false);
+ }
+
+ if (storeCheck && badWrite) {
+ // This would fault if this were a write, so return a page
+ // fault that reflects that happening.
+ return std::make_shared<PageFault>(vaddr, true,
+ BaseTLB::Write,
+ inUser, false);
+ }
+
+
+ DPRINTF(GPUTLB, "Entry found with paddr %#x, doing protection "
+ "checks.\n", entry->paddr);
+
+ int page_size = entry->size();
+ Addr paddr = entry->paddr | (vaddr & (page_size - 1));
+ DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, paddr);
+ req->setPaddr(paddr);
+
+ if (entry->uncacheable)
+ req->setFlags(Request::UNCACHEABLE);
+ } else {
+ //Use the address which already has segmentation applied.
+ DPRINTF(GPUTLB, "Paging disabled.\n");
+ DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, vaddr);
+ req->setPaddr(vaddr);
+ }
+ } else {
+ // Real mode
+ DPRINTF(GPUTLB, "In real mode.\n");
+ DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, vaddr);
+ req->setPaddr(vaddr);
+ }
+
+ // Check for an access to the local APIC
+ if (FullSystem) {
+ LocalApicBase localApicBase =
+ tc->readMiscRegNoEffect(MISCREG_APIC_BASE);
+
+ Addr baseAddr = localApicBase.base * PageBytes;
+ Addr paddr = req->getPaddr();
+
+ if (baseAddr <= paddr && baseAddr + PageBytes > paddr) {
+ // Force the access to be uncacheable.
+ req->setFlags(Request::UNCACHEABLE);
+ req->setPaddr(x86LocalAPICAddress(tc->contextId(),
+ paddr - baseAddr));
+ }
+ }
+
+ return NoFault;
+ }
+
+ Fault
+ GpuTLB::translateAtomic(RequestPtr req, ThreadContext *tc, Mode mode,
+ int &latency)
+ {
+ bool delayedResponse;
+
+ return GpuTLB::translate(req, tc, nullptr, mode, delayedResponse, false,
+ latency);
+ }
+
+ void
+ GpuTLB::translateTiming(RequestPtr req, ThreadContext *tc,
+ Translation *translation, Mode mode, int &latency)
+ {
+ bool delayedResponse;
+ assert(translation);
+
+ Fault fault = GpuTLB::translate(req, tc, translation, mode,
+ delayedResponse, true, latency);
+
+ if (!delayedResponse)
+ translation->finish(fault, req, tc, mode);
+ }
+
+ Walker*
+ GpuTLB::getWalker()
+ {
+ return walker;
+ }
+
+
+ void
+ GpuTLB::serialize(CheckpointOut &cp) const
+ {
+ }
+
+ void
+ GpuTLB::unserialize(CheckpointIn &cp)
+ {
+ }
+
+ void
+ GpuTLB::regStats()
+ {
+ localNumTLBAccesses
+ .name(name() + ".local_TLB_accesses")
+ .desc("Number of TLB accesses")
+ ;
+
+ localNumTLBHits
+ .name(name() + ".local_TLB_hits")
+ .desc("Number of TLB hits")
+ ;
+
+ localNumTLBMisses
+ .name(name() + ".local_TLB_misses")
+ .desc("Number of TLB misses")
+ ;
+
+ localTLBMissRate
+ .name(name() + ".local_TLB_miss_rate")
+ .desc("TLB miss rate")
+ ;
+
+ accessCycles
+ .name(name() + ".access_cycles")
+ .desc("Cycles spent accessing this TLB level")
+ ;
+
+ pageTableCycles
+ .name(name() + ".page_table_cycles")
+ .desc("Cycles spent accessing the page table")
+ ;
+
+ localTLBMissRate = 100 * localNumTLBMisses / localNumTLBAccesses;
+
+ numUniquePages
+ .name(name() + ".unique_pages")
+ .desc("Number of unique pages touched")
+ ;
+
+ localCycles
+ .name(name() + ".local_cycles")
+ .desc("Number of cycles spent in queue for all incoming reqs")
+ ;
+
+ localLatency
+ .name(name() + ".local_latency")
+ .desc("Avg. latency over incoming coalesced reqs")
+ ;
+
+ localLatency = localCycles / localNumTLBAccesses;
+
+ globalNumTLBAccesses
+ .name(name() + ".global_TLB_accesses")
+ .desc("Number of TLB accesses")
+ ;
+
+ globalNumTLBHits
+ .name(name() + ".global_TLB_hits")
+ .desc("Number of TLB hits")
+ ;
+
+ globalNumTLBMisses
+ .name(name() + ".global_TLB_misses")
+ .desc("Number of TLB misses")
+ ;
+
+ globalTLBMissRate
+ .name(name() + ".global_TLB_miss_rate")
+ .desc("TLB miss rate")
+ ;
+
+ globalTLBMissRate = 100 * globalNumTLBMisses / globalNumTLBAccesses;
+
+ avgReuseDistance
+ .name(name() + ".avg_reuse_distance")
+ .desc("avg. reuse distance over all pages (in ticks)")
+ ;
+
+ }
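+
+ // Note: localTLBMissRate, localLatency, and globalTLBMissRate are
+ // Stats::Formula objects, so the expressions assigned above are
+ // evaluated when statistics are dumped rather than at registration
+ // time.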
+
+ /**
+ * Do the TLB lookup for this coalesced request and schedule
+ * another event <TLB access latency> cycles later.
+ */
+
+ void
+ GpuTLB::issueTLBLookup(PacketPtr pkt)
+ {
+ assert(pkt);
+ assert(pkt->senderState);
+
+ Addr virt_page_addr = roundDown(pkt->req->getVaddr(),
+ TheISA::PageBytes);
+
+ TranslationState *sender_state =
+ safe_cast<TranslationState*>(pkt->senderState);
+
+ bool update_stats = !sender_state->prefetch;
+ ThreadContext * tmp_tc = sender_state->tc;
+
+ DPRINTF(GPUTLB, "Translation req. for virt. page addr %#x\n",
+ virt_page_addr);
+
+ int req_cnt = sender_state->reqCnt.back();
+
+ if (update_stats) {
+ accessCycles -= (curTick() * req_cnt);
+ localCycles -= curTick();
+ updatePageFootprint(virt_page_addr);
+ globalNumTLBAccesses += req_cnt;
+ }
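+
+ // Note: accessCycles and localCycles are charged negatively with
+ // curTick() here at issue time (scaled by req_cnt for accessCycles)
+ // and credited with curTick() again when the translation completes
+ // (see translationReturn()), so each request's net contribution is
+ // the elapsed time between issue and completion.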
+
+ tlbOutcome lookup_outcome = TLB_MISS;
+ RequestPtr tmp_req = pkt->req;
+
+ // Access the TLB and figure out if it's a hit or a miss.
+ bool success = tlbLookup(tmp_req, tmp_tc, update_stats);
+
+ if (success) {
+ lookup_outcome = TLB_HIT;
+ // Put the entry in SenderState
+ GpuTlbEntry *entry = lookup(tmp_req->getVaddr(), false);
+ assert(entry);
+
+ sender_state->tlbEntry =
+ new GpuTlbEntry(0, entry->vaddr, entry->paddr, entry->valid);
+
+ if (update_stats) {
+ // the reqCnt has an entry per level, so its size tells us
+ // which level we are in
+ sender_state->hitLevel = sender_state->reqCnt.size();
+ globalNumTLBHits += req_cnt;
+ }
+ } else {
+ if (update_stats)
+ globalNumTLBMisses += req_cnt;
+ }
+
+ /*
+ * We now know the TLB lookup outcome (if it's a hit or a miss), as well
+ * as the TLB access latency.
+ *
+ * We create and schedule a new TLBEvent which will help us take the
+ * appropriate actions (e.g., update TLB on a hit, send request to lower
+ * level TLB on a miss, or start a page walk if this was the last-level
+ * TLB)
+ */
+ TLBEvent *tlb_event =
+ new TLBEvent(this, virt_page_addr, lookup_outcome, pkt);
+
+ if (translationReturnEvent.count(virt_page_addr)) {
+ panic("Virtual Page Address %#x already has a return event\n",
+ virt_page_addr);
+ }
+
+ translationReturnEvent[virt_page_addr] = tlb_event;
+ assert(tlb_event);
+
+ DPRINTF(GPUTLB, "schedule translationReturnEvent @ curTick %d\n",
+ curTick() + this->ticks(hitLatency));
+
+ schedule(tlb_event, curTick() + this->ticks(hitLatency));
+ }
+
+ GpuTLB::TLBEvent::TLBEvent(GpuTLB* _tlb, Addr _addr, tlbOutcome tlb_outcome,
+ PacketPtr _pkt)
+ : Event(CPU_Tick_Pri), tlb(_tlb), virtPageAddr(_addr),
+ outcome(tlb_outcome), pkt(_pkt)
+ {
+ }
+
+ /**
+ * Do the paging protection checks. A page fault at this point is
+ * unexpected and triggers an assertion failure.
+ */
+ void
+ GpuTLB::pagingProtectionChecks(ThreadContext *tc, PacketPtr pkt,
+ GpuTlbEntry * tlb_entry, Mode mode)
+ {
+ HandyM5Reg m5Reg = tc->readMiscRegNoEffect(MISCREG_M5_REG);
+ uint32_t flags = pkt->req->getFlags();
+ bool storeCheck = flags & (StoreCheck << FlagShift);
+
+ // Do paging protection checks.
+ bool inUser = (m5Reg.cpl == 3 && !(flags & (CPL0FlagBit << FlagShift)));
+ CR0 cr0 = tc->readMiscRegNoEffect(MISCREG_CR0);
+
+ bool badWrite = (!tlb_entry->writable && (inUser || cr0.wp));
+
+ if ((inUser && !tlb_entry->user) ||
+ (mode == BaseTLB::Write && badWrite)) {
+ // The page must have been present to get into the TLB in
+ // the first place. We'll assume the reserved bits are
+ // fine even though we're not checking them.
+ assert(false);
+ }
+
+ if (storeCheck && badWrite) {
+ // This would fault if this were a write, so return a page
+ // fault that reflects that happening.
+ assert(false);
+ }
+ }
+
+ /**
+ * handleTranslationReturn is called on a TLB hit, when a TLB miss
+ * returns, or when a page fault returns. In the latter two cases it
+ * is invoked with TLB_MISS as the tlbOutcome.
+ */
+ void
+ GpuTLB::handleTranslationReturn(Addr virt_page_addr, tlbOutcome tlb_outcome,
+ PacketPtr pkt)
+ {
+
+ assert(pkt);
+ Addr vaddr = pkt->req->getVaddr();
+
+ TranslationState *sender_state =
+ safe_cast<TranslationState*>(pkt->senderState);
+
+ ThreadContext *tc = sender_state->tc;
+ Mode mode = sender_state->tlbMode;
+
+ GpuTlbEntry *local_entry, *new_entry;
+
+ if (tlb_outcome == TLB_HIT) {
+ DPRINTF(GPUTLB, "Translation Done - TLB Hit for addr %#x\n", vaddr);
+ local_entry = sender_state->tlbEntry;
+ } else {
+ DPRINTF(GPUTLB, "Translation Done - TLB Miss for addr %#x\n",
+ vaddr);
+
+ // We are returning either from a page walk or from a hit at a lower
+ // TLB level. The senderState should be "carrying" a pointer to the
+ // correct TLBEntry.
+ new_entry = sender_state->tlbEntry;
+ assert(new_entry);
+ local_entry = new_entry;
+
+ if (allocationPolicy) {
+ DPRINTF(GPUTLB, "allocating entry w/ addr %#x\n",
+ virt_page_addr);
+
+ local_entry = insert(virt_page_addr, *new_entry);
+ }
+
+ assert(local_entry);
+ }
+
+ /**
+ * At this point the packet carries an up-to-date tlbEntry pointer
+ * in its senderState.
+ * Next step is to do the paging protection checks.
+ */
+ DPRINTF(GPUTLB, "Entry found with vaddr %#x, doing protection checks "
+ "while paddr was %#x.\n", local_entry->vaddr,
+ local_entry->paddr);
+
+ pagingProtectionChecks(tc, pkt, local_entry, mode);
+ int page_size = local_entry->size();
+ Addr paddr = local_entry->paddr | (vaddr & (page_size - 1));
+ DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, paddr);
+
+ // Since this packet will be sent through the cpu side slave port,
+ // it must be converted to a response pkt if it is not one already
+ if (pkt->isRequest()) {
+ pkt->makeTimingResponse();
+ }
+
+ pkt->req->setPaddr(paddr);
+
+ if (local_entry->uncacheable) {
+ pkt->req->setFlags(Request::UNCACHEABLE);
+ }
+
+ //send packet back to coalescer
+ cpuSidePort[0]->sendTimingResp(pkt);
+ //schedule cleanup event
+ cleanupQueue.push(virt_page_addr);
+
+ // schedule this only once per cycle.
+ // The check is required because we might have multiple translations
+ // returning the same cycle
+ // this is a maximum priority event and must be on the same cycle
+ // as the cleanup event in TLBCoalescer to avoid a race with
+ // IssueProbeEvent caused by TLBCoalescer::MemSidePort::recvReqRetry
+ if (!cleanupEvent.scheduled())
+ schedule(cleanupEvent, curTick());
+ }
+
+ /**
+ * Here we take the appropriate actions based on the result of the
+ * TLB lookup.
+ */
+ void
+ GpuTLB::translationReturn(Addr virtPageAddr, tlbOutcome outcome,
+ PacketPtr pkt)
+ {
+ DPRINTF(GPUTLB, "Triggered TLBEvent for addr %#x\n", virtPageAddr);
+
+ assert(translationReturnEvent[virtPageAddr]);
+ assert(pkt);
+
+ TranslationState *tmp_sender_state =
+ safe_cast<TranslationState*>(pkt->senderState);
+
+ int req_cnt = tmp_sender_state->reqCnt.back();
+ bool update_stats = !tmp_sender_state->prefetch;
+
+
+ if (outcome == TLB_HIT) {
+ handleTranslationReturn(virtPageAddr, TLB_HIT, pkt);
+
+ if (update_stats) {
+ accessCycles += (req_cnt * curTick());
+ localCycles += curTick();
+ }
+
+ } else if (outcome == TLB_MISS) {
+
+ DPRINTF(GPUTLB, "This is a TLB miss\n");
+ if (update_stats) {
+ accessCycles += (req_cnt*curTick());
+ localCycles += curTick();
+ }
+
+ if (hasMemSidePort) {
+ // the one cycle added here represents the delay from when we get
+ // the reply back until we propagate it to the coalescer above.
+ if (update_stats) {
+ accessCycles += (req_cnt * 1);
+ localCycles += 1;
+ }
+
+ /**
+ * There is a TLB below. Send the coalesced request.
+ * We actually send the very first packet of all the
+ * pending packets for this virtual page address.
+ */
+ if (!memSidePort[0]->sendTimingReq(pkt)) {
+ DPRINTF(GPUTLB, "Failed sending translation request to "
+ "lower level TLB for addr %#x\n", virtPageAddr);
+
+ memSidePort[0]->retries.push_back(pkt);
+ } else {
+ DPRINTF(GPUTLB, "Sent translation request to lower level "
+ "TLB for addr %#x\n", virtPageAddr);
+ }
+ } else {
+ //this is the last level TLB. Start a page walk
+ DPRINTF(GPUTLB, "Last level TLB - start a page walk for "
+ "addr %#x\n", virtPageAddr);
+
+ if (update_stats)
+ pageTableCycles -= (req_cnt*curTick());
+
+ TLBEvent *tlb_event = translationReturnEvent[virtPageAddr];
+ assert(tlb_event);
+ tlb_event->updateOutcome(PAGE_WALK);
+ schedule(tlb_event, curTick() + ticks(missLatency2));
+ }
+ } else if (outcome == PAGE_WALK) {
+ if (update_stats)
+ pageTableCycles += (req_cnt*curTick());
+
+ // Need to access the page table and update the TLB
+ DPRINTF(GPUTLB, "Doing a page walk for address %#x\n",
+ virtPageAddr);
+
+ TranslationState *sender_state =
+ safe_cast<TranslationState*>(pkt->senderState);
+
+ Process *p = sender_state->tc->getProcessPtr();
+ TlbEntry newEntry;
+ Addr vaddr = pkt->req->getVaddr();
+ #ifndef NDEBUG
+ Addr alignedVaddr = p->pTable->pageAlign(vaddr);
+ assert(alignedVaddr == virtPageAddr);
+ #endif
+ bool success;
+ success = p->pTable->lookup(vaddr, newEntry);
+ if (!success && sender_state->tlbMode != BaseTLB::Execute) {
+ if (p->fixupStackFault(vaddr)) {
+ success = p->pTable->lookup(vaddr, newEntry);
+ }
+ }
+
+ DPRINTF(GPUTLB, "Mapping %#x to %#x\n", alignedVaddr,
+ newEntry.pageStart());
+
+ sender_state->tlbEntry =
+ new GpuTlbEntry(0, newEntry.vaddr, newEntry.paddr, success);
+
+ handleTranslationReturn(virtPageAddr, TLB_MISS, pkt);
+ } else if (outcome == MISS_RETURN) {
+ /** we add an extra cycle in the return path of the translation
+ * requests in between the various TLB levels.
+ */
+ handleTranslationReturn(virtPageAddr, TLB_MISS, pkt);
+ } else {
+ assert(false);
+ }
+ }
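+
+ // Outcome summary for the event chain above: TLB_HIT completes the
+ // translation immediately; TLB_MISS either forwards the packet to a
+ // lower-level TLB (if one is connected) or re-schedules the same
+ // event as a PAGE_WALK after missLatency2; PAGE_WALK consults the
+ // page table functionally and then completes via
+ // handleTranslationReturn(); MISS_RETURN handles replies coming back
+ // from a lower-level TLB.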
+
+ void
+ GpuTLB::TLBEvent::process()
+ {
+ tlb->translationReturn(virtPageAddr, outcome, pkt);
+ }
+
+ const char*
+ GpuTLB::TLBEvent::description() const
+ {
+ return "trigger translationDoneEvent";
+ }
+
+ void
+ GpuTLB::TLBEvent::updateOutcome(tlbOutcome _outcome)
+ {
+ outcome = _outcome;
+ }
+
+ Addr
+ GpuTLB::TLBEvent::getTLBEventVaddr()
+ {
+ return virtPageAddr;
+ }
+
+ /*
+ * recvTimingReq receives a coalesced timing request from a TLBCoalescer
+ * and calls issueTLBLookup().
+ * It only rejects the packet if we have exceeded the maximum number of
+ * outstanding requests for the TLB.
+ */
+ bool
+ GpuTLB::CpuSidePort::recvTimingReq(PacketPtr pkt)
+ {
+ if (tlb->outstandingReqs < tlb->maxCoalescedReqs) {
+ tlb->issueTLBLookup(pkt);
+ // update number of outstanding translation requests
+ tlb->outstandingReqs++;
+ return true;
+ } else {
+ DPRINTF(GPUTLB, "Reached maxCoalescedReqs number %d\n",
+ tlb->outstandingReqs);
+ return false;
+ }
+ }
+
+ /**
+ * handleFuncTranslationReturn is called on a TLB hit,
+ * when a TLB miss returns or when a page fault returns.
+ * It updates LRU, inserts the TLB entry on a miss
+ * depending on the allocation policy and does the required
+ * protection checks. It does NOT create a new packet to
+ * update the packet's addr; this is done in hsail-gpu code.
+ */
+ void
+ GpuTLB::handleFuncTranslationReturn(PacketPtr pkt, tlbOutcome tlb_outcome)
+ {
+ TranslationState *sender_state =
+ safe_cast<TranslationState*>(pkt->senderState);
+
+ ThreadContext *tc = sender_state->tc;
+ Mode mode = sender_state->tlbMode;
+ Addr vaddr = pkt->req->getVaddr();
+
+ GpuTlbEntry *local_entry, *new_entry;
+
+ if (tlb_outcome == TLB_HIT) {
+ DPRINTF(GPUTLB, "Functional Translation Done - TLB hit for addr "
+ "%#x\n", vaddr);
+
+ local_entry = sender_state->tlbEntry;
+ } else {
+ DPRINTF(GPUTLB, "Functional Translation Done - TLB miss for addr "
+ "%#x\n", vaddr);
+
+ // We are returning either from a page walk or from a hit at a lower
+ // TLB level. The senderState should be "carrying" a pointer to the
+ // correct TLBEntry.
+ new_entry = sender_state->tlbEntry;
+ assert(new_entry);
+ local_entry = new_entry;
+
+ if (allocationPolicy) {
+ Addr virt_page_addr = roundDown(vaddr, TheISA::PageBytes);
+
+ DPRINTF(GPUTLB, "allocating entry w/ addr %#x\n",
+ virt_page_addr);
+
+ local_entry = insert(virt_page_addr, *new_entry);
+ }
+
+ assert(local_entry);
+ }
+
+ DPRINTF(GPUTLB, "Entry found with vaddr %#x, doing protection checks "
+ "while paddr was %#x.\n", local_entry->vaddr,
+ local_entry->paddr);
+
+ // Do paging checks if it's a normal functional access. If it's for a
+ // prefetch, then sometimes you can try to prefetch something that won't
+ // pass protection. We don't actually want to fault because there is no
+ // demand access to deem this a violation. Just put it in the TLB and
+ // it will fault if indeed a future demand access touches it in
+ // violation.
+ if (!sender_state->prefetch && sender_state->tlbEntry->valid)
+ pagingProtectionChecks(tc, pkt, local_entry, mode);
+
+ int page_size = local_entry->size();
+ Addr paddr = local_entry->paddr | (vaddr & (page_size - 1));
+ DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, paddr);
+
+ pkt->req->setPaddr(paddr);
+
+ if (local_entry->uncacheable)
+ pkt->req->setFlags(Request::UNCACHEABLE);
+ }
+
+ // This is used for atomic translations. Need to
+ // make it all happen during the same cycle.
+ void
+ GpuTLB::CpuSidePort::recvFunctional(PacketPtr pkt)
+ {
+ TranslationState *sender_state =
+ safe_cast<TranslationState*>(pkt->senderState);
+
+ ThreadContext *tc = sender_state->tc;
+ bool update_stats = !sender_state->prefetch;
+
+ Addr virt_page_addr = roundDown(pkt->req->getVaddr(),
+ TheISA::PageBytes);
+
+ if (update_stats)
+ tlb->updatePageFootprint(virt_page_addr);
+
+ // do the TLB lookup without updating the stats
+ bool success = tlb->tlbLookup(pkt->req, tc, update_stats);
+ tlbOutcome tlb_outcome = success ? TLB_HIT : TLB_MISS;
+
+ // functional mode means no coalescing
+ // global metrics are the same as the local metrics
+ if (update_stats) {
+ tlb->globalNumTLBAccesses++;
+
+ if (success) {
+ sender_state->hitLevel = sender_state->reqCnt.size();
+ tlb->globalNumTLBHits++;
+ }
+ }
+
+ if (!success) {
+ if (update_stats)
+ tlb->globalNumTLBMisses++;
+ if (tlb->hasMemSidePort) {
+ // there is a TLB below -> propagate down the TLB hierarchy
+ tlb->memSidePort[0]->sendFunctional(pkt);
+ // If no valid translation from a prefetch, then just return
+ if (sender_state->prefetch && !pkt->req->hasPaddr())
+ return;
+ } else {
+ // Need to access the page table and update the TLB
+ DPRINTF(GPUTLB, "Doing a page walk for address %#x\n",
+ virt_page_addr);
+
+ Process *p = tc->getProcessPtr();
+ TlbEntry newEntry;
+
+ Addr vaddr = pkt->req->getVaddr();
+ #ifndef NDEBUG
+ Addr alignedVaddr = p->pTable->pageAlign(vaddr);
+ assert(alignedVaddr == virt_page_addr);
+ #endif
+
+ bool success = p->pTable->lookup(vaddr, newEntry);
+ if (!success && sender_state->tlbMode != BaseTLB::Execute) {
+ if (p->fixupStackFault(vaddr))
+ success = p->pTable->lookup(vaddr, newEntry);
+ }
+
+ if (!sender_state->prefetch) {
+ // no PageFaults are permitted after
+ // the second page table lookup
+ assert(success);
+
+ DPRINTF(GPUTLB, "Mapping %#x to %#x\n", alignedVaddr,
+ newEntry.pageStart());
+
+ sender_state->tlbEntry = new GpuTlbEntry(0, newEntry.vaddr,
+ newEntry.paddr,
+ success);
+ } else {
+ // If this was a prefetch, then do the normal thing if it
+ // was a successful translation. Otherwise, send an empty
+ // TLB entry back so that it can be figured out as empty and
+ // handled accordingly.
+ if (success) {
+ DPRINTF(GPUTLB, "Mapping %#x to %#x\n", alignedVaddr,
+ newEntry.pageStart());
+
+ sender_state->tlbEntry = new GpuTlbEntry(0,
+ newEntry.vaddr,
+ newEntry.paddr,
+ success);
+ } else {
+ DPRINTF(GPUPrefetch, "Prefetch failed %#x\n",
+ alignedVaddr);
+
+ sender_state->tlbEntry = new GpuTlbEntry();
+
+ return;
+ }
+ }
+ }
+ } else {
+ DPRINTF(GPUPrefetch, "Functional Hit for vaddr %#x\n",
+ tlb->lookup(pkt->req->getVaddr()));
+
+ GpuTlbEntry *entry = tlb->lookup(pkt->req->getVaddr(),
+ update_stats);
+
+ assert(entry);
+
+ sender_state->tlbEntry =
+ new GpuTlbEntry(0, entry->vaddr, entry->paddr, entry->valid);
+ }
+ // This is the function that would populate pkt->req with the paddr of
+ // the translation. But if no translation happens (i.e., the prefetch
+ // fails) then the early returns in the above code will keep this function
+ // from executing.
+ tlb->handleFuncTranslationReturn(pkt, tlb_outcome);
+ }
+
+ void
+ GpuTLB::CpuSidePort::recvReqRetry()
+ {
+ // The CPUSidePort never sends anything but replies. No retries
+ // expected.
+ assert(false);
+ }
+
+ AddrRangeList
+ GpuTLB::CpuSidePort::getAddrRanges() const
+ {
+ // currently not checked by the master
+ AddrRangeList ranges;
+
+ return ranges;
+ }
+
+ /**
+ * MemSidePort receives the packet back.
+ * We need to call the handleTranslationReturn
+ * and propagate up the hierarchy.
+ */
+ bool
+ GpuTLB::MemSidePort::recvTimingResp(PacketPtr pkt)
+ {
+ Addr virt_page_addr = roundDown(pkt->req->getVaddr(),
+ TheISA::PageBytes);
+
+ DPRINTF(GPUTLB, "MemSidePort recvTiming for virt_page_addr %#x\n",
+ virt_page_addr);
+
+ TLBEvent *tlb_event = tlb->translationReturnEvent[virt_page_addr];
+ assert(tlb_event);
+ assert(virt_page_addr == tlb_event->getTLBEventVaddr());
+
+ tlb_event->updateOutcome(MISS_RETURN);
+ tlb->schedule(tlb_event, curTick()+tlb->ticks(1));
+
+ return true;
+ }
+
+ void
+ GpuTLB::MemSidePort::recvReqRetry()
+ {
+ // No retries should reach the TLB. The retries
+ // should only reach the TLBCoalescer.
+ assert(false);
+ }
+
+ void
+ GpuTLB::cleanup()
+ {
+ while (!cleanupQueue.empty()) {
+ Addr cleanup_addr = cleanupQueue.front();
+ cleanupQueue.pop();
+
+ // delete TLBEvent
+ TLBEvent * old_tlb_event = translationReturnEvent[cleanup_addr];
+ delete old_tlb_event;
+ translationReturnEvent.erase(cleanup_addr);
+
+ // update number of outstanding requests
+ outstandingReqs--;
+ }
+
+ /** the higher level coalescer should retry if it has
+ * any pending requests.
+ */
+ for (int i = 0; i < cpuSidePort.size(); ++i) {
+ cpuSidePort[i]->sendRetryReq();
+ }
+ }
+
+ void
+ GpuTLB::updatePageFootprint(Addr virt_page_addr)
+ {
+
+ std::pair<AccessPatternTable::iterator, bool> ret;
+
+ AccessInfo tmp_access_info;
+ tmp_access_info.lastTimeAccessed = 0;
+ tmp_access_info.accessesPerPage = 0;
+ tmp_access_info.totalReuseDistance = 0;
+ tmp_access_info.sumDistance = 0;
+ tmp_access_info.meanDistance = 0;
+
+ ret = TLBFootprint.insert(AccessPatternTable::value_type(virt_page_addr,
+ tmp_access_info));
+
+ bool first_page_access = ret.second;
+
+ if (first_page_access) {
+ numUniquePages++;
+ } else {
+ int accessed_before;
+ accessed_before = curTick() - ret.first->second.lastTimeAccessed;
+ ret.first->second.totalReuseDistance += accessed_before;
+ }
+
+ ret.first->second.accessesPerPage++;
+ ret.first->second.lastTimeAccessed = curTick();
+
+ if (accessDistance) {
+ ret.first->second.localTLBAccesses
+ .push_back(localNumTLBAccesses.value());
+ }
+ }
+
+ void
+ GpuTLB::exitCallback()
+ {
+ std::ostream *page_stat_file = nullptr;
+
+ if (accessDistance) {
+
+ // print per page statistics to a separate file (.csv format)
+ // simout is the gem5 output directory (default is m5out or the one
+ // specified with -d)
+ page_stat_file = simout.create(name().c_str());
+
+ // print header
+ *page_stat_file << "page,max_access_distance,mean_access_distance, "
+ << "stddev_distance" << std::endl;
+ }
+
+ // update avg. reuse distance footprint
+ AccessPatternTable::iterator iter, iter_begin, iter_end;
+ unsigned int sum_avg_reuse_distance_per_page = 0;
+
+ // iterate through all pages seen by this TLB
+ for (iter = TLBFootprint.begin(); iter != TLBFootprint.end(); iter++) {
+ sum_avg_reuse_distance_per_page += iter->second.totalReuseDistance /
+ iter->second.accessesPerPage;
+
+ if (accessDistance) {
+ unsigned int tmp = iter->second.localTLBAccesses[0];
+ unsigned int prev = tmp;
+
+ for (int i = 0; i < iter->second.localTLBAccesses.size(); ++i) {
+ if (i) {
+ tmp = prev + 1;
+ }
+
+ prev = iter->second.localTLBAccesses[i];
+ // update the localTLBAccesses value
+ // with the actual difference
+ iter->second.localTLBAccesses[i] -= tmp;
+ // compute the sum of AccessDistance per page
+ // used later for mean
+ iter->second.sumDistance +=
+ iter->second.localTLBAccesses[i];
+ }
+
+ iter->second.meanDistance =
+ iter->second.sumDistance / iter->second.accessesPerPage;
+
+ // compute std_dev and max (we need a second round because we
+ // need to know the mean value)
+ unsigned int max_distance = 0;
+ unsigned int stddev_distance = 0;
+
+ for (int i = 0; i < iter->second.localTLBAccesses.size(); ++i) {
+ unsigned int tmp_access_distance =
+ iter->second.localTLBAccesses[i];
+
+ if (tmp_access_distance > max_distance) {
+ max_distance = tmp_access_distance;
+ }
+
+ unsigned int diff =
+ tmp_access_distance - iter->second.meanDistance;
+ stddev_distance += pow(diff, 2);
+
+ }
+
+ stddev_distance =
+ sqrt(stddev_distance/iter->second.accessesPerPage);
+
+ if (page_stat_file) {
+ *page_stat_file << std::hex << iter->first << ",";
+ *page_stat_file << std::dec << max_distance << ",";
+ *page_stat_file << std::dec << iter->second.meanDistance
+ << ",";
+ *page_stat_file << std::dec << stddev_distance;
+ *page_stat_file << std::endl;
+ }
+
+ // erase the localTLBAccesses array
+ iter->second.localTLBAccesses.clear();
+ }
+ }
+
+ if (!TLBFootprint.empty()) {
+ avgReuseDistance =
+ sum_avg_reuse_distance_per_page / TLBFootprint.size();
+ }
+
+ //clear the TLBFootprint map
+ TLBFootprint.clear();
+ }
+} // namespace X86ISA
+
+X86ISA::GpuTLB*
+X86GPUTLBParams::create()
+{
+ return new X86ISA::GpuTLB(this);
+}
+
diff --git a/src/gpu-compute/gpu_tlb.hh b/src/gpu-compute/gpu_tlb.hh
new file mode 100644
index 000000000..3549c598b
--- /dev/null
+++ b/src/gpu-compute/gpu_tlb.hh
@@ -0,0 +1,465 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Lisa Hsu
+ */
+
+#ifndef __GPU_TLB_HH__
+#define __GPU_TLB_HH__
+
+#include <fstream>
+#include <list>
+#include <queue>
+#include <string>
+#include <vector>
+
+#include "arch/generic/tlb.hh"
+#include "arch/x86/pagetable.hh"
+#include "arch/x86/pagetable_walker.hh"
+#include "arch/x86/regs/segment.hh"
+#include "base/callback.hh"
+#include "base/misc.hh"
+#include "base/statistics.hh"
+#include "gpu-compute/compute_unit.hh"
+#include "mem/mem_object.hh"
+#include "mem/port.hh"
+#include "mem/request.hh"
+#include "params/X86GPUTLB.hh"
+#include "sim/sim_object.hh"
+
+class BaseTLB;
+class Packet;
+class ThreadContext;
+
+namespace X86ISA
+{
+ class GpuTlbEntry : public TlbEntry
+ {
+ public:
+ GpuTlbEntry(Addr asn, Addr _vaddr, Addr _paddr, bool _valid)
+ : TlbEntry(asn, _vaddr, _paddr, false, false), valid(_valid) { }
+
+ GpuTlbEntry() : TlbEntry() { }
+
+ bool valid;
+ };
+
+ class GpuTLB : public MemObject
+ {
+ protected:
+ friend class Walker;
+
+ typedef std::list<GpuTlbEntry*> EntryList;
+
+ uint32_t configAddress;
+
+ // TLB clock: inherits the shader's clock period, expressed as a number
+ // of ticks of the global simulation clock (curTick()).
+ // The assignment of the TLB clock from the shader clock is done in the
+ // python config files.
+ int clock;
+
+ public:
+ // clock related functions ; maps to-and-from Simulation ticks and
+ // object clocks.
+ Tick frequency() const { return SimClock::Frequency / clock; }
+
+ Tick
+ ticks(int numCycles) const
+ {
+ return (Tick)clock * numCycles;
+ }
+
+ Tick curCycle() const { return curTick() / clock; }
+ Tick tickToCycles(Tick val) const { return val / clock;}
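+
+ // Example (values are illustrative): if the inherited clock period
+ // is 1ns, i.e. clock == 1000 ticks at the default 1ps tick
+ // resolution, then ticks(4) == 4000 and an event scheduled at
+ // curTick() + ticks(hitLatency) fires hitLatency TLB cycles later.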
+
+ typedef X86GPUTLBParams Params;
+ GpuTLB(const Params *p);
+ ~GpuTLB();
+
+ typedef enum BaseTLB::Mode Mode;
+
+ class Translation
+ {
+ public:
+ virtual ~Translation() { }
+
+ /**
+ * Signal that the translation has been delayed due to a hw page
+ * table walk.
+ */
+ virtual void markDelayed() = 0;
+
+ /**
+ * The memory for this object may be dynamically allocated, and it
+ * may be responsible for cleaning itself up, which will happen in
+ * this function. Once it's called the object is no longer valid.
+ */
+ virtual void finish(Fault fault, RequestPtr req, ThreadContext *tc,
+ Mode mode) = 0;
+ };
+
+ void dumpAll();
+ GpuTlbEntry *lookup(Addr va, bool update_lru=true);
+ void setConfigAddress(uint32_t addr);
+
+ protected:
+ EntryList::iterator lookupIt(Addr va, bool update_lru=true);
+ Walker *walker;
+
+ public:
+ Walker *getWalker();
+ void invalidateAll();
+ void invalidateNonGlobal();
+ void demapPage(Addr va, uint64_t asn);
+
+ protected:
+ int size;
+ int assoc;
+ int numSets;
+
+ /**
+ * true if this is a fully-associative TLB
+ */
+ bool FA;
+ Addr setMask;
+
+ /**
+ * Allocation Policy: true if we always allocate on a hit, false
+ * otherwise. Default is true.
+ */
+ bool allocationPolicy;
+
+ /**
+ * if true, then this is not the last level TLB
+ */
+ bool hasMemSidePort;
+
+ /**
+ * Print out accessDistance stats. One stat file
+ * per TLB.
+ */
+ bool accessDistance;
+
+ GpuTlbEntry *tlb;
+
+ /*
+ * It's a per-set list. As long as we have not reached
+ * the full capacity of the given set, grab an entry from
+ * the freeList.
+ */
+ std::vector<EntryList> freeList;
+
+ /**
+ * An entryList per set is the equivalent of an LRU stack;
+ * it's used to guide replacement decisions. The head of the list
+ * contains the MRU TLB entry of the given set. If the freeList
+ * for this set is empty, the last element of the list
+ * is evicted (i.e., dropped on the floor).
+ */
+ std::vector<EntryList> entryList;
+
+ Fault translateInt(RequestPtr req, ThreadContext *tc);
+
+ Fault translate(RequestPtr req, ThreadContext *tc,
+ Translation *translation, Mode mode, bool &delayedResponse,
+ bool timing, int &latency);
+
+ public:
+ // latencies for a TLB hit, miss and page fault
+ int hitLatency;
+ int missLatency1;
+ int missLatency2;
+
+ // local_stats are as seen from the TLB
+ // without taking into account coalescing
+ Stats::Scalar localNumTLBAccesses;
+ Stats::Scalar localNumTLBHits;
+ Stats::Scalar localNumTLBMisses;
+ Stats::Formula localTLBMissRate;
+
+ // global_stats are as seen from the
+ // CU's perspective taking into account
+ // all coalesced requests.
+ Stats::Scalar globalNumTLBAccesses;
+ Stats::Scalar globalNumTLBHits;
+ Stats::Scalar globalNumTLBMisses;
+ Stats::Formula globalTLBMissRate;
+
+ // from the CU perspective (global)
+ Stats::Scalar accessCycles;
+ // from the CU perspective (global)
+ Stats::Scalar pageTableCycles;
+ Stats::Scalar numUniquePages;
+ // from the perspective of this TLB
+ Stats::Scalar localCycles;
+ // from the perspective of this TLB
+ Stats::Formula localLatency;
+ // I take the avg. per page and then
+ // the avg. over all pages.
+ Stats::Scalar avgReuseDistance;
+
+ void regStats();
+ void updatePageFootprint(Addr virt_page_addr);
+ void printAccessPattern();
+
+
+ Fault translateAtomic(RequestPtr req, ThreadContext *tc, Mode mode,
+ int &latency);
+
+ void translateTiming(RequestPtr req, ThreadContext *tc,
+ Translation *translation, Mode mode,
+ int &latency);
+
+ Tick doMmuRegRead(ThreadContext *tc, Packet *pkt);
+ Tick doMmuRegWrite(ThreadContext *tc, Packet *pkt);
+
+ GpuTlbEntry *insert(Addr vpn, GpuTlbEntry &entry);
+
+ // Checkpointing
+ virtual void serialize(CheckpointOut& cp) const;
+ virtual void unserialize(CheckpointIn& cp);
+ void issueTranslation();
+ enum tlbOutcome {TLB_HIT, TLB_MISS, PAGE_WALK, MISS_RETURN};
+ bool tlbLookup(RequestPtr req, ThreadContext *tc, bool update_stats);
+
+ void handleTranslationReturn(Addr addr, tlbOutcome outcome,
+ PacketPtr pkt);
+
+ void handleFuncTranslationReturn(PacketPtr pkt, tlbOutcome outcome);
+
+ void pagingProtectionChecks(ThreadContext *tc, PacketPtr pkt,
+ GpuTlbEntry *tlb_entry, Mode mode);
+
+ void updatePhysAddresses(Addr virt_page_addr, GpuTlbEntry *tlb_entry,
+ Addr phys_page_addr);
+
+ void issueTLBLookup(PacketPtr pkt);
+
+ // CpuSidePort is the TLB Port closer to the CPU/CU side
+ class CpuSidePort : public SlavePort
+ {
+ public:
+ CpuSidePort(const std::string &_name, GpuTLB * gpu_TLB,
+ PortID _index)
+ : SlavePort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { }
+
+ protected:
+ GpuTLB *tlb;
+ int index;
+
+ virtual bool recvTimingReq(PacketPtr pkt);
+ virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
+ virtual void recvFunctional(PacketPtr pkt);
+ virtual void recvRangeChange() { }
+ virtual void recvReqRetry();
+ virtual void recvRespRetry() { assert(false); }
+ virtual AddrRangeList getAddrRanges() const;
+ };
+
+ /**
+ * MemSidePort is the TLB port closer to the memory side.
+ * If this is a last-level TLB, then this port will not be connected.
+ *
+ * Future action item: if we ever do real page walks, then this port
+ * should be connected to a RubyPort.
+ */
+ class MemSidePort : public MasterPort
+ {
+ public:
+ MemSidePort(const std::string &_name, GpuTLB * gpu_TLB,
+ PortID _index)
+ : MasterPort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { }
+
+ std::deque<PacketPtr> retries;
+
+ protected:
+ GpuTLB *tlb;
+ int index;
+
+ virtual bool recvTimingResp(PacketPtr pkt);
+ virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
+ virtual void recvFunctional(PacketPtr pkt) { }
+ virtual void recvRangeChange() { }
+ virtual void recvReqRetry();
+ };
+
+ // TLB ports on the cpu Side
+ std::vector<CpuSidePort*> cpuSidePort;
+ // TLB ports on the memory side
+ std::vector<MemSidePort*> memSidePort;
+
+ BaseMasterPort &getMasterPort(const std::string &if_name,
+ PortID idx=InvalidPortID);
+
+ BaseSlavePort &getSlavePort(const std::string &if_name,
+ PortID idx=InvalidPortID);
+
+ /**
+ * TLB TranslationState: this is currently something of a bastardization
+ * of the usual SenderState usage, whereby the receiver of a packet is
+ * not supposed to need to look at the contents of the senderState;
+ * you're really only supposed to look at what you pushed on, pop it
+ * off, and send it back.
+ *
+ * However, since there is state that we want to pass to the TLBs using
+ * the send/recv Timing/Functional/etc. APIs, which don't allow for new
+ * arguments, we need a common TLB senderState to pass between TLBs,
+ * both "forwards" and "backwards."
+ *
+ * So, basically, the rule is that any packet received by a TLB port
+ * (cpuside OR memside) must be safely castable to a TranslationState.
+ */
+
+ struct TranslationState : public Packet::SenderState
+ {
+ // TLB mode, read or write
+ Mode tlbMode;
+ // Thread context associated with this req
+ ThreadContext *tc;
+
+ /*
+ * TLB entry to be populated and passed back and filled in
+ * previous TLBs. Equivalent to the data cache concept of
+ * "data return."
+ */
+ GpuTlbEntry *tlbEntry;
+ // Is this a TLB prefetch request?
+ bool prefetch;
+ // When was the req for this translation issued
+ uint64_t issueTime;
+ // Remember where this came from
+ std::vector<SlavePort*>ports;
+
+ // keep track of #uncoalesced reqs per packet per TLB level;
+ // reqCnt per level >= reqCnt higher level
+ std::vector<int> reqCnt;
+ // TLB level this packet hit in; 0 if it hit in the page table
+ int hitLevel;
+ Packet::SenderState *saved;
+
+ TranslationState(Mode tlb_mode, ThreadContext *_tc,
+ bool _prefetch=false,
+ Packet::SenderState *_saved=nullptr)
+ : tlbMode(tlb_mode), tc(_tc), tlbEntry(nullptr),
+ prefetch(_prefetch), issueTime(0),
+ hitLevel(0),saved(_saved) { }
+ };
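+
+        /**
+         * Illustrative sketch (not part of the model): given the casting
+         * rule above, a receiving TLB port recovers the shared state
+         * roughly as follows; the handler name is hypothetical.
+         *
+         *   bool
+         *   handleRecvTimingReq(PacketPtr pkt)
+         *   {
+         *       TranslationState *state =
+         *           dynamic_cast<TranslationState*>(pkt->senderState);
+         *       assert(state); // every packet a TLB port sees carries one
+         *       state->reqCnt.push_back(1); // e.g., per-level req count
+         *       return true;
+         *   }
+         */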
+
+ // maximum number of permitted coalesced requests per cycle
+ int maxCoalescedReqs;
+
+        // Current number of outstanding coalesced requests.
+ // Should be <= maxCoalescedReqs
+ int outstandingReqs;
+
+ /**
+ * A TLBEvent is scheduled after the TLB lookup and helps us take the
+ * appropriate actions:
+ * (e.g., update TLB on a hit,
+ * send request to lower level TLB on a miss,
+ * or start a page walk if this was the last-level TLB).
+ */
+ void translationReturn(Addr virtPageAddr, tlbOutcome outcome,
+ PacketPtr pkt);
+
+ class TLBEvent : public Event
+ {
+ private:
+ GpuTLB *tlb;
+ Addr virtPageAddr;
+ /**
+ * outcome can be TLB_HIT, TLB_MISS, or PAGE_WALK
+ */
+ tlbOutcome outcome;
+ PacketPtr pkt;
+
+ public:
+ TLBEvent(GpuTLB *_tlb, Addr _addr, tlbOutcome outcome,
+ PacketPtr _pkt);
+
+ void process();
+ const char *description() const;
+
+ // updateOutcome updates the tlbOutcome of a TLBEvent
+ void updateOutcome(tlbOutcome _outcome);
+ Addr getTLBEventVaddr();
+ };
+
+ std::unordered_map<Addr, TLBEvent*> translationReturnEvent;
+
+ // this FIFO queue keeps track of the virt. page addresses
+ // that are pending cleanup
+ std::queue<Addr> cleanupQueue;
+
+ // the cleanupEvent is scheduled after a TLBEvent triggers in order to
+ // free memory and do the required clean-up
+ void cleanup();
+
+ EventWrapper<GpuTLB, &GpuTLB::cleanup> cleanupEvent;
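+
+        /**
+         * Rough lifetime of a timing translation (sketch): issueTLBLookup()
+         * performs the lookup and a TLBEvent is scheduled for the request's
+         * virtual page address and recorded in translationReturnEvent, so
+         * that a later return can update its outcome; when the event fires,
+         * translationReturn() acts on the TLB_HIT/TLB_MISS/PAGE_WALK case,
+         * and the page address is pushed onto cleanupQueue so that
+         * cleanupEvent can free the TLBEvent afterwards.
+         */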
+
+ /**
+ * This hash map will use the virtual page address as a key
+ * and will keep track of total number of accesses per page
+ */
+
+ struct AccessInfo
+ {
+ unsigned int lastTimeAccessed; // last access to this page
+ unsigned int accessesPerPage;
+            // running total of the reuse distances for this page; divide
+            // by accessesPerPage at the end to get the mean
+ unsigned int totalReuseDistance;
+
+ /**
+ * The field below will help us compute the access distance,
+ * that is the number of (coalesced) TLB accesses that
+ * happened in between each access to this page
+ *
+ * localTLBAccesses[x] is the value of localTLBNumAccesses
+ * when the page <Addr> was accessed for the <x>th time
+ */
+ std::vector<unsigned int> localTLBAccesses;
+ unsigned int sumDistance;
+ unsigned int meanDistance;
+ };
+
+ typedef std::unordered_map<Addr, AccessInfo> AccessPatternTable;
+ AccessPatternTable TLBFootprint;
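+
+        /**
+         * Worked example of the bookkeeping above (illustrative): if a page
+         * is touched when the running coalesced-access counter reads 3, 7,
+         * and 12, then localTLBAccesses = {3, 7, 12}, accessesPerPage = 3,
+         * and the gaps between consecutive entries (7-3 and 12-7) are what
+         * the access-distance statistics are derived from.
+         */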
+
+ // Called at the end of simulation to dump page access stats.
+ void exitCallback();
+
+ EventWrapper<GpuTLB, &GpuTLB::exitCallback> exitEvent;
+ };
+}
+
+#endif // __GPU_TLB_HH__
diff --git a/src/gpu-compute/hsa_code.hh b/src/gpu-compute/hsa_code.hh
new file mode 100644
index 000000000..9f358e23c
--- /dev/null
+++ b/src/gpu-compute/hsa_code.hh
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#ifndef __HSA_CODE_HH__
+#define __HSA_CODE_HH__
+
+#include <string>
+#include <vector>
+
+#include "arch/gpu_types.hh"
+#include "config/the_gpu_isa.hh"
+
+class HsaKernelInfo;
+
+/* @class HsaCode
+ * Base code object for the set of HSA kernels associated with a single
+ * application. This class provides the common methods for creating,
+ * accessing, and storing information about kernel and variable symbols,
+ * memory segment sizes, instruction count, etc.
+ */
+
+class HsaCode
+{
+ public:
+ HsaCode(const std::string &name) : readonly_data(nullptr), funcarg_size(0),
+ _name(name)
+ {
+ }
+
+ enum class MemorySegment {
+ NONE,
+ FLAT,
+ GLOBAL,
+ READONLY,
+ KERNARG,
+ GROUP,
+ PRIVATE,
+ SPILL,
+ ARG,
+ EXTSPACE0
+ };
+
+ const std::string& name() const { return _name; }
+ int numInsts() const { return _insts.size(); }
+ std::vector<TheGpuISA::RawMachInst>* insts() { return &_insts; }
+
+ void
+ setReadonlyData(uint8_t *_readonly_data)
+ {
+ readonly_data = _readonly_data;
+ }
+
+ virtual int getSize(MemorySegment segment) const = 0;
+ virtual void generateHsaKernelInfo(HsaKernelInfo *hsaKernelInfo) const = 0;
+
+ uint8_t *readonly_data;
+ int funcarg_size;
+
+ protected:
+    // Indices (0 through kernel size) of the instructions belonging to
+    // the kernel that was passed to the code object constructor.
+ std::vector<TheGpuISA::RawMachInst> _insts;
+
+ private:
+ const std::string _name;
+};
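+
+/* Usage sketch (hypothetical consumer): given an HsaCode *k obtained from a
+ * loaded HsaObject, dispatcher-style code can query it as follows.
+ *
+ *   int lds_bytes = k->getSize(HsaCode::MemorySegment::GROUP);
+ *   int num_insts = k->numInsts();
+ *
+ *   HsaKernelInfo ki;
+ *   k->generateHsaKernelInfo(&ki); // fills register counts and mem sizes
+ */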
+
+#endif // __HSA_CODE_HH__
diff --git a/src/gpu-compute/hsa_kernel_info.hh b/src/gpu-compute/hsa_kernel_info.hh
new file mode 100644
index 000000000..396913dac
--- /dev/null
+++ b/src/gpu-compute/hsa_kernel_info.hh
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#ifndef __HSA_KERNEL_INFO_HH__
+#define __HSA_KERNEL_INFO_HH__
+
+// This file defines the public interface between the HSA emulated
+// driver and application programs.
+
+#include <cstdint>
+
+static const int HSA_GET_SIZES = 0x4801;
+static const int HSA_GET_KINFO = 0x4802;
+static const int HSA_GET_STRINGS = 0x4803;
+static const int HSA_GET_CODE = 0x4804;
+static const int HSA_GET_READONLY_DATA = 0x4805;
+static const int HSA_GET_CU_CNT = 0x4806;
+static const int HSA_GET_VSZ = 0x4807;
+
+// Return value (via buffer ptr) for HSA_GET_SIZES
+struct HsaDriverSizes
+{
+ uint32_t num_kernels;
+ uint32_t string_table_size;
+ uint32_t code_size;
+ uint32_t readonly_size;
+};
+
+// HSA_GET_KINFO returns an array of num_kernels of these structs
+struct HsaKernelInfo
+{
+ // byte offset into string table
+ uint32_t name_offs;
+ // byte offset into code array
+ uint32_t code_offs;
+ uint32_t static_lds_size;
+ uint32_t private_mem_size;
+ uint32_t spill_mem_size;
+ // Number of s registers
+ uint32_t sRegCount;
+ // Number of d registers
+ uint32_t dRegCount;
+ // Number of c registers
+ uint32_t cRegCount;
+};
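+
+// Usage sketch: a runtime sitting on top of the emulated driver would issue
+// these commands in two steps; the ioctl-style call and the hsa_fd
+// descriptor below are illustrative assumptions, not part of this header.
+//
+//   HsaDriverSizes sizes;
+//   ioctl(hsa_fd, HSA_GET_SIZES, &sizes);
+//
+//   std::vector<HsaKernelInfo> kinfo(sizes.num_kernels);
+//   ioctl(hsa_fd, HSA_GET_KINFO, kinfo.data());
+//
+// name_offs then indexes into the string table returned by HSA_GET_STRINGS,
+// and code_offs into the code array returned by HSA_GET_CODE.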
+
+#endif // __HSA_KERNEL_INFO_HH__
diff --git a/src/gpu-compute/hsa_object.cc b/src/gpu-compute/hsa_object.cc
new file mode 100644
index 000000000..91dfb160e
--- /dev/null
+++ b/src/gpu-compute/hsa_object.cc
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#include "gpu-compute/hsa_object.hh"
+
+#include <fstream>
+
+#include "gpu-compute/brig_object.hh"
+
+HsaObject::HsaObject(const std::string &fname)
+ : readonlyData(nullptr), filename(fname)
+{
+}
+
+HsaObject*
+HsaObject::createHsaObject(const std::string &fname)
+{
+ HsaObject *hsaObj = nullptr;
+ uint8_t *file_data = nullptr;
+ int file_length = 0;
+
+ std::ifstream code_file(fname, std::ifstream::ate | std::ifstream::in |
+ std::ifstream::binary);
+
+ assert(code_file.is_open());
+ assert(code_file.good());
+
+ file_length = code_file.tellg();
+ code_file.seekg(0, code_file.beg);
+ file_data = new uint8_t[file_length];
+ code_file.read((char*)file_data, file_length);
+ code_file.close();
+
+ for (const auto &tryFile : tryFileFuncs) {
+ if ((hsaObj = tryFile(fname, file_length, file_data))) {
+ return hsaObj;
+ }
+ }
+
+ delete[] file_data;
+ fatal("Unknown HSA object type for file: %s.\n", fname);
+
+ return nullptr;
+}
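+
+// Illustrative sketch: each supported object format contributes one "try"
+// function matching the tryFileFuncs signature, and createHsaObject() walks
+// that list until a loader recognizes the file contents. A hypothetical
+// additional format would hook in roughly like this:
+//
+//   HsaObject *
+//   tryMyFormat(const std::string &fname, int len, uint8_t *data)
+//   {
+//       if (!isMyFormat(data, len)) // hypothetical magic-number check
+//           return nullptr;
+//       return new MyFormatObject(fname, len, data);
+//   }
+//
+// ...and would then be appended to HsaObject::tryFileFuncs next to the
+// BRIG loader.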
diff --git a/src/gpu-compute/hsa_object.hh b/src/gpu-compute/hsa_object.hh
new file mode 100644
index 000000000..1f08f5d80
--- /dev/null
+++ b/src/gpu-compute/hsa_object.hh
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#ifndef __HSA_OBJECT_HH__
+#define __HSA_OBJECT_HH__
+
+#include <functional>
+#include <string>
+#include <vector>
+
+class HsaCode;
+
+/* @class HsaObject
+ * Base loader object for HSA kernels. This class provides the base
+ * method definitions for loading HSA kernel objects into the simulator
+ * and for storing and accessing them.
+ */
+
+class HsaObject
+{
+ public:
+ HsaObject(const std::string &fileName);
+
+ static HsaObject* createHsaObject(const std::string &fname);
+ static std::vector<std::function<HsaObject*(const std::string&, int,
+ uint8_t*)>> tryFileFuncs;
+
+ virtual HsaCode* getKernel(const std::string &name) const = 0;
+ virtual HsaCode* getKernel(int i) const = 0;
+ virtual HsaCode* getFunction(const std::string &name) const = 0;
+ virtual int numKernels() const = 0;
+
+ const std::string& name() const { return filename; }
+
+ uint8_t *readonlyData;
+
+ protected:
+ const std::string filename;
+};
+
+#endif // __HSA_OBJECT_HH__
diff --git a/src/gpu-compute/hsail_code.cc b/src/gpu-compute/hsail_code.cc
new file mode 100644
index 000000000..b0ddf0161
--- /dev/null
+++ b/src/gpu-compute/hsail_code.cc
@@ -0,0 +1,453 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#include "gpu-compute/hsail_code.hh"
+
+#include "arch/gpu_types.hh"
+#include "arch/hsail/Brig.h"
+#include "arch/hsail/operand.hh"
+#include "config/the_gpu_isa.hh"
+#include "debug/BRIG.hh"
+#include "debug/HSAILObject.hh"
+#include "gpu-compute/brig_object.hh"
+#include "gpu-compute/gpu_static_inst.hh"
+#include "gpu-compute/kernel_cfg.hh"
+
+using namespace Brig;
+
+int getBrigDataTypeBytes(BrigType16_t t);
+
+HsailCode::HsailCode(const std::string &name_str)
+ : HsaCode(name_str), private_size(-1), readonly_size(-1)
+{
+}
+
+void
+HsailCode::init(const BrigDirectiveExecutable *code_dir, const BrigObject *obj,
+ StorageMap *objStorageMap)
+{
+ storageMap = objStorageMap;
+
+ // set pointer so that decoding process can find this kernel context when
+ // needed
+ obj->currentCode = this;
+
+ if (code_dir->base.kind != BRIG_KIND_DIRECTIVE_FUNCTION &&
+ code_dir->base.kind != BRIG_KIND_DIRECTIVE_KERNEL) {
+ fatal("unexpected directive kind %d inside kernel/function init\n",
+ code_dir->base.kind);
+ }
+
+ DPRINTF(HSAILObject, "Initializing code, first code block entry is: %d\n",
+ code_dir->firstCodeBlockEntry);
+
+ // clear these static vars so we can properly track the max index
+ // for this kernel
+ SRegOperand::maxRegIdx = 0;
+ DRegOperand::maxRegIdx = 0;
+ CRegOperand::maxRegIdx = 0;
+ setPrivateSize(0);
+
+ const BrigBase *entryPtr = brigNext((BrigBase*)code_dir);
+ const BrigBase *endPtr =
+ obj->getCodeSectionEntry(code_dir->nextModuleEntry);
+
+ int inst_idx = 0;
+ std::vector<GPUStaticInst*> instructions;
+ int funcarg_size_scope = 0;
+
+ // walk through instructions in code section and directives in
+ // directive section in parallel, processing directives that apply
+ // when we reach the relevant code point.
+ while (entryPtr < endPtr) {
+ switch (entryPtr->kind) {
+ case BRIG_KIND_DIRECTIVE_VARIABLE:
+ {
+ const BrigDirectiveVariable *sym =
+ (const BrigDirectiveVariable*)entryPtr;
+
+ DPRINTF(HSAILObject,"Initializing code, directive is "
+ "kind_variable, symbol is: %s\n",
+ obj->getString(sym->name));
+
+ StorageElement *se = storageMap->addSymbol(sym, obj);
+
+ if (sym->segment == BRIG_SEGMENT_PRIVATE) {
+ setPrivateSize(se->size);
+ } else { // spill
+ funcarg_size_scope += se->size;
+ }
+ }
+ break;
+
+ case BRIG_KIND_DIRECTIVE_LABEL:
+ {
+ const BrigDirectiveLabel *lbl =
+ (const BrigDirectiveLabel*)entryPtr;
+
+ DPRINTF(HSAILObject,"Initializing code, directive is "
+ "kind_label, label is: %s \n",
+ obj->getString(lbl->name));
+
+ labelMap.addLabel(lbl, inst_idx, obj);
+ }
+ break;
+
+ case BRIG_KIND_DIRECTIVE_PRAGMA:
+ {
+ DPRINTF(HSAILObject, "Initializing code, directive "
+ "is kind_pragma\n");
+ }
+ break;
+
+ case BRIG_KIND_DIRECTIVE_COMMENT:
+ {
+ DPRINTF(HSAILObject, "Initializing code, directive is "
+ "kind_comment\n");
+ }
+ break;
+
+ case BRIG_KIND_DIRECTIVE_ARG_BLOCK_START:
+ {
+ DPRINTF(HSAILObject, "Initializing code, directive is "
+ "kind_arg_block_start\n");
+
+ storageMap->resetOffset(BRIG_SEGMENT_ARG);
+ funcarg_size_scope = 0;
+ }
+ break;
+
+ case BRIG_KIND_DIRECTIVE_ARG_BLOCK_END:
+ {
+ DPRINTF(HSAILObject, "Initializing code, directive is "
+ "kind_arg_block_end\n");
+
+ funcarg_size = funcarg_size < funcarg_size_scope ?
+ funcarg_size_scope : funcarg_size;
+ }
+ break;
+
+ case BRIG_KIND_DIRECTIVE_END:
+            DPRINTF(HSAILObject, "Initializing code, directive is "
+ "kind_end\n");
+
+ break;
+
+ default:
+ if (entryPtr->kind >= BRIG_KIND_INST_BEGIN &&
+ entryPtr->kind <= BRIG_KIND_INST_END) {
+
+ BrigInstBase *instPtr = (BrigInstBase*)entryPtr;
+ TheGpuISA::MachInst machInst = { instPtr, obj };
+ GPUStaticInst *iptr = decoder.decode(machInst);
+
+ if (iptr) {
+ DPRINTF(HSAILObject, "Initializing code, processing inst "
+ "#%d idx %d: OPCODE=%d\n",
+ inst_idx, _insts.size(), instPtr->opcode);
+
+ TheGpuISA::RawMachInst inst_num = decoder.saveInst(iptr);
+ iptr->instNum(inst_idx);
+ _insts.push_back(inst_num);
+ instructions.push_back(iptr);
+ }
+ ++inst_idx;
+ } else if (entryPtr->kind >= BRIG_KIND_OPERAND_BEGIN &&
+ entryPtr->kind < BRIG_KIND_OPERAND_END) {
+ warn("unexpected operand entry in code segment\n");
+ } else {
+ // there are surely some more cases we will need to handle,
+ // but we'll deal with them as we find them.
+ fatal("unexpected directive kind %d inside kernel scope\n",
+ entryPtr->kind);
+ }
+ }
+
+ entryPtr = brigNext(entryPtr);
+ }
+
+ // compute Control Flow Graph for current kernel
+ ControlFlowInfo::assignImmediatePostDominators(instructions);
+
+ max_sreg = SRegOperand::maxRegIdx;
+ max_dreg = DRegOperand::maxRegIdx;
+ max_creg = CRegOperand::maxRegIdx;
+
+ obj->currentCode = nullptr;
+}
+
+HsailCode::HsailCode(const std::string &name_str,
+ const BrigDirectiveExecutable *code_dir,
+ const BrigObject *obj, StorageMap *objStorageMap)
+ : HsaCode(name_str), private_size(-1), readonly_size(-1)
+{
+ init(code_dir, obj, objStorageMap);
+}
+
+void
+LabelMap::addLabel(const Brig::BrigDirectiveLabel *lblDir, int inst_index,
+ const BrigObject *obj)
+{
+ std::string lbl_name = obj->getString(lblDir->name);
+ Label &lbl = map[lbl_name];
+
+ if (lbl.defined()) {
+ fatal("Attempt to redefine existing label %s\n", lbl_name);
+ }
+
+ lbl.define(lbl_name, inst_index);
+ DPRINTF(HSAILObject, "label %s = %d\n", lbl_name, inst_index);
+}
+
+Label*
+LabelMap::refLabel(const Brig::BrigDirectiveLabel *lblDir,
+ const BrigObject *obj)
+{
+ std::string name = obj->getString(lblDir->name);
+ Label &lbl = map[name];
+ lbl.checkName(name);
+
+ return &lbl;
+}
+
+int
+getBrigDataTypeBytes(BrigType16_t t)
+{
+ switch (t) {
+ case BRIG_TYPE_S8:
+ case BRIG_TYPE_U8:
+ case BRIG_TYPE_B8:
+ return 1;
+
+ case BRIG_TYPE_S16:
+ case BRIG_TYPE_U16:
+ case BRIG_TYPE_B16:
+ case BRIG_TYPE_F16:
+ return 2;
+
+ case BRIG_TYPE_S32:
+ case BRIG_TYPE_U32:
+ case BRIG_TYPE_B32:
+ case BRIG_TYPE_F32:
+ return 4;
+
+ case BRIG_TYPE_S64:
+ case BRIG_TYPE_U64:
+ case BRIG_TYPE_B64:
+ case BRIG_TYPE_F64:
+ return 8;
+
+ case BRIG_TYPE_B1:
+
+ default:
+ fatal("unhandled symbol data type %d", t);
+ return 0;
+ }
+}
+
+StorageElement*
+StorageSpace::addSymbol(const BrigDirectiveVariable *sym,
+ const BrigObject *obj)
+{
+ const char *sym_name = obj->getString(sym->name);
+ uint64_t size = 0;
+ uint64_t offset = 0;
+
+ if (sym->type & BRIG_TYPE_ARRAY) {
+ size = getBrigDataTypeBytes(sym->type & ~BRIG_TYPE_ARRAY);
+ size *= (((uint64_t)sym->dim.hi) << 32 | (uint64_t)sym->dim.lo);
+
+ offset = roundUp(nextOffset, getBrigDataTypeBytes(sym->type &
+ ~BRIG_TYPE_ARRAY));
+ } else {
+ size = getBrigDataTypeBytes(sym->type);
+ offset = roundUp(nextOffset, getBrigDataTypeBytes(sym->type));
+ }
+
+ nextOffset = offset + size;
+
+ DPRINTF(HSAILObject, "Adding %s SYMBOL %s size %d offset 0x%x, init: %d\n",
+ segmentNames[segment], sym_name, size, offset, sym->init);
+
+ StorageElement* se = new StorageElement(sym_name, offset, size, sym);
+ elements.push_back(se);
+ elements_by_addr.insert(AddrRange(offset, offset + size - 1), se);
+ elements_by_brigptr[sym] = se;
+
+ return se;
+}
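+
+// Worked example of the offset/size arithmetic above: after a plain u8
+// symbol (size 1, placed at offset 0, nextOffset becomes 1), a u32 array
+// with dim = 10 has element size 4, so its offset is rounded up to 4, its
+// size is 4 * 10 = 40, and nextOffset advances to 44.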
+
+StorageElement*
+StorageSpace::findSymbol(std::string name)
+{
+ for (auto it : elements) {
+ if (it->name == name) {
+ return it;
+ }
+ }
+
+ return nullptr;
+}
+
+StorageElement*
+StorageSpace::findSymbol(uint64_t addr)
+{
+ assert(elements_by_addr.size() > 0);
+
+ auto se = elements_by_addr.find(addr);
+
+ if (se == elements_by_addr.end()) {
+ return nullptr;
+ } else {
+ return se->second;
+ }
+}
+
+StorageElement*
+StorageSpace::findSymbol(const BrigDirectiveVariable *brigptr)
+{
+ assert(elements_by_brigptr.size() > 0);
+
+ auto se = elements_by_brigptr.find(brigptr);
+
+ if (se == elements_by_brigptr.end()) {
+ return nullptr;
+ } else {
+ return se->second;
+ }
+}
+
+StorageMap::StorageMap(StorageMap *outerScope)
+ : outerScopeMap(outerScope)
+{
+ for (int i = 0; i < NumSegments; ++i)
+ space[i] = new StorageSpace((BrigSegment)i);
+}
+
+StorageElement*
+StorageMap::addSymbol(const BrigDirectiveVariable *sym, const BrigObject *obj)
+{
+ BrigSegment8_t segment = sym->segment;
+
+ assert(segment >= Brig::BRIG_SEGMENT_FLAT);
+ assert(segment < NumSegments);
+
+ return space[segment]->addSymbol(sym, obj);
+}
+
+int
+StorageMap::getSize(Brig::BrigSegment segment)
+{
+ assert(segment > Brig::BRIG_SEGMENT_GLOBAL);
+ assert(segment < NumSegments);
+
+ if (segment != Brig::BRIG_SEGMENT_GROUP &&
+ segment != Brig::BRIG_SEGMENT_READONLY) {
+ return space[segment]->getSize();
+ } else {
+ int ret = space[segment]->getSize();
+
+ if (outerScopeMap) {
+ ret += outerScopeMap->getSize(segment);
+ }
+
+ return ret;
+ }
+}
+
+void
+StorageMap::resetOffset(Brig::BrigSegment segment)
+{
+ space[segment]->resetOffset();
+}
+
+StorageElement*
+StorageMap::findSymbol(BrigSegment segment, std::string name)
+{
+ StorageElement *se = space[segment]->findSymbol(name);
+
+ if (se)
+ return se;
+
+ if (outerScopeMap)
+ return outerScopeMap->findSymbol(segment, name);
+
+ return nullptr;
+}
+
+StorageElement*
+StorageMap::findSymbol(Brig::BrigSegment segment, uint64_t addr)
+{
+ StorageSpace *sp = space[segment];
+
+ if (!sp) {
+        // no storage was allocated in this segment
+ return nullptr;
+ }
+
+ StorageElement *se = sp->findSymbol(addr);
+
+ if (se)
+ return se;
+
+ if (outerScopeMap)
+ return outerScopeMap->findSymbol(segment, addr);
+
+    return nullptr;
+}
+
+StorageElement*
+StorageMap::findSymbol(Brig::BrigSegment segment,
+ const BrigDirectiveVariable *brigptr)
+{
+ StorageSpace *sp = space[segment];
+
+ if (!sp) {
+        // no storage was allocated in this segment
+ return nullptr;
+ }
+
+ StorageElement *se = sp->findSymbol(brigptr);
+
+ if (se)
+ return se;
+
+ if (outerScopeMap)
+ return outerScopeMap->findSymbol(segment, brigptr);
+
+    return nullptr;
+}
diff --git a/src/gpu-compute/hsail_code.hh b/src/gpu-compute/hsail_code.hh
new file mode 100644
index 000000000..d9fbcc577
--- /dev/null
+++ b/src/gpu-compute/hsail_code.hh
@@ -0,0 +1,447 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#ifndef __HSAIL_CODE_HH__
+#define __HSAIL_CODE_HH__
+
+#include <cassert>
+#include <list>
+#include <map>
+#include <string>
+#include <vector>
+
+#include "arch/gpu_decoder.hh"
+#include "arch/hsail/Brig.h"
+#include "base/addr_range_map.hh"
+#include "base/intmath.hh"
+#include "config/the_gpu_isa.hh"
+#include "gpu-compute/hsa_code.hh"
+#include "gpu-compute/hsa_kernel_info.hh"
+#include "gpu-compute/misc.hh"
+
+class BrigObject;
+class GPUStaticInst;
+
+inline int
+popcount(uint64_t src, int sz)
+{
+ int cnt = 0;
+
+ for (int i = 0; i < sz; ++i) {
+ if (src & 1)
+ ++cnt;
+ src >>= 1;
+ }
+
+ return cnt;
+}
+
+inline int
+firstbit(uint64_t src, int sz)
+{
+ int i;
+
+ for (i = 0; i < sz; ++i) {
+ if (src & 1)
+ break;
+ src >>= 1;
+ }
+
+ return i;
+}
+
+inline int
+lastbit(uint64_t src, int sz)
+{
+ int i0 = -1;
+
+ for (int i = 0; i < sz; ++i) {
+ if (src & 1)
+ i0 = i;
+ src >>= 1;
+ }
+
+ return i0;
+}
+
+inline int
+signbit(uint64_t src, int sz)
+{
+ int i0 = -1;
+
+    if (src & (1ULL << (sz - 1))) {
+ for (int i = 0; i < sz - 1; ++i) {
+ if (!(src & 1))
+ i0 = i;
+ src >>= 1;
+ }
+ } else {
+ for (int i = 0; i < sz - 1; ++i) {
+ if (src & 1)
+ i0 = i;
+ src >>= 1;
+ }
+ }
+
+ return i0;
+}
+
+inline uint64_t
+bitrev(uint64_t src, int sz)
+{
+ uint64_t r = 0;
+
+ for (int i = 0; i < sz; ++i) {
+ r <<= 1;
+ if (src & 1)
+ r |= 1;
+ src >>= 1;
+ }
+
+ return r;
+}
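+
+// Quick sanity check of the bit helpers above:
+//   bitrev(0b0011, 4) == 0b1100   and   firstbit(0b0100, 8) == 2.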
+
+inline uint64_t
+mul_hi(uint32_t a, uint32_t b)
+{
+ return ((uint64_t)a * (uint64_t)b) >> 32;
+}
+
+inline uint64_t
+mul_hi(int32_t a, int32_t b)
+{
+ return ((int64_t)a * (int64_t)b) >> 32;
+}
+
+inline uint64_t
+mul_hi(uint64_t a, uint64_t b)
+{
+ return ((uint64_t)a * (uint64_t)b) >> 32;
+}
+
+inline uint64_t
+mul_hi(int64_t a, int64_t b)
+{
+ return ((int64_t)a * (int64_t)b) >> 32;
+}
+
+inline uint64_t
+mul_hi(double a, double b)
+{
+ return 0;
+}
+
+class Label
+{
+ public:
+ std::string name;
+ int value;
+
+ Label() : value(-1)
+ {
+ }
+
+ bool defined() { return value != -1; }
+
+ void
+ checkName(std::string &_name)
+ {
+ if (name.empty()) {
+ name = _name;
+ } else {
+ assert(name == _name);
+ }
+ }
+
+ void
+ define(std::string &_name, int _value)
+ {
+ assert(!defined());
+ assert(_value != -1);
+ value = _value;
+ checkName(_name);
+ }
+
+ int
+ get()
+ {
+ assert(defined());
+ return value;
+ }
+};
+
+class LabelMap
+{
+ std::map<std::string, Label> map;
+
+ public:
+ LabelMap() { }
+
+ void addLabel(const Brig::BrigDirectiveLabel *lbl, int inst_index,
+ const BrigObject *obj);
+
+ Label *refLabel(const Brig::BrigDirectiveLabel *lbl,
+ const BrigObject *obj);
+};
+
+const int NumSegments = Brig::BRIG_SEGMENT_AMD_GCN;
+
+extern const char *segmentNames[];
+
+class StorageElement
+{
+ public:
+ std::string name;
+ uint64_t offset;
+
+ uint64_t size;
+ const Brig::BrigDirectiveVariable *brigSymbol;
+ StorageElement(const char *_name, uint64_t _offset, int _size,
+ const Brig::BrigDirectiveVariable *sym)
+ : name(_name), offset(_offset), size(_size), brigSymbol(sym)
+ {
+ }
+};
+
+class StorageSpace
+{
+ typedef std::map<const Brig::BrigDirectiveVariable*, StorageElement*>
+ DirVarToSE_map;
+
+ std::list<StorageElement*> elements;
+ AddrRangeMap<StorageElement*> elements_by_addr;
+ DirVarToSE_map elements_by_brigptr;
+
+ uint64_t nextOffset;
+ Brig::BrigSegment segment;
+
+ public:
+ StorageSpace(Brig::BrigSegment _class)
+ : nextOffset(0), segment(_class)
+ {
+ }
+
+ StorageElement *addSymbol(const Brig::BrigDirectiveVariable *sym,
+ const BrigObject *obj);
+
+ StorageElement* findSymbol(std::string name);
+ StorageElement* findSymbol(uint64_t addr);
+ StorageElement* findSymbol(const Brig::BrigDirectiveVariable *brigptr);
+
+ int getSize() { return nextOffset; }
+ void resetOffset() { nextOffset = 0; }
+};
+
+class StorageMap
+{
+ StorageMap *outerScopeMap;
+ StorageSpace *space[NumSegments];
+
+ public:
+ StorageMap(StorageMap *outerScope = nullptr);
+
+ StorageElement *addSymbol(const Brig::BrigDirectiveVariable *sym,
+ const BrigObject *obj);
+
+ StorageElement* findSymbol(Brig::BrigSegment segment, std::string name);
+ StorageElement* findSymbol(Brig::BrigSegment segment, uint64_t addr);
+
+ StorageElement* findSymbol(Brig::BrigSegment segment,
+ const Brig::BrigDirectiveVariable *brigptr);
+
+ // overloaded version to avoid casting
+ StorageElement*
+ findSymbol(Brig::BrigSegment8_t segment, std::string name)
+ {
+ return findSymbol((Brig::BrigSegment)segment, name);
+ }
+
+ int getSize(Brig::BrigSegment segment);
+ void resetOffset(Brig::BrigSegment segment);
+};
+
+typedef enum
+{
+ BT_DEFAULT,
+ BT_B8,
+ BT_U8,
+ BT_U16,
+ BT_U32,
+ BT_U64,
+ BT_S8,
+ BT_S16,
+ BT_S32,
+ BT_S64,
+ BT_F16,
+ BT_F32,
+ BT_F64,
+ BT_NULL
+} base_type_e;
+
+/* @class HsailCode
+ * The HsailCode class stores information about HSA kernels encoded in
+ * the BRIG format. It holds all information about a kernel, function,
+ * or variable symbol and provides methods for accessing that
+ * information.
+ */
+
+class HsailCode final : public HsaCode
+{
+ public:
+ TheGpuISA::Decoder decoder;
+
+ StorageMap *storageMap;
+ LabelMap labelMap;
+ uint32_t kernarg_start;
+ uint32_t kernarg_end;
+ int32_t private_size;
+
+ int32_t readonly_size;
+
+ // We track the maximum register index used for each register
+ // class when we load the code so we can size the register files
+ // appropriately (i.e., one more than the max index).
+ uint32_t max_creg; // maximum c-register index
+ uint32_t max_sreg; // maximum s-register index
+ uint32_t max_dreg; // maximum d-register index
+
+ HsailCode(const std::string &name_str,
+ const Brig::BrigDirectiveExecutable *code_dir,
+ const BrigObject *obj,
+ StorageMap *objStorageMap);
+
+ // this version is used to create a placeholder when
+ // we encounter a kernel-related directive before the
+ // kernel itself
+ HsailCode(const std::string &name_str);
+
+ void init(const Brig::BrigDirectiveExecutable *code_dir,
+ const BrigObject *obj, StorageMap *objStorageMap);
+
+ void
+ generateHsaKernelInfo(HsaKernelInfo *hsaKernelInfo) const
+ {
+ hsaKernelInfo->sRegCount = max_sreg + 1;
+ hsaKernelInfo->dRegCount = max_dreg + 1;
+ hsaKernelInfo->cRegCount = max_creg + 1;
+
+ hsaKernelInfo->static_lds_size = getSize(Brig::BRIG_SEGMENT_GROUP);
+
+ hsaKernelInfo->private_mem_size =
+ roundUp(getSize(Brig::BRIG_SEGMENT_PRIVATE), 8);
+
+ hsaKernelInfo->spill_mem_size =
+ roundUp(getSize(Brig::BRIG_SEGMENT_SPILL), 8);
+ }
+
+ int
+ getSize(MemorySegment segment) const
+ {
+ Brig::BrigSegment brigSeg;
+
+ switch (segment) {
+ case MemorySegment::NONE:
+ brigSeg = Brig::BRIG_SEGMENT_NONE;
+ break;
+ case MemorySegment::FLAT:
+ brigSeg = Brig::BRIG_SEGMENT_FLAT;
+ break;
+ case MemorySegment::GLOBAL:
+ brigSeg = Brig::BRIG_SEGMENT_GLOBAL;
+ break;
+ case MemorySegment::READONLY:
+ brigSeg = Brig::BRIG_SEGMENT_READONLY;
+ break;
+ case MemorySegment::KERNARG:
+ brigSeg = Brig::BRIG_SEGMENT_KERNARG;
+ break;
+ case MemorySegment::GROUP:
+ brigSeg = Brig::BRIG_SEGMENT_GROUP;
+ break;
+ case MemorySegment::PRIVATE:
+ brigSeg = Brig::BRIG_SEGMENT_PRIVATE;
+ break;
+ case MemorySegment::SPILL:
+ brigSeg = Brig::BRIG_SEGMENT_SPILL;
+ break;
+ case MemorySegment::ARG:
+ brigSeg = Brig::BRIG_SEGMENT_ARG;
+ break;
+ case MemorySegment::EXTSPACE0:
+ brigSeg = Brig::BRIG_SEGMENT_AMD_GCN;
+ break;
+ default:
+ fatal("Unknown BrigSegment type.\n");
+ }
+
+ return getSize(brigSeg);
+ }
+
+ private:
+ int
+ getSize(Brig::BrigSegment segment) const
+ {
+ if (segment == Brig::BRIG_SEGMENT_PRIVATE) {
+            // With code generated by the newer HSA compiler this
+            // assertion no longer holds, so it is disabled:
+ //assert(private_size != -1);
+ return private_size;
+ } else {
+ return storageMap->getSize(segment);
+ }
+ }
+
+ public:
+ StorageElement*
+ findSymbol(Brig::BrigSegment segment, uint64_t addr)
+ {
+ return storageMap->findSymbol(segment, addr);
+ }
+
+ void
+ setPrivateSize(int32_t _private_size)
+ {
+ private_size = _private_size;
+ }
+
+ Label*
+ refLabel(const Brig::BrigDirectiveLabel *lbl, const BrigObject *obj)
+ {
+ return labelMap.refLabel(lbl, obj);
+ }
+};
+
+#endif // __HSAIL_CODE_HH__
diff --git a/src/gpu-compute/kernel_cfg.cc b/src/gpu-compute/kernel_cfg.cc
new file mode 100644
index 000000000..7e0e10912
--- /dev/null
+++ b/src/gpu-compute/kernel_cfg.cc
@@ -0,0 +1,296 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#include "gpu-compute/kernel_cfg.hh"
+
+#include <algorithm>
+#include <cassert>
+#include <cstdio>
+#include <cstring>
+#include <iostream>
+#include <iterator>
+#include <map>
+#include <string>
+
+#include "gpu-compute/gpu_static_inst.hh"
+
+void
+ControlFlowInfo::assignImmediatePostDominators(
+ const std::vector<GPUStaticInst*>& instructions)
+{
+ ControlFlowInfo cfg(instructions);
+ cfg.findImmediatePostDominators();
+}
+
+
+ControlFlowInfo::ControlFlowInfo(const std::vector<GPUStaticInst*>& insts) :
+ instructions(insts)
+{
+ createBasicBlocks();
+ connectBasicBlocks();
+}
+
+BasicBlock*
+ControlFlowInfo::basicBlock(int inst_num) const {
+ for (auto& block: basicBlocks) {
+ int first_block_id = block->firstInstruction->instNum();
+ if (inst_num >= first_block_id &&
+ inst_num < first_block_id + block->size) {
+ return block.get();
+ }
+ }
+ return nullptr;
+}
+
+
+GPUStaticInst*
+ControlFlowInfo::lastInstruction(const BasicBlock* block) const
+{
+ if (block->isExit()) {
+ return nullptr;
+ }
+
+ return instructions.at(block->firstInstruction->instNum() +
+ block->size - 1);
+}
+
+BasicBlock*
+ControlFlowInfo::postDominator(const BasicBlock* block) const
+{
+ if (block->isExit()) {
+ return nullptr;
+ }
+ return basicBlock(lastInstruction(block)->ipdInstNum());
+}
+
+void
+ControlFlowInfo::createBasicBlocks()
+{
+ assert(!instructions.empty());
+ std::set<int> leaders;
+ // first instruction is a leader
+ leaders.insert(0);
+ for (int i = 1; i < instructions.size(); i++) {
+ GPUStaticInst* instruction = instructions[i];
+ if (instruction->o_type == Enums::OT_BRANCH) {
+ const int target_pc = instruction->getTargetPc();
+ leaders.insert(target_pc);
+ leaders.insert(i + 1);
+ }
+ }
+
+ size_t block_size = 0;
+ for (int i = 0; i < instructions.size(); i++) {
+ if (leaders.find(i) != leaders.end()) {
+ uint32_t id = basicBlocks.size();
+ if (id > 0) {
+ basicBlocks.back()->size = block_size;
+ }
+ block_size = 0;
+ basicBlocks.emplace_back(new BasicBlock(id, instructions[i]));
+ }
+ block_size++;
+ }
+ basicBlocks.back()->size = block_size;
+ // exit basic block
+ basicBlocks.emplace_back(new BasicBlock(basicBlocks.size(), nullptr));
+}
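+
+// Example of the leader-based split above: for instructions 0..5 with a
+// single branch at index 2 whose target is 5, the leader set is {0, 3, 5},
+// giving blocks [0-2], [3-4], and [5], followed by the synthetic (empty)
+// exit block.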
+
+void
+ControlFlowInfo::connectBasicBlocks()
+{
+ BasicBlock* exit_bb = basicBlocks.back().get();
+ for (auto& bb : basicBlocks) {
+ if (bb->isExit()) {
+ break;
+ }
+ GPUStaticInst* last = lastInstruction(bb.get());
+ if (last->o_type == Enums::OT_RET) {
+ bb->successorIds.insert(exit_bb->id);
+ break;
+ }
+ if (last->o_type == Enums::OT_BRANCH) {
+ const uint32_t target_pc = last->getTargetPc();
+ BasicBlock* target_bb = basicBlock(target_pc);
+ bb->successorIds.insert(target_bb->id);
+ }
+
+ // Unconditional jump instructions have a unique successor
+ if (!last->unconditionalJumpInstruction()) {
+ BasicBlock* next_bb = basicBlock(last->instNum() + 1);
+ bb->successorIds.insert(next_bb->id);
+ }
+ }
+}
+
+
+// In-place set intersection
+static void
+intersect(std::set<uint32_t>& a, const std::set<uint32_t>& b)
+{
+ std::set<uint32_t>::iterator it = a.begin();
+ while (it != a.end()) {
+ it = b.find(*it) != b.end() ? ++it : a.erase(it);
+ }
+}
+
+
+void
+ControlFlowInfo::findPostDominators()
+{
+ // the only postdominator of the exit block is itself
+ basicBlocks.back()->postDominatorIds.insert(basicBlocks.back()->id);
+ //copy all basic blocks to all postdominator lists except for exit block
+ for (auto& block : basicBlocks) {
+ if (!block->isExit()) {
+ for (uint32_t i = 0; i < basicBlocks.size(); i++) {
+ block->postDominatorIds.insert(i);
+ }
+ }
+ }
+
+ bool change = true;
+ while (change) {
+ change = false;
+ for (int h = basicBlocks.size() - 2; h >= 0; --h) {
+ size_t num_postdominators =
+ basicBlocks[h]->postDominatorIds.size();
+ for (int s : basicBlocks[h]->successorIds) {
+ intersect(basicBlocks[h]->postDominatorIds,
+ basicBlocks[s]->postDominatorIds);
+ }
+ basicBlocks[h]->postDominatorIds.insert(h);
+ change |= (num_postdominators
+ != basicBlocks[h]->postDominatorIds.size());
+ }
+ }
+}
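+
+// In effect, the loop above computes the standard backward dataflow fixed
+// point:
+//   PD(exit) = {exit}
+//   PD(b)    = {b} union ( intersection over successors s of PD(s) )
+// For a diamond A -> {B, C}, B -> D, C -> D, D -> exit, this converges to
+// PD(B) = {B, D, exit}, PD(C) = {C, D, exit}, PD(A) = {A, D, exit}, so D is
+// A's immediate post-dominator (extracted by findImmediatePostDominators()).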
+
+
+// In-place set difference
+static void
+setDifference(std::set<uint32_t>&a,
+ const std::set<uint32_t>& b, uint32_t exception)
+{
+ for (uint32_t b_elem : b) {
+ if (b_elem != exception) {
+ a.erase(b_elem);
+ }
+ }
+}
+
+void
+ControlFlowInfo::findImmediatePostDominators()
+{
+ assert(basicBlocks.size() > 1); // Entry and exit blocks must be present
+
+ findPostDominators();
+
+ for (auto& basicBlock : basicBlocks) {
+ if (basicBlock->isExit()) {
+ continue;
+ }
+ std::set<uint32_t> candidates = basicBlock->postDominatorIds;
+ candidates.erase(basicBlock->id);
+ for (uint32_t postDominatorId : basicBlock->postDominatorIds) {
+ if (postDominatorId != basicBlock->id) {
+ setDifference(candidates,
+ basicBlocks[postDominatorId]->postDominatorIds,
+ postDominatorId);
+ }
+ }
+ assert(candidates.size() == 1);
+ GPUStaticInst* last_instruction = lastInstruction(basicBlock.get());
+ BasicBlock* ipd_block = basicBlocks[*(candidates.begin())].get();
+ if (!ipd_block->isExit()) {
+ GPUStaticInst* ipd_first_inst = ipd_block->firstInstruction;
+ last_instruction->ipdInstNum(ipd_first_inst->instNum());
+ } else {
+ last_instruction->ipdInstNum(last_instruction->instNum() + 1);
+ }
+ }
+}
+
+void
+ControlFlowInfo::printPostDominators() const
+{
+ for (auto& block : basicBlocks) {
+ std::cout << "PD(" << block->id << ") = {";
+ std::copy(block->postDominatorIds.begin(),
+ block->postDominatorIds.end(),
+ std::ostream_iterator<uint32_t>(std::cout, ", "));
+ std::cout << "}" << std::endl;
+ }
+}
+
+void
+ControlFlowInfo::printImmediatePostDominators() const
+{
+ for (const auto& block : basicBlocks) {
+ if (block->isExit()) {
+ continue;
+ }
+ std::cout << "IPD(" << block->id << ") = ";
+ std::cout << postDominator(block.get())->id << ", ";
+ }
+ std::cout << std::endl;
+}
+
+void
+ControlFlowInfo::printBasicBlocks() const
+{
+ for (GPUStaticInst* inst : instructions) {
+ int inst_num = inst->instNum();
+ std::cout << inst_num << " [" << basicBlock(inst_num)->id
+ << "]: " << inst->disassemble();
+ if (inst->o_type == Enums::OT_BRANCH) {
+ std::cout << ", PC = " << inst->getTargetPc();
+ }
+ std::cout << std::endl;
+ }
+}
+
+void
+ControlFlowInfo::printBasicBlockDot() const
+{
+ printf("digraph {\n");
+ for (const auto& basic_block : basicBlocks) {
+ printf("\t");
+ for (uint32_t successorId : basic_block->successorIds) {
+ printf("%d -> %d; ", basic_block->id, successorId);
+ }
+ printf("\n");
+ }
+ printf("}\n");
+}
diff --git a/src/gpu-compute/kernel_cfg.hh b/src/gpu-compute/kernel_cfg.hh
new file mode 100644
index 000000000..74ea861d8
--- /dev/null
+++ b/src/gpu-compute/kernel_cfg.hh
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#ifndef __KERNEL_CFG_HH__
+#define __KERNEL_CFG_HH__
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <set>
+#include <vector>
+
+
+class GPUStaticInst;
+class HsailCode;
+
+struct BasicBlock
+{
+ BasicBlock(uint32_t num, GPUStaticInst* begin) :
+ id(num), size(0), firstInstruction(begin)
+ {
+ }
+
+ bool
+ isEntry() const
+ {
+ return !id;
+ }
+
+ bool
+ isExit() const
+ {
+ return !size;
+ }
+
+ /**
+ * Unique identifier for the block within a given kernel.
+ */
+ const uint32_t id;
+
+ /**
+ * Number of instructions contained in the block
+ */
+ size_t size;
+
+ /**
+ * Pointer to first instruction of the block.
+ */
+ GPUStaticInst* firstInstruction;
+
+ /**
+     * Identifiers of this block's immediate successors in the control
+     * flow graph.
+ */
+ std::set<uint32_t> successorIds;
+
+ /**
+     * Identifiers of the blocks that post-dominate this block, i.e.,
+     * blocks through which every path from this block to the exit must
+     * pass.
+ */
+ std::set<uint32_t> postDominatorIds;
+};
+
+class ControlFlowInfo
+{
+public:
+
+ /**
+ * Compute immediate post-dominator instruction for kernel instructions.
+ */
+ static void assignImmediatePostDominators(
+ const std::vector<GPUStaticInst*>& instructions);
+
+private:
+ ControlFlowInfo(const std::vector<GPUStaticInst*>& instructions);
+
+ GPUStaticInst* lastInstruction(const BasicBlock* block) const;
+
+ BasicBlock* basicBlock(int inst_num) const;
+
+ BasicBlock* postDominator(const BasicBlock* block) const;
+
+ void createBasicBlocks();
+
+ void connectBasicBlocks();
+
+ void findPostDominators();
+
+ void findImmediatePostDominators();
+
+ void printBasicBlocks() const;
+
+ void printBasicBlockDot() const;
+
+ void printPostDominators() const;
+
+ void printImmediatePostDominators() const;
+
+ std::vector<std::unique_ptr<BasicBlock>> basicBlocks;
+ std::vector<GPUStaticInst*> instructions;
+};
+
+#endif // __KERNEL_CFG_HH__
diff --git a/src/gpu-compute/lds_state.cc b/src/gpu-compute/lds_state.cc
new file mode 100644
index 000000000..91ee8009a
--- /dev/null
+++ b/src/gpu-compute/lds_state.cc
@@ -0,0 +1,341 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos, Joe Gross
+ */
+
+#include "gpu-compute/lds_state.hh"
+
+#include <array>
+#include <cstdio>
+#include <cstdlib>
+
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/shader.hh"
+
+/**
+ * the params-based constructor used by the SWIG/Python config system
+ */
+LdsState::LdsState(const Params *params) :
+ MemObject(params),
+ tickEvent(this),
+ cuPort(name() + ".port", this),
+ maximumSize(params->size),
+ range(params->range),
+ bankConflictPenalty(params->bankConflictPenalty),
+ banks(params->banks)
+{
+ fatal_if(params->banks <= 0,
+             "Number of LDS banks should be a positive number");
+ fatal_if((params->banks & (params->banks - 1)) != 0,
+ "Number of LDS banks should be a power of 2");
+ fatal_if(params->size <= 0,
+ "cannot allocate an LDS with a size less than 1");
+ fatal_if(params->size % 2,
+             "the LDS size should be an even number");
+}
+
+/**
+ * Needed by the SWIG compiler
+ */
+LdsState *
+LdsStateParams::create()
+{
+ return new LdsState(this);
+}
+
+/**
+ * set the parent and name based on the parent
+ */
+void
+LdsState::setParent(ComputeUnit *x_parent)
+{
+ // check that this gets assigned to the same thing each time
+ fatal_if(!x_parent, "x_parent should not be nullptr");
+ fatal_if(x_parent == parent,
+ "should not be setting the parent twice");
+
+ parent = x_parent;
+ _name = x_parent->name() + ".LdsState";
+}
+
+/**
+ * walk the packet's sender state to recover the GPU dynamic instruction,
+ * then count the bank conflicts for that instruction's accesses
+ */
+unsigned
+LdsState::countBankConflicts(PacketPtr packet, unsigned *bankAccesses)
+{
+ Packet::SenderState *baseSenderState = packet->senderState;
+ while (baseSenderState->predecessor) {
+ baseSenderState = baseSenderState->predecessor;
+ }
+ const ComputeUnit::LDSPort::SenderState *senderState =
+ dynamic_cast<ComputeUnit::LDSPort::SenderState *>(baseSenderState);
+
+ fatal_if(!senderState,
+ "did not get the right sort of sender state");
+
+ GPUDynInstPtr gpuDynInst = senderState->getMemInst();
+
+ return countBankConflicts(gpuDynInst, bankAccesses);
+}
+
+// Count the total number of bank conflicts for the local memory packet
+unsigned
+LdsState::countBankConflicts(GPUDynInstPtr gpuDynInst,
+ unsigned *numBankAccesses)
+{
+ int bank_conflicts = 0;
+ std::vector<int> bank;
+ // the number of LDS banks being touched by the memory instruction
+ int numBanks = std::min(parent->wfSize(), banks);
+ // if the wavefront size is larger than the number of LDS banks, we
+ // need to iterate over all work items to calculate the total
+ // number of bank conflicts
+ int groups = (parent->wfSize() > numBanks) ?
+ (parent->wfSize() / numBanks) : 1;
+ for (int i = 0; i < groups; i++) {
+ // Address Array holding all the work item addresses of an instruction
+ std::vector<Addr> addr_array;
+ addr_array.resize(numBanks, 0);
+ bank.clear();
+ bank.resize(banks, 0);
+ int max_bank = 0;
+
+ // populate the address array for all active work items
+ for (int j = 0; j < numBanks; j++) {
+ if (gpuDynInst->exec_mask[(i*numBanks)+j]) {
+ addr_array[j] = gpuDynInst->addr[(i*numBanks)+j];
+ } else {
+ addr_array[j] = std::numeric_limits<Addr>::max();
+ }
+ }
+
+ if (gpuDynInst->m_op == Enums::MO_LD ||
+ gpuDynInst->m_op == Enums::MO_ST) {
+ // mask identical addresses
+ for (int j = 0; j < numBanks; ++j) {
+ for (int j0 = 0; j0 < j; j0++) {
+ if (addr_array[j] != std::numeric_limits<Addr>::max()
+ && addr_array[j] == addr_array[j0]) {
+ addr_array[j] = std::numeric_limits<Addr>::max();
+ }
+ }
+ }
+ }
+ // calculate bank conflicts
+ for (int j = 0; j < numBanks; ++j) {
+ if (addr_array[j] != std::numeric_limits<Addr>::max()) {
+ int bankId = addr_array[j] % banks;
+ bank[bankId]++;
+ max_bank = std::max(max_bank, bank[bankId]);
+ // Count the number of LDS banks accessed.
+ // Since we have masked identical addresses all remaining
+ // accesses will need to be serialized if they access
+ // the same bank (bank conflict).
+ (*numBankAccesses)++;
+ }
+ }
+ bank_conflicts += max_bank;
+ }
+ panic_if(bank_conflicts > parent->wfSize(),
+ "Max bank conflicts should match num of work items per instr");
+ return bank_conflicts;
+}
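+
+// Worked example (illustrative): with 4 banks and one group of four active
+// lanes, addresses {0, 1, 2, 1} map to banks {0, 1, 2, 1}; for a load or
+// store the duplicate address 1 is masked, each remaining bank is touched
+// once, and the group contributes max_bank = 1 (no serialization). By
+// contrast, addresses {0, 4, 8, 12} all map to bank 0, so max_bank = 4 and
+// the access is fully serialized. processPacket() samples
+// (bank_conflicts - 1), so a result of 1 is recorded as zero conflicts.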
+
+/**
+ * receive the packet from the CU
+ */
+bool
+LdsState::CuSidePort::recvTimingReq(PacketPtr packet)
+{
+ return ownerLds->processPacket(packet);
+}
+
+GPUDynInstPtr
+LdsState::getDynInstr(PacketPtr packet)
+{
+ ComputeUnit::LDSPort::SenderState *ss =
+ dynamic_cast<ComputeUnit::LDSPort::SenderState *>(
+ packet->senderState);
+ return ss->getMemInst();
+}
+
+/**
+ * process an incoming packet, add it to the return queue
+ */
+bool
+LdsState::processPacket(PacketPtr packet)
+{
+ unsigned bankAccesses = 0;
+ // the number of conflicts this packet will have when accessing the LDS
+ unsigned bankConflicts = countBankConflicts(packet, &bankAccesses);
+ // count the total number of physical LDS bank accessed
+ parent->ldsBankAccesses += bankAccesses;
+    // record the LDS bank conflicts; a value of 1 means at most one
+    // access per bank, i.e., no conflicts, hence the -1 below
+ parent->ldsBankConflictDist.sample(bankConflicts-1);
+
+ GPUDynInstPtr dynInst = getDynInstr(packet);
+ // account for the LDS bank conflict overhead
+ int busLength = (dynInst->m_op == Enums::MO_LD) ? parent->loadBusLength() :
+ (dynInst->m_op == Enums::MO_ST) ? parent->storeBusLength() :
+ parent->loadBusLength();
+ // delay for accessing the LDS
+ Tick processingTime =
+ parent->shader->ticks(bankConflicts * bankConflictPenalty) +
+ parent->shader->ticks(busLength);
+ // choose (delay + last packet in queue) or (now + delay) as the time to
+ // return this
+ Tick doneAt = earliestReturnTime() + processingTime;
+ // then store it for processing
+ return returnQueuePush(std::make_pair(doneAt, packet));
+}
+
+/**
+ * add this to the queue of packets to be returned
+ */
+bool
+LdsState::returnQueuePush(std::pair<Tick, PacketPtr> thePair)
+{
+ // TODO add time limits (e.g. one packet per cycle) and queue size limits
+ // and implement flow control
+ returnQueue.push(thePair);
+
+ // if there is no set wakeup time, look through the queue
+ if (!tickEvent.scheduled()) {
+ process();
+ }
+
+ return true;
+}
+
+/**
+ * receive a packet in functional mode
+ */
+void
+LdsState::CuSidePort::recvFunctional(PacketPtr pkt)
+{
+ fatal("not implemented");
+}
+
+/**
+ * receive a retry for a response
+ */
+void
+LdsState::CuSidePort::recvRespRetry()
+{
+ // TODO verify that this is the right way to do this
+ assert(ownerLds->isRetryResp());
+ ownerLds->setRetryResp(false);
+ ownerLds->process();
+}
+
+/**
+ * receive a retry
+ */
+void
+LdsState::CuSidePort::recvRetry()
+{
+ fatal("not implemented");
+}
+
+/**
+ * look for packets to return at this time
+ */
+bool
+LdsState::process()
+{
+ Tick now = clockEdge();
+
+ // send back completed packets
+ while (!returnQueue.empty() && returnQueue.front().first <= now) {
+ PacketPtr packet = returnQueue.front().second;
+
+ ComputeUnit::LDSPort::SenderState *ss =
+ dynamic_cast<ComputeUnit::LDSPort::SenderState *>(
+ packet->senderState);
+
+ GPUDynInstPtr gpuDynInst = ss->getMemInst();
+
+ gpuDynInst->initiateAcc(gpuDynInst);
+
+ packet->makeTimingResponse();
+
+ returnQueue.pop();
+
+ bool success = cuPort.sendTimingResp(packet);
+
+ if (!success) {
+ retryResp = true;
+ panic("have not handled timing responses being NACK'd when sent"
+ "back");
+ }
+ }
+
+ // determine the next wakeup time
+ if (!returnQueue.empty()) {
+
+ Tick next = returnQueue.front().first;
+
+ if (tickEvent.scheduled()) {
+
+ if (next < tickEvent.when()) {
+
+ tickEvent.deschedule();
+ tickEvent.schedule(next);
+ }
+ } else {
+ tickEvent.schedule(next);
+ }
+ }
+
+ return true;
+}
+
+/**
+ * wake up at this time and perform specified actions
+ */
+void
+LdsState::TickEvent::process()
+{
+ ldsState->process();
+}
+
+/**
+ * register LDS statistics (currently none)
+ */
+void
+LdsState::regStats()
+{
+}
diff --git a/src/gpu-compute/lds_state.hh b/src/gpu-compute/lds_state.hh
new file mode 100644
index 000000000..89f08a1d3
--- /dev/null
+++ b/src/gpu-compute/lds_state.hh
@@ -0,0 +1,512 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos, Joe Gross
+ */
+
+#ifndef __LDS_STATE_HH__
+#define __LDS_STATE_HH__
+
+#include <array>
+#include <queue>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "enums/MemOpType.hh"
+#include "enums/MemType.hh"
+#include "gpu-compute/misc.hh"
+#include "mem/mem_object.hh"
+#include "mem/port.hh"
+#include "params/LdsState.hh"
+
+class ComputeUnit;
+
+/**
+ * this represents a slice of the overall LDS, intended to be associated with an
+ * individual workgroup
+ */
+class LdsChunk
+{
+ public:
+ LdsChunk(const uint32_t x_size):
+ chunk(x_size)
+ {
+ }
+
+ LdsChunk() {}
+
+ /**
+ * a read operation
+ */
+ template<class T>
+ T
+ read(const uint32_t index)
+ {
+ fatal_if(!chunk.size(), "cannot read from an LDS chunk of size 0");
+ fatal_if(index >= chunk.size(), "out-of-bounds access to an LDS chunk");
+ T *p0 = (T *) (&(chunk.at(index)));
+ return *p0;
+ }
+
+ /**
+ * a write operation
+ */
+ template<class T>
+ void
+ write(const uint32_t index, const T value)
+ {
+ fatal_if(!chunk.size(), "cannot write to an LDS chunk of size 0");
+ fatal_if(index >= chunk.size(), "out-of-bounds access to an LDS chunk");
+ T *p0 = (T *) (&(chunk.at(index)));
+ *p0 = value;
+ }
+
+ /**
+ * get the size of this chunk
+ */
+ std::vector<uint8_t>::size_type
+ size() const
+ {
+ return chunk.size();
+ }
+
+ protected:
+ // the actual data store for this slice of the LDS
+ std::vector<uint8_t> chunk;
+};
+
+// Local Data Share (LDS) State per Wavefront (contents of the LDS region
+// allocated to the WorkGroup of this Wavefront)
+class LdsState: public MemObject
+{
+ protected:
+
+ /**
+ * an event to allow event-driven execution
+ */
+ class TickEvent: public Event
+ {
+ protected:
+
+ LdsState *ldsState = nullptr;
+
+ Tick nextTick = 0;
+
+ public:
+
+ TickEvent(LdsState *_ldsState) :
+ ldsState(_ldsState)
+ {
+ }
+
+ virtual void
+ process();
+
+ void
+ schedule(Tick when)
+ {
+ mainEventQueue[0]->schedule(this, when);
+ }
+
+ void
+ deschedule()
+ {
+ mainEventQueue[0]->deschedule(this);
+ }
+ };
+
+ /**
+ * CuSidePort is the LDS Port closer to the CU side
+ */
+ class CuSidePort: public SlavePort
+ {
+ public:
+ CuSidePort(const std::string &_name, LdsState *_ownerLds) :
+ SlavePort(_name, _ownerLds), ownerLds(_ownerLds)
+ {
+ }
+
+ protected:
+ LdsState *ownerLds;
+
+ virtual bool
+ recvTimingReq(PacketPtr pkt);
+
+ virtual Tick
+ recvAtomic(PacketPtr pkt)
+ {
+ return 0;
+ }
+
+ virtual void
+ recvFunctional(PacketPtr pkt);
+
+ virtual void
+ recvRangeChange()
+ {
+ }
+
+ virtual void
+ recvRetry();
+
+ virtual void
+ recvRespRetry();
+
+ virtual AddrRangeList
+ getAddrRanges() const
+ {
+ AddrRangeList ranges;
+ ranges.push_back(ownerLds->getAddrRange());
+ return ranges;
+ }
+
+ template<typename T>
+ void
+ loadData(PacketPtr packet);
+
+ template<typename T>
+ void
+ storeData(PacketPtr packet);
+
+ template<typename T>
+ void
+ atomicOperation(PacketPtr packet);
+ };
+
+ protected:
+
+    // the LDS reference counter
+    // The outer key is the dispatch ID and the inner key is the workgroup ID
+    // The value is the number of wavefronts that reference this LDS chunk. As
+    // wavefronts are launched, the counter goes up for that workgroup, and
+    // when they return it decreases. Once it reaches 0 this chunk of the LDS
+    // is returned to the available pool. However, it is deallocated on the
+    // 1->0 transition, not whenever the counter is 0, since it always starts
+    // at 0 when the workgroup first asks for space
+ std::unordered_map<uint32_t,
+ std::unordered_map<uint32_t, int32_t>> refCounter;
+
+ // the map that allows workgroups to access their own chunk of the LDS
+ std::unordered_map<uint32_t,
+ std::unordered_map<uint32_t, LdsChunk>> chunkMap;
+
+ // an event to allow the LDS to wake up at a specified time
+ TickEvent tickEvent;
+
+ // the queue of packets that are going back to the CU after a
+ // read/write/atomic op
+ // TODO need to make this have a maximum size to create flow control
+ std::queue<std::pair<Tick, PacketPtr>> returnQueue;
+
+ // whether or not there are pending responses
+ bool retryResp = false;
+
+ bool
+ process();
+
+ GPUDynInstPtr
+ getDynInstr(PacketPtr packet);
+
+ bool
+ processPacket(PacketPtr packet);
+
+ unsigned
+ countBankConflicts(PacketPtr packet, unsigned *bankAccesses);
+
+ unsigned
+ countBankConflicts(GPUDynInstPtr gpuDynInst,
+ unsigned *numBankAccesses);
+
+ public:
+ typedef LdsStateParams Params;
+
+ LdsState(const Params *params);
+
+ // prevent copy construction
+ LdsState(const LdsState&) = delete;
+
+ ~LdsState()
+ {
+ parent = nullptr;
+ }
+
+ const Params *
+ params() const
+ {
+ return dynamic_cast<const Params *>(_params);
+ }
+
+ bool
+ isRetryResp() const
+ {
+ return retryResp;
+ }
+
+ void
+ setRetryResp(const bool value)
+ {
+ retryResp = value;
+ }
+
+ // prevent assignment
+ LdsState &
+ operator=(const LdsState &) = delete;
+
+ /**
+ * use the dynamic wave id to create or just increase the reference count
+ */
+ int
+ increaseRefCounter(const uint32_t dispatchId, const uint32_t wgId)
+ {
+ int refCount = getRefCounter(dispatchId, wgId);
+ fatal_if(refCount < 0,
+ "reference count should not be below zero");
+ return ++refCounter[dispatchId][wgId];
+ }
+
+ /**
+ * decrease the reference count after making sure it is in the list
+ * give back this chunk if the ref counter has reached 0
+ */
+ int
+ decreaseRefCounter(const uint32_t dispatchId, const uint32_t wgId)
+ {
+ int refCount = getRefCounter(dispatchId, wgId);
+
+ fatal_if(refCount <= 0,
+ "reference count should not be below zero or at zero to"
+ "decrement");
+
+ refCounter[dispatchId][wgId]--;
+
+ if (refCounter[dispatchId][wgId] == 0) {
+ releaseSpace(dispatchId, wgId);
+ return 0;
+ } else {
+ return refCounter[dispatchId][wgId];
+ }
+ }
+
+ /**
+ * return the current reference count for this workgroup id
+ */
+ int
+ getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const
+ {
+ auto dispatchIter = chunkMap.find(dispatchId);
+ fatal_if(dispatchIter == chunkMap.end(),
+ "could not locate this dispatch id [%d]", dispatchId);
+
+ auto workgroup = dispatchIter->second.find(wgId);
+ fatal_if(workgroup == dispatchIter->second.end(),
+ "could not find this workgroup id within this dispatch id"
+ " did[%d] wgid[%d]", dispatchId, wgId);
+
+ auto refCountIter = refCounter.find(dispatchId);
+ if (refCountIter == refCounter.end()) {
+ fatal("could not locate this dispatch id [%d]", dispatchId);
+ } else {
+ auto workgroup = refCountIter->second.find(wgId);
+ if (workgroup == refCountIter->second.end()) {
+ fatal("could not find this workgroup id within this dispatch id"
+ " did[%d] wgid[%d]", dispatchId, wgId);
+ } else {
+ return refCounter.at(dispatchId).at(wgId);
+ }
+ }
+
+ fatal("should not reach this point");
+ return 0;
+ }
+
+ /**
+ * assign a parent and request this amount of space be set aside
+ * for this wgid
+ */
+ LdsChunk *
+ reserveSpace(const uint32_t dispatchId, const uint32_t wgId,
+ const uint32_t size)
+ {
+ if (chunkMap.find(dispatchId) != chunkMap.end()) {
+ fatal_if(
+ chunkMap[dispatchId].find(wgId) != chunkMap[dispatchId].end(),
+ "duplicate workgroup ID asking for space in the LDS "
+ "did[%d] wgid[%d]", dispatchId, wgId);
+ }
+
+ fatal_if(bytesAllocated + size > maximumSize,
+ "request would ask for more space than is available");
+
+ bytesAllocated += size;
+
+ chunkMap[dispatchId].emplace(wgId, LdsChunk(size));
+ // make an entry for this workgroup
+ refCounter[dispatchId][wgId] = 0;
+
+ return &chunkMap[dispatchId][wgId];
+ }
+
+ bool
+ returnQueuePush(std::pair<Tick, PacketPtr> thePair);
+
+ Tick
+ earliestReturnTime() const
+ {
+ // TODO set to max(lastCommand+1, curTick())
+ return returnQueue.empty() ? curTick() : returnQueue.back().first;
+ }
+
+ void
+ setParent(ComputeUnit *x_parent);
+
+ void
+ regStats();
+
+ // accessors
+ ComputeUnit *
+ getParent() const
+ {
+ return parent;
+ }
+
+ std::string
+ getName()
+ {
+ return _name;
+ }
+
+ int
+ getBanks() const
+ {
+ return banks;
+ }
+
+ ComputeUnit *
+ getComputeUnit() const
+ {
+ return parent;
+ }
+
+ int
+ getBankConflictPenalty() const
+ {
+ return bankConflictPenalty;
+ }
+
+ /**
+ * get the allocated size for this workgroup
+ */
+ std::size_t
+ ldsSize(const uint32_t x_wgId)
+ {
+ return chunkMap[x_wgId].size();
+ }
+
+ AddrRange
+ getAddrRange() const
+ {
+ return range;
+ }
+
+ virtual BaseSlavePort &
+ getSlavePort(const std::string& if_name, PortID idx)
+ {
+ if (if_name == "cuPort") {
+ // TODO need to set name dynamically at this point?
+ return cuPort;
+ } else {
+ fatal("cannot resolve the port name " + if_name);
+ }
+ }
+
+ /**
+ * can this much space be reserved for a workgroup?
+ */
+ bool
+ canReserve(uint32_t x_size) const
+ {
+ return bytesAllocated + x_size <= maximumSize;
+ }
+
+ private:
+ /**
+ * give back the space
+ */
+ bool
+ releaseSpace(const uint32_t x_dispatchId, const uint32_t x_wgId)
+ {
+ auto dispatchIter = chunkMap.find(x_dispatchId);
+
+ if (dispatchIter == chunkMap.end()) {
+ fatal("dispatch id not found [%d]", x_dispatchId);
+ } else {
+ auto workgroupIter = dispatchIter->second.find(x_wgId);
+ if (workgroupIter == dispatchIter->second.end()) {
+ fatal("workgroup id [%d] not found in dispatch id [%d]",
+ x_wgId, x_dispatchId);
+ }
+ }
+
+ fatal_if(bytesAllocated < chunkMap[x_dispatchId][x_wgId].size(),
+ "releasing more space than was allocated");
+
+ bytesAllocated -= chunkMap[x_dispatchId][x_wgId].size();
+ chunkMap[x_dispatchId].erase(chunkMap[x_dispatchId].find(x_wgId));
+ return true;
+ }
+
+ // the port that connects this LDS to its owner CU
+ CuSidePort cuPort;
+
+ ComputeUnit* parent = nullptr;
+
+ std::string _name;
+
+ // the number of bytes currently reserved by all workgroups
+ int bytesAllocated = 0;
+
+ // the size of the LDS, the most bytes available
+ int maximumSize;
+
+ // Address range of this memory
+ AddrRange range;
+
+ // the penalty, in cycles, for each LDS bank conflict
+ int bankConflictPenalty = 0;
+
+ // the number of banks in the LDS underlying data store
+ int banks = 0;
+};
+
+#endif // __LDS_STATE_HH__
diff --git a/src/gpu-compute/local_memory_pipeline.cc b/src/gpu-compute/local_memory_pipeline.cc
new file mode 100644
index 000000000..7f919c5f4
--- /dev/null
+++ b/src/gpu-compute/local_memory_pipeline.cc
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#include "gpu-compute/local_memory_pipeline.hh"
+
+#include "debug/GPUPort.hh"
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/shader.hh"
+#include "gpu-compute/vector_register_file.hh"
+#include "gpu-compute/wavefront.hh"
+
+LocalMemPipeline::LocalMemPipeline(const ComputeUnitParams* p) :
+ computeUnit(nullptr), lmQueueSize(p->local_mem_queue_size)
+{
+}
+
+void
+LocalMemPipeline::init(ComputeUnit *cu)
+{
+ computeUnit = cu;
+ _name = computeUnit->name() + ".LocalMemPipeline";
+}
+
+void
+LocalMemPipeline::exec()
+{
+ // apply any returned shared (LDS) memory operations
+ GPUDynInstPtr m = !lmReturnedRequests.empty() ?
+ lmReturnedRequests.front() : nullptr;
+
+ bool accessVrf = true;
+    if (m && (m->m_op == Enums::MO_LD || MO_A(m->m_op))) {
+ Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId];
+
+ accessVrf =
+ w->computeUnit->vrf[m->simdId]->
+ vrfOperandAccessReady(m->seqNum(), w, m,
+ VrfAccessType::WRITE);
+ }
+
+ if (!lmReturnedRequests.empty() && m->latency.rdy() && accessVrf &&
+ computeUnit->locMemToVrfBus.rdy() && (computeUnit->shader->coissue_return
+ || computeUnit->wfWait.at(m->pipeId).rdy())) {
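+        // dispatch the write-back based on the destination register width
+        // (v_type) and the memory data type (m_type) of the returned request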
+ if (m->v_type == VT_32 && m->m_type == Enums::M_U8)
+ doSmReturn<uint32_t, uint8_t>(m);
+ else if (m->v_type == VT_32 && m->m_type == Enums::M_U16)
+ doSmReturn<uint32_t, uint16_t>(m);
+ else if (m->v_type == VT_32 && m->m_type == Enums::M_U32)
+ doSmReturn<uint32_t, uint32_t>(m);
+ else if (m->v_type == VT_32 && m->m_type == Enums::M_S8)
+ doSmReturn<int32_t, int8_t>(m);
+ else if (m->v_type == VT_32 && m->m_type == Enums::M_S16)
+ doSmReturn<int32_t, int16_t>(m);
+ else if (m->v_type == VT_32 && m->m_type == Enums::M_S32)
+ doSmReturn<int32_t, int32_t>(m);
+ else if (m->v_type == VT_32 && m->m_type == Enums::M_F16)
+ doSmReturn<float, Float16>(m);
+ else if (m->v_type == VT_32 && m->m_type == Enums::M_F32)
+ doSmReturn<float, float>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_U8)
+ doSmReturn<uint64_t, uint8_t>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_U16)
+ doSmReturn<uint64_t, uint16_t>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_U32)
+ doSmReturn<uint64_t, uint32_t>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_U64)
+ doSmReturn<uint64_t, uint64_t>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_S8)
+ doSmReturn<int64_t, int8_t>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_S16)
+ doSmReturn<int64_t, int16_t>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_S32)
+ doSmReturn<int64_t, int32_t>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_S64)
+ doSmReturn<int64_t, int64_t>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_F16)
+ doSmReturn<double, Float16>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_F32)
+ doSmReturn<double, float>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_F64)
+ doSmReturn<double, double>(m);
+ }
+
+ // If pipeline has executed a local memory instruction
+ // execute local memory packet and issue the packets
+ // to LDS
+ if (!lmIssuedRequests.empty() && lmReturnedRequests.size() < lmQueueSize) {
+
+ GPUDynInstPtr m = lmIssuedRequests.front();
+
+ bool returnVal = computeUnit->sendToLds(m);
+ if (!returnVal) {
+            DPRINTF(GPUPort, "packet was nack'd and put in retry queue\n");
+ }
+ lmIssuedRequests.pop();
+ }
+}
+
+template<typename c0, typename c1>
+void
+LocalMemPipeline::doSmReturn(GPUDynInstPtr m)
+{
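+    // Retire one returned LDS request: write load/atomic return data back to
+    // the wavefront's physical VGPRs, decrement the outstanding request
+    // counters, and mark the LDS-to-VRF bus busy for the write-back.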
+ lmReturnedRequests.pop();
+ Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId];
+
+ // Return data to registers
+ if (m->m_op == Enums::MO_LD || MO_A(m->m_op)) {
+ std::vector<uint32_t> regVec;
+ for (int k = 0; k < m->n_reg; ++k) {
+ int dst = m->dst_reg+k;
+
+ if (m->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST)
+ dst = m->dst_reg_vec[k];
+ // virtual->physical VGPR mapping
+            int physVgpr = w->remap(dst, sizeof(c0), 1);
+ // save the physical VGPR index
+ regVec.push_back(physVgpr);
+ c1 *p1 = &((c1*)m->d_data)[k * VSZ];
+
+ for (int i = 0; i < VSZ; ++i) {
+ if (m->exec_mask[i]) {
+ // write the value into the physical VGPR. This is a purely
+ // functional operation. No timing is modeled.
+ w->computeUnit->vrf[w->simdId]->write<c0>(physVgpr,
+ *p1, i);
+ }
+ ++p1;
+ }
+ }
+
+ // Schedule the write operation of the load data on the VRF. This simply
+ // models the timing aspect of the VRF write operation. It does not
+ // modify the physical VGPR.
+ loadVrfBankConflictCycles +=
+ w->computeUnit->vrf[w->simdId]->exec(m->seqNum(), w,
+ regVec, sizeof(c0), m->time);
+ }
+
+ // Decrement outstanding request count
+ computeUnit->shader->ScheduleAdd(&w->outstanding_reqs, m->time, -1);
+
+ if (m->m_op == Enums::MO_ST || MO_A(m->m_op) || MO_ANR(m->m_op)
+ || MO_H(m->m_op)) {
+ computeUnit->shader->ScheduleAdd(&w->outstanding_reqs_wr_lm,
+ m->time, -1);
+ }
+
+ if (m->m_op == Enums::MO_LD || MO_A(m->m_op) || MO_ANR(m->m_op)) {
+ computeUnit->shader->ScheduleAdd(&w->outstanding_reqs_rd_lm,
+ m->time, -1);
+ }
+
+ // Mark write bus busy for appropriate amount of time
+ computeUnit->locMemToVrfBus.set(m->time);
+ if (computeUnit->shader->coissue_return == 0)
+ w->computeUnit->wfWait.at(m->pipeId).set(m->time);
+}
+
+void
+LocalMemPipeline::regStats()
+{
+ loadVrfBankConflictCycles
+ .name(name() + ".load_vrf_bank_conflict_cycles")
+ .desc("total number of cycles LDS data are delayed before updating "
+ "the VRF")
+ ;
+}
diff --git a/src/gpu-compute/local_memory_pipeline.hh b/src/gpu-compute/local_memory_pipeline.hh
new file mode 100644
index 000000000..a63d867d0
--- /dev/null
+++ b/src/gpu-compute/local_memory_pipeline.hh
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#ifndef __LOCAL_MEMORY_PIPELINE_HH__
+#define __LOCAL_MEMORY_PIPELINE_HH__
+
+#include <queue>
+#include <string>
+
+#include "gpu-compute/misc.hh"
+#include "params/ComputeUnit.hh"
+#include "sim/stats.hh"
+
+/*
+ * @file local_memory_pipeline.hh
+ *
+ * The local memory pipeline issues newly created local memory packets
+ * from the pipeline to the LDS. This stage also retires previously issued
+ * loads and stores that have returned from the LDS.
+ */
+
+class ComputeUnit;
+class Wavefront;
+
+class LocalMemPipeline
+{
+ public:
+ LocalMemPipeline(const ComputeUnitParams *params);
+ void init(ComputeUnit *cu);
+ void exec();
+
+ template<typename c0, typename c1> void doSmReturn(GPUDynInstPtr m);
+
+ std::queue<GPUDynInstPtr> &getLMReqFIFO() { return lmIssuedRequests; }
+ std::queue<GPUDynInstPtr> &getLMRespFIFO() { return lmReturnedRequests; }
+
+ bool
+ isLMRespFIFOWrRdy() const
+ {
+ return lmReturnedRequests.size() < lmQueueSize;
+ }
+
+ bool
+ isLMReqFIFOWrRdy(uint32_t pendReqs=0) const
+ {
+ return (lmIssuedRequests.size() + pendReqs) < lmQueueSize;
+ }
+
+ const std::string& name() const { return _name; }
+ void regStats();
+
+ private:
+ ComputeUnit *computeUnit;
+ std::string _name;
+ int lmQueueSize;
+ Stats::Scalar loadVrfBankConflictCycles;
+ // Local Memory Request Fifo: all shared memory requests
+ // are issued to this FIFO from the memory pipelines
+ std::queue<GPUDynInstPtr> lmIssuedRequests;
+
+ // Local Memory Response Fifo: all responses of shared memory
+ // requests are sent to this FIFO from LDS
+ std::queue<GPUDynInstPtr> lmReturnedRequests;
+};
+
+#endif // __LOCAL_MEMORY_PIPELINE_HH__
diff --git a/src/gpu-compute/misc.hh b/src/gpu-compute/misc.hh
new file mode 100644
index 000000000..4f8032832
--- /dev/null
+++ b/src/gpu-compute/misc.hh
@@ -0,0 +1,162 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#ifndef __MISC_HH__
+#define __MISC_HH__
+
+#include <bitset>
+#include <memory>
+
+#include "base/misc.hh"
+
+class GPUDynInst;
+
+// wavefront size of the machine
+static const int VSZ = 64;
+
+/*
+ This check is necessary because std::bitset only provides conversion to
+    unsigned long or unsigned long long via to_ulong() or to_ullong(). There
+    are a few places in the code where to_ullong() is used, and if VSZ is
+    larger than what the host can support then bitset will throw a runtime
+    exception.
+
+    We should remove all use of to_ulong() and to_ullong() so we can have VSZ
+    greater than 64 bits; however, until that is done this assert is required.
+ */
+static_assert(VSZ <= sizeof(unsigned long long) * 8,
+ "VSZ is larger than the host can support");
+
+typedef std::bitset<VSZ> VectorMask;
+typedef std::shared_ptr<GPUDynInst> GPUDynInstPtr;
+
+class WaitClass
+{
+ public:
+ WaitClass() : nxtAvail(0), lookAheadAvail(0), tcnt(0) { }
+ void init(uint64_t *_tcnt, uint32_t _numStages=0)
+ {
+ tcnt = _tcnt;
+ numStages = _numStages;
+ }
+
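+    // mark the resource busy for the next i cycles; rdy() will not return
+    // true again until the shared timestamp reaches nxtAvail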
+ void set(uint32_t i)
+ {
+ fatal_if(nxtAvail > *tcnt,
+ "Can't allocate resource because it is busy!!!");
+ nxtAvail = *tcnt + i;
+ }
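+    // record an upcoming use of the resource seen numStages ahead of the
+    // stage that will actually set() it; prerdy() accounts for this look-ahead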
+ void preset(uint32_t delay)
+ {
+ lookAheadAvail = std::max(lookAheadAvail, delay + (*tcnt) - numStages);
+ }
+ bool rdy() const { return *tcnt >= nxtAvail; }
+ bool prerdy() const { return *tcnt >= lookAheadAvail; }
+
+ private:
+ // timestamp indicating when resource will be available
+ uint64_t nxtAvail;
+    // timestamp indicating when the resource will be available, including
+    // pending uses of the resource (when there is a cycle gap between
+    // rdy() and set())
+ uint64_t lookAheadAvail;
+ // current timestamp
+ uint64_t *tcnt;
+ // number of stages between checking if a resource is ready and
+ // setting the resource's utilization
+ uint32_t numStages;
+};
+
+class Float16
+{
+ public:
+ uint16_t val;
+
+ Float16() { val = 0; }
+
+ Float16(const Float16 &x) : val(x.val) { }
+
+ Float16(float x)
+ {
+ uint32_t ai = *(uint32_t *)&x;
+
+ uint32_t s = (ai >> 31) & 0x1;
+ uint32_t exp = (ai >> 23) & 0xff;
+ uint32_t mant = (ai >> 0) & 0x7fffff;
+
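+        // Convert the FP32 exponent (bias 127) to FP16 (bias 15). Values too
+        // small for an FP16 normal (biased exp <= 0x70) flush to zero, 0xff
+        // (inf/NaN) maps to 0x1f, and exponents >= 0x8f overflow to infinity;
+        // FP16 denormals are not generated.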
+ if (exp == 0 || exp <= 0x70) {
+ exp = 0;
+ mant = 0;
+ } else if (exp == 0xff) {
+ exp = 0x1f;
+ } else if (exp >= 0x8f) {
+ exp = 0x1f;
+ mant = 0;
+ } else {
+ exp = exp - 0x7f + 0x0f;
+ }
+
+ mant = mant >> 13;
+
+ val = 0;
+ val |= (s << 15);
+ val |= (exp << 10);
+ val |= (mant << 0);
+ }
+
+ operator float() const
+ {
+ uint32_t s = (val >> 15) & 0x1;
+ uint32_t exp = (val >> 10) & 0x1f;
+ uint32_t mant = (val >> 0) & 0x3ff;
+
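+        // expand the FP16 exponent back to FP32: zero/denormal inputs become
+        // zero, 0x1f maps to the FP32 inf/NaN exponent, and normal exponents
+        // are rebiased from 15 to 127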
+ if (!exp) {
+ exp = 0;
+ mant = 0;
+ } else if (exp == 0x1f) {
+ exp = 0xff;
+ } else {
+ exp = exp - 0x0f + 0x7f;
+ }
+
+ uint32_t val1 = 0;
+ val1 |= (s << 31);
+ val1 |= (exp << 23);
+ val1 |= (mant << 13);
+
+ return *(float*)&val1;
+ }
+};
+
+#endif // __MISC_HH__
diff --git a/src/gpu-compute/ndrange.hh b/src/gpu-compute/ndrange.hh
new file mode 100644
index 000000000..d1ad35d4b
--- /dev/null
+++ b/src/gpu-compute/ndrange.hh
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#ifndef __NDRANGE_HH__
+#define __NDRANGE_HH__
+
+#include "base/types.hh"
+#include "gpu-compute/qstruct.hh"
+
+struct NDRange
+{
+ // copy of the queue entry provided at dispatch
+ HsaQueueEntry q;
+
+ // The current workgroup id (3 dimensions)
+ int wgId[3];
+ // The number of workgroups in each dimension
+ int numWg[3];
+ // The total number of workgroups
+ int numWgTotal;
+
+ // The number of completed work groups
+ int numWgCompleted;
+ // The global workgroup ID
+ uint32_t globalWgId;
+
+ // flag indicating whether all work groups have been launched
+ bool wg_disp_rem;
+ // kernel complete
+ bool execDone;
+ bool userDoorBellSet;
+ volatile bool *addrToNotify;
+ volatile uint32_t *numDispLeft;
+ int dispatchId;
+ int curTid; // Current thread id
+};
+
+#endif // __NDRANGE_HH__
diff --git a/src/gpu-compute/of_scheduling_policy.cc b/src/gpu-compute/of_scheduling_policy.cc
new file mode 100644
index 000000000..7f114706a
--- /dev/null
+++ b/src/gpu-compute/of_scheduling_policy.cc
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#include "gpu-compute/of_scheduling_policy.hh"
+
+#include "gpu-compute/wavefront.hh"
+
+Wavefront*
+OFSchedulingPolicy::chooseWave()
+{
+    // Set when the policy chooses a wave to schedule
+ bool waveChosen = false;
+ Wavefront *selectedWave = nullptr;
+ int selectedWaveID = -1;
+ uint32_t selectedPosition = 0;
+
+ for (int position = 0; position < scheduleList->size(); ++position) {
+ Wavefront *curWave = scheduleList->at(position);
+ uint32_t curWaveID = curWave->wfDynId;
+
+        // Choose the wave with the lowest wave ID
+ if (selectedWaveID == -1 || curWaveID < selectedWaveID) {
+ waveChosen = true;
+ selectedWaveID = curWaveID;
+ selectedWave = curWave;
+ selectedPosition = position;
+ }
+ }
+
+    // Check to make sure the ready list had at least one schedulable wave
+ if (waveChosen) {
+ scheduleList->erase(scheduleList->begin() + selectedPosition);
+ } else {
+ panic("Empty ready list");
+ }
+
+ return selectedWave;
+}
+
+void
+OFSchedulingPolicy::bindList(std::vector<Wavefront*> *list)
+{
+ scheduleList = list;
+}
diff --git a/src/gpu-compute/of_scheduling_policy.hh b/src/gpu-compute/of_scheduling_policy.hh
new file mode 100644
index 000000000..684e51a3a
--- /dev/null
+++ b/src/gpu-compute/of_scheduling_policy.hh
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#ifndef __OF_SCHEDULING_POLICY_HH__
+#define __OF_SCHEDULING_POLICY_HH__
+
+#include <cstddef>
+#include <vector>
+
+#include "base/misc.hh"
+
+class Wavefront;
+
+// Oldest First where age is marked by the wave id
+class OFSchedulingPolicy
+{
+ public:
+ OFSchedulingPolicy() : scheduleList(nullptr) { }
+
+ Wavefront* chooseWave();
+ void bindList(std::vector<Wavefront*> *list);
+
+ private:
+ // List of waves which are participating in scheduling.
+ // This scheduler selects the oldest wave from this list
+ std::vector<Wavefront*> *scheduleList;
+};
+
+#endif // __OF_SCHEDULING_POLICY_HH__
diff --git a/src/gpu-compute/pool_manager.cc b/src/gpu-compute/pool_manager.cc
new file mode 100644
index 000000000..b1bc6b1f3
--- /dev/null
+++ b/src/gpu-compute/pool_manager.cc
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos
+ */
+
+#include "gpu-compute/pool_manager.hh"
+
+PoolManager::PoolManager(uint32_t minAlloc, uint32_t poolSize)
+ : _minAllocation(minAlloc), _poolSize(poolSize)
+{
+ assert(poolSize > 0);
+}
diff --git a/src/gpu-compute/pool_manager.hh b/src/gpu-compute/pool_manager.hh
new file mode 100644
index 000000000..2cb53ce72
--- /dev/null
+++ b/src/gpu-compute/pool_manager.hh
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos
+ */
+
+#ifndef __POOL_MANAGER_HH__
+#define __POOL_MANAGER_HH__
+
+#include <cassert>
+#include <cstdint>
+#include <string>
+
+// Pool Manager Logic
+class PoolManager
+{
+ public:
+ PoolManager(uint32_t minAlloc, uint32_t poolSize);
+ uint32_t minAllocation() { return _minAllocation; }
+ virtual std::string printRegion() = 0;
+ virtual uint32_t regionSize(std::pair<uint32_t,uint32_t> &region) = 0;
+ virtual bool canAllocate(uint32_t numRegions, uint32_t size) = 0;
+
+ virtual uint32_t allocateRegion(const uint32_t size,
+ uint32_t *reserved) = 0;
+
+ virtual void freeRegion(uint32_t firstIdx, uint32_t lastIdx) = 0;
+ uint32_t poolSize() { return _poolSize; }
+
+ private:
+ // minimum size that can be reserved per allocation
+ uint32_t _minAllocation;
+ // pool size in number of elements
+ uint32_t _poolSize;
+};
+
+#endif // __POOL_MANAGER_HH__
diff --git a/src/gpu-compute/qstruct.hh b/src/gpu-compute/qstruct.hh
new file mode 100644
index 000000000..092303c00
--- /dev/null
+++ b/src/gpu-compute/qstruct.hh
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Brad Beckmann, Marc Orr
+ */
+
+#ifndef __Q_STRUCT_HH__
+#define __Q_STRUCT_HH__
+
+#include <bitset>
+#include <cstdint>
+
+// Maximum number of arguments
+static const int KER_NUM_ARGS = 32;
+// Kernel argument buffer size
+static const int KER_ARGS_LENGTH = 512;
+
+class LdsChunk;
+struct NDRange;
+
+// Be very careful of alignment in this structure. The structure
+// must compile to the same layout in both 32-bit and 64-bit mode.
+struct HsaQueueEntry
+{
+ // Base pointer for array of instruction pointers
+ uint64_t code_ptr;
+ // Grid Size (3 dimensions)
+ uint32_t gdSize[3];
+ // Workgroup Size (3 dimensions)
+ uint32_t wgSize[3];
+ uint16_t sRegCount;
+ uint16_t dRegCount;
+ uint16_t cRegCount;
+ uint64_t privMemStart;
+ uint32_t privMemPerItem;
+ uint32_t privMemTotal;
+ uint64_t spillMemStart;
+ uint32_t spillMemPerItem;
+ uint32_t spillMemTotal;
+ uint64_t roMemStart;
+ uint32_t roMemTotal;
+ // Size (in bytes) of LDS
+ uint32_t ldsSize;
+ // Virtual Memory Id (unused right now)
+ uint32_t vmId;
+
+ // Pointer to dependency chain (unused now)
+ uint64_t depends;
+
+ // pointer to bool
+ uint64_t addrToNotify;
+ // pointer to uint32_t
+ uint64_t numDispLeft;
+
+ // variables to pass arguments when running in standalone mode,
+ // will be removed when run.py and sh.cpp have been updated to
+ // use args and offset arrays
+ uint64_t arg1;
+ uint64_t arg2;
+ uint64_t arg3;
+ uint64_t arg4;
+
+ // variables to pass arguments when running in cpu+gpu mode
+ uint8_t args[KER_ARGS_LENGTH];
+ uint16_t offsets[KER_NUM_ARGS];
+ uint16_t num_args;
+};
+
+// State used to start (or restart) a WF
+struct WFContext
+{
+ // 32 bit values
+ // barrier state
+ int bar_cnt[VSZ];
+
+ // id (which WF in the WG)
+ int cnt;
+
+ // more barrier state
+ int max_bar_cnt;
+ int old_barrier_cnt;
+ int barrier_cnt;
+
+ // More Program Counter Stuff
+ uint32_t pc;
+
+ // Program counter of the immediate post-dominator instruction
+ uint32_t rpc;
+
+ // WG wide state (I don't see how to avoid redundancy here)
+ int cu_id;
+ uint32_t wg_id;
+ uint32_t barrier_id;
+
+ // 64 bit values (these values depend on the wavefront size)
+ // masks
+ uint64_t init_mask;
+ uint64_t exec_mask;
+
+ // private memory;
+ Addr privBase;
+ Addr spillBase;
+
+ LdsChunk *ldsChunk;
+
+ /*
+ * Kernel wide state
+ * This is a hack. This state should be moved through simulated memory
+     * during a yield. Though not much is being used here, so it's probably
+     * not a big deal.
+ *
+ * Just to add to this comment... The ndr is derived from simulated
+ * memory when the cl-runtime allocates an HsaQueueEntry and populates it
+ * for a kernel launch. So in theory the runtime should be able to keep
+ * that state around. Then a WF can reference it upon restart to derive
+ * kernel wide state. The runtime can deallocate the state when the
+ * kernel completes.
+ */
+ NDRange *ndr;
+};
+
+// State that needs to be passed between the simulation and simulated app, a
+// pointer to this struct can be passed through the depends field in the
+// HsaQueueEntry struct
+struct HostState
+{
+ // cl_event* has original HsaQueueEntry for init
+ uint64_t event;
+};
+
+// Total number of HSA queues
+static const int HSAQ_NQUEUES = 8;
+
+// These values will eventually live in memory mapped registers
+// and be settable by the kernel mode driver.
+
+// Number of entries in each HSA queue
+static const int HSAQ_SIZE = 64;
+// Address of first HSA queue index
+static const int HSAQ_INDX_BASE = 0x10000ll;
+// Address of first HSA queue
+static const int HSAQ_BASE = 0x11000ll;
+// Suggested start of HSA code
+static const int HSA_CODE_BASE = 0x18000ll;
+
+// These are shortcuts for deriving the address of a specific
+// HSA queue or queue index
+#define HSAQ(n) (HSAQ_BASE + HSAQ_SIZE * sizeof(struct fsaQueue) * n)
+#define HSAQE(n,i) (HSAQ_BASE + (HSAQ_SIZE * n + i) * sizeof(struct fsaQueue))
+#define HSAQ_RI(n) (HSAQ_INDX_BASE + sizeof(int) * (n * 3 + 0))
+#define HSAQ_WI(n) (HSAQ_INDX_BASE + sizeof(int) * (n * 3 + 1))
+#define HSAQ_CI(n) (HSAQ_INDX_BASE + sizeof(int) * (n * 3 + 2))
+
+/*
+ * Example code for writing to a queue
+ *
+ * void
+ * ToQueue(int n,struct fsaQueue *val)
+ * {
+ * int wi = *(int*)HSAQ_WI(n);
+ * int ri = *(int*)HSAQ_RI(n);
+ * int ci = *(int*)HSAQ_CI(n);
+ *
+ * if (ci - ri < HSAQ_SIZE) {
+ * (*(int*)HSAQ_CI(n))++;
+ * *(HsaQueueEntry*)(HSAQE(n, (wi % HSAQ_SIZE))) = *val;
+ * (*(int*)HSAQ_WI(n))++;
+ * }
+ * }
+ */
+
+#endif // __Q_STRUCT_HH__
diff --git a/src/gpu-compute/rr_scheduling_policy.cc b/src/gpu-compute/rr_scheduling_policy.cc
new file mode 100644
index 000000000..5d3591901
--- /dev/null
+++ b/src/gpu-compute/rr_scheduling_policy.cc
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#include "gpu-compute/rr_scheduling_policy.hh"
+
+#include "gpu-compute/wavefront.hh"
+
+Wavefront*
+RRSchedulingPolicy::chooseWave()
+{
+ Wavefront *selectedWave = nullptr;
+
+    // Check to make sure the ready list had at least one schedulable wave
+ if (scheduleList->size()) {
+ // For RR policy, select the wave which is at the
+ // front of the list. The selected wave is popped
+ // out from the schedule list immediately after selection
+ // to avoid starvation. It is the responsibility of the
+        // module invoking the RR scheduler to make sure scheduling-
+        // eligible waves are added to the back of the schedule
+ // list
+ selectedWave = scheduleList->front();
+ scheduleList->erase(scheduleList->begin() + 0);
+ } else {
+ panic("Empty ready list");
+ }
+
+ return selectedWave;
+}
+
+void
+RRSchedulingPolicy::bindList(std::vector<Wavefront*> *list)
+{
+ scheduleList = list;
+}
diff --git a/src/gpu-compute/rr_scheduling_policy.hh b/src/gpu-compute/rr_scheduling_policy.hh
new file mode 100644
index 000000000..780f294aa
--- /dev/null
+++ b/src/gpu-compute/rr_scheduling_policy.hh
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#ifndef __RR_SCHEDULING_POLICY_HH__
+#define __RR_SCHEDULING_POLICY_HH__
+
+#include <inttypes.h>
+
+#include <cstddef>
+#include <utility>
+#include <vector>
+
+#include "base/misc.hh"
+
+class Wavefront;
+
+// Round-Robin pick among the list of ready waves
+class RRSchedulingPolicy
+{
+ public:
+ RRSchedulingPolicy() : scheduleList(nullptr) { }
+
+ Wavefront* chooseWave();
+ void bindList(std::vector<Wavefront*> *list);
+
+ private:
+ // List of waves which are participating in scheduling.
+ // This scheduler selects one wave from this list based on
+ // round robin policy
+ std::vector<Wavefront*> *scheduleList;
+};
+
+#endif // __RR_SCHEDULING_POLICY_HH__
diff --git a/src/gpu-compute/schedule_stage.cc b/src/gpu-compute/schedule_stage.cc
new file mode 100644
index 000000000..068136026
--- /dev/null
+++ b/src/gpu-compute/schedule_stage.cc
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#include "gpu-compute/schedule_stage.hh"
+
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/gpu_static_inst.hh"
+#include "gpu-compute/vector_register_file.hh"
+#include "gpu-compute/wavefront.hh"
+
+ScheduleStage::ScheduleStage(const ComputeUnitParams *p)
+ : numSIMDs(p->num_SIMDs),
+ numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes)
+{
+ for (int j = 0; j < numSIMDs + numMemUnits; ++j) {
+ Scheduler newScheduler(p);
+ scheduler.push_back(newScheduler);
+ }
+}
+
+ScheduleStage::~ScheduleStage()
+{
+ scheduler.clear();
+ waveStatusList.clear();
+}
+
+void
+ScheduleStage::init(ComputeUnit *cu)
+{
+ computeUnit = cu;
+ _name = computeUnit->name() + ".ScheduleStage";
+
+ for (int j = 0; j < numSIMDs + numMemUnits; ++j) {
+ scheduler[j].bindList(&computeUnit->readyList[j]);
+ }
+
+ for (int j = 0; j < numSIMDs; ++j) {
+ waveStatusList.push_back(&computeUnit->waveStatusList[j]);
+ }
+
+ dispatchList = &computeUnit->dispatchList;
+}
+
+void
+ScheduleStage::arbitrate()
+{
+ // iterate over all Memory pipelines
+ for (int j = numSIMDs; j < numSIMDs + numMemUnits; ++j) {
+ if (dispatchList->at(j).first) {
+ Wavefront *waveToMemPipe = dispatchList->at(j).first;
+ // iterate over all execution pipelines
+ for (int i = 0; i < numSIMDs + numMemUnits; ++i) {
+ if ((i != j) && (dispatchList->at(i).first)) {
+ Wavefront *waveToExePipe = dispatchList->at(i).first;
+ // if the two selected wavefronts are mapped to the same
+ // SIMD unit then they share the VRF
+ if (waveToMemPipe->simdId == waveToExePipe->simdId) {
+ int simdId = waveToMemPipe->simdId;
+ // Read VRF port arbitration:
+                        // If there are read VRF port conflicts between a
+                        // memory instruction and another instruction, we
+                        // drop the other instruction. We don't need to
+                        // check for write VRF port conflicts because the
+                        // memory instruction either does not need to write
+                        // to the VRF (store) or will write to the VRF when
+                        // the data comes back (load), in which case the
+                        // arbiter of the memory pipes will resolve any
+                        // conflicts
+ if (computeUnit->vrf[simdId]->
+ isReadConflict(waveToMemPipe->wfSlotId,
+ waveToExePipe->wfSlotId)) {
+ // FIXME: The "second" member variable is never
+ // used in the model. I am setting it to READY
+ // simply to follow the protocol of setting it
+ // when the WF has an instruction ready to issue
+ waveStatusList[simdId]->at(waveToExePipe->wfSlotId)
+ .second = READY;
+
+ dispatchList->at(i).first = nullptr;
+ dispatchList->at(i).second = EMPTY;
+ break;
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+void
+ScheduleStage::exec()
+{
+ for (int j = 0; j < numSIMDs + numMemUnits; ++j) {
+ uint32_t readyListSize = computeUnit->readyList[j].size();
+
+ // If no wave is ready to be scheduled on the execution resource
+ // then skip scheduling for this execution resource
+ if (!readyListSize) {
+ continue;
+ }
+
+ Wavefront *waveToBeDispatched = scheduler[j].chooseWave();
+ dispatchList->at(j).first = waveToBeDispatched;
+ waveToBeDispatched->updateResources();
+ dispatchList->at(j).second = FILLED;
+
+ waveStatusList[waveToBeDispatched->simdId]->at(
+ waveToBeDispatched->wfSlotId).second = BLOCKED;
+
+ assert(computeUnit->readyList[j].size() == readyListSize - 1);
+ }
+ // arbitrate over all shared resources among instructions being issued
+ // simultaneously
+ arbitrate();
+}
+
+void
+ScheduleStage::regStats()
+{
+}
diff --git a/src/gpu-compute/schedule_stage.hh b/src/gpu-compute/schedule_stage.hh
new file mode 100644
index 000000000..26eb9a25b
--- /dev/null
+++ b/src/gpu-compute/schedule_stage.hh
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#ifndef __SCHEDULE_STAGE_HH__
+#define __SCHEDULE_STAGE_HH__
+
+#include <utility>
+#include <vector>
+
+#include "gpu-compute/exec_stage.hh"
+#include "gpu-compute/scheduler.hh"
+#include "gpu-compute/scoreboard_check_stage.hh"
+
+// Schedule or execution arbitration stage.
+// From the pool of ready waves in the ready list,
+// one wave is selected for each execution resource.
+// The selection is made based on a scheduling policy.
+
+class ComputeUnit;
+class Wavefront;
+
+struct ComputeUnitParams;
+
+class ScheduleStage
+{
+ public:
+ ScheduleStage(const ComputeUnitParams *params);
+ ~ScheduleStage();
+ void init(ComputeUnit *cu);
+ void exec();
+ void arbitrate();
+ // Stats related variables and methods
+ std::string name() { return _name; }
+ void regStats();
+
+ private:
+ ComputeUnit *computeUnit;
+ uint32_t numSIMDs;
+ uint32_t numMemUnits;
+
+ // Each execution resource will have its own
+ // scheduler and a dispatch list
+ std::vector<Scheduler> scheduler;
+
+ // Stores the status of waves. A READY implies the
+ // wave is ready to be scheduled this cycle and
+ // is already present in the readyList
+ std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>*>
+ waveStatusList;
+
+ // List of waves which will be dispatched to
+ // each execution resource. A FILLED implies
+ // dispatch list is non-empty and
+ // execution unit has something to execute
+ // this cycle. Currently, the dispatch list of
+ // an execution resource can hold only one wave because
+ // an execution resource can execute only one wave in a cycle.
+ std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> *dispatchList;
+
+ std::string _name;
+};
+
+#endif // __SCHEDULE_STAGE_HH__
diff --git a/src/gpu-compute/scheduler.cc b/src/gpu-compute/scheduler.cc
new file mode 100644
index 000000000..1cd0bfe55
--- /dev/null
+++ b/src/gpu-compute/scheduler.cc
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#include "gpu-compute/scheduler.hh"
+
+Scheduler::Scheduler(const ComputeUnitParams *p)
+{
+ if (p->execPolicy == "OLDEST-FIRST") {
+ schedPolicy = SCHED_POLICY::OF_POLICY;
+ } else if (p->execPolicy == "ROUND-ROBIN") {
+ schedPolicy = SCHED_POLICY::RR_POLICY;
+ } else {
+ fatal("Unimplemented scheduling policy");
+ }
+}
+
+Wavefront*
+Scheduler::chooseWave()
+{
+ if (schedPolicy == SCHED_POLICY::OF_POLICY) {
+ return OFSchedPolicy.chooseWave();
+ } else if (schedPolicy == SCHED_POLICY::RR_POLICY) {
+ return RRSchedPolicy.chooseWave();
+ } else {
+ fatal("Unimplemented scheduling policy");
+ }
+}
+
+void
+Scheduler::bindList(std::vector<Wavefront*> *list)
+{
+ if (schedPolicy == SCHED_POLICY::OF_POLICY) {
+ OFSchedPolicy.bindList(list);
+ } else if (schedPolicy == SCHED_POLICY::RR_POLICY) {
+ RRSchedPolicy.bindList(list);
+ } else {
+ fatal("Unimplemented scheduling policy");
+ }
+}
diff --git a/src/gpu-compute/scheduler.hh b/src/gpu-compute/scheduler.hh
new file mode 100644
index 000000000..148ec9425
--- /dev/null
+++ b/src/gpu-compute/scheduler.hh
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#ifndef __SCHEDULER_HH__
+#define __SCHEDULER_HH__
+
+#include "gpu-compute/of_scheduling_policy.hh"
+#include "gpu-compute/rr_scheduling_policy.hh"
+#include "gpu-compute/scheduling_policy.hh"
+#include "params/ComputeUnit.hh"
+
+enum SCHED_POLICY
+{
+ OF_POLICY = 0,
+ RR_POLICY
+};
+
+class Scheduler
+{
+ public:
+ Scheduler(const ComputeUnitParams *params);
+ Wavefront *chooseWave();
+ void bindList(std::vector<Wavefront*> *list);
+
+ private:
+ SCHED_POLICY schedPolicy;
+ SchedulingPolicy<RRSchedulingPolicy> RRSchedPolicy;
+ SchedulingPolicy<OFSchedulingPolicy> OFSchedPolicy;
+};
+
+#endif // __SCHEDULER_HH__
diff --git a/src/gpu-compute/scheduling_policy.hh b/src/gpu-compute/scheduling_policy.hh
new file mode 100644
index 000000000..b5e923c62
--- /dev/null
+++ b/src/gpu-compute/scheduling_policy.hh
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#ifndef __SCHEDULING_POLICY_HH__
+#define __SCHEDULING_POLICY_HH__
+
+#include <vector>
+
+template<typename Impl>
+class SchedulingPolicy
+{
+ public:
+ Wavefront* chooseWave() { return policyImpl.chooseWave(); }
+
+ void
+ bindList(std::vector<Wavefront*> *list)
+ {
+ return policyImpl.bindList(list);
+ }
+
+ private:
+ Impl policyImpl;
+};
+
+#endif // __SCHEDULING_POLICY_HH__
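
Annotation: the wrapper above gets its polymorphism from the template parameter rather than from a virtual base class, so the per-cycle chooseWave() call is resolved at compile time. A minimal usage sketch under that assumption (GreedyPolicy is a made-up stand-in for OFSchedulingPolicy or RRSchedulingPolicy, and readyWaves is a hypothetical ready list):

    #include <vector>

    class Wavefront;  // forward declaration, as in the headers above

    // Any type with these two members can be plugged into
    // SchedulingPolicy<Impl>; no common base class is required.
    struct GreedyPolicy
    {
        std::vector<Wavefront*> *list = nullptr;

        void bindList(std::vector<Wavefront*> *l) { list = l; }

        Wavefront*
        chooseWave()
        {
            return (list && !list->empty()) ? list->front() : nullptr;
        }
    };

    // Usage (assuming the SchedulingPolicy template above is visible):
    //     SchedulingPolicy<GreedyPolicy> sched;
    //     sched.bindList(&readyWaves);
    //     Wavefront *next = sched.chooseWave();
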
diff --git a/src/gpu-compute/scoreboard_check_stage.cc b/src/gpu-compute/scoreboard_check_stage.cc
new file mode 100644
index 000000000..0d856a9b0
--- /dev/null
+++ b/src/gpu-compute/scoreboard_check_stage.cc
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#include "gpu-compute/scoreboard_check_stage.hh"
+
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/gpu_static_inst.hh"
+#include "gpu-compute/shader.hh"
+#include "gpu-compute/wavefront.hh"
+#include "params/ComputeUnit.hh"
+
+ScoreboardCheckStage::ScoreboardCheckStage(const ComputeUnitParams *p)
+ : numSIMDs(p->num_SIMDs),
+ numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes),
+ numGlbMemPipes(p->num_global_mem_pipes),
+ numShrMemPipes(p->num_shared_mem_pipes),
+ vectorAluInstAvail(nullptr),
+ lastGlbMemSimd(-1),
+ lastShrMemSimd(-1), glbMemInstAvail(nullptr),
+ shrMemInstAvail(nullptr)
+{
+}
+
+ScoreboardCheckStage::~ScoreboardCheckStage()
+{
+ readyList.clear();
+ waveStatusList.clear();
+ shrMemInstAvail = nullptr;
+ glbMemInstAvail = nullptr;
+}
+
+void
+ScoreboardCheckStage::init(ComputeUnit *cu)
+{
+ computeUnit = cu;
+ _name = computeUnit->name() + ".ScoreboardCheckStage";
+
+ for (int unitId = 0; unitId < numSIMDs + numMemUnits; ++unitId) {
+ readyList.push_back(&computeUnit->readyList[unitId]);
+ }
+
+ for (int unitId = 0; unitId < numSIMDs; ++unitId) {
+ waveStatusList.push_back(&computeUnit->waveStatusList[unitId]);
+ }
+
+ vectorAluInstAvail = &computeUnit->vectorAluInstAvail;
+    glbMemInstAvail = &computeUnit->glbMemInstAvail;
+    shrMemInstAvail = &computeUnit->shrMemInstAvail;
+}
+
+void
+ScoreboardCheckStage::initStatistics()
+{
+ lastGlbMemSimd = -1;
+ lastShrMemSimd = -1;
+ *glbMemInstAvail = 0;
+ *shrMemInstAvail = 0;
+
+ for (int unitId = 0; unitId < numSIMDs; ++unitId)
+ vectorAluInstAvail->at(unitId) = false;
+}
+
+void
+ScoreboardCheckStage::collectStatistics(Wavefront *curWave, int unitId)
+{
+ if (curWave->instructionBuffer.empty())
+ return;
+
+ // track which vector SIMD unit has at least one WV with a vector
+ // ALU as the oldest instruction in its Instruction buffer
+ vectorAluInstAvail->at(unitId) = vectorAluInstAvail->at(unitId) ||
+ curWave->isOldestInstALU();
+
+ // track how many vector SIMD units have at least one WV with a
+ // vector Global memory instruction as the oldest instruction
+ // in its Instruction buffer
+ if ((curWave->isOldestInstGMem() || curWave->isOldestInstPrivMem() ||
+ curWave->isOldestInstFlatMem()) && lastGlbMemSimd != unitId &&
+ *glbMemInstAvail <= 1) {
+ (*glbMemInstAvail)++;
+ lastGlbMemSimd = unitId;
+ }
+
+ // track how many vector SIMD units have at least one WV with a
+ // vector shared memory (LDS) instruction as the oldest instruction
+ // in its Instruction buffer
+ // TODO: parametrize the limit of the LDS units
+ if (curWave->isOldestInstLMem() && (*shrMemInstAvail <= numShrMemPipes) &&
+ lastShrMemSimd != unitId) {
+ (*shrMemInstAvail)++;
+ lastShrMemSimd = unitId;
+ }
+}
+
+void
+ScoreboardCheckStage::exec()
+{
+ initStatistics();
+
+ // reset the ready list for all execution units; it will be
+ // constructed every cycle since resource availability may change
+ for (int unitId = 0; unitId < numSIMDs + numMemUnits; ++unitId) {
+ readyList[unitId]->clear();
+ }
+
+ // iterate over the Wavefronts of all SIMD units
+ for (int unitId = 0; unitId < numSIMDs; ++unitId) {
+ for (int wvId = 0; wvId < computeUnit->shader->n_wf; ++wvId) {
+ // reset the ready status of each wavefront
+ waveStatusList[unitId]->at(wvId).second = BLOCKED;
+ Wavefront *curWave = waveStatusList[unitId]->at(wvId).first;
+ collectStatistics(curWave, unitId);
+
+ if (curWave->ready(Wavefront::I_ALU)) {
+ readyList[unitId]->push_back(curWave);
+ waveStatusList[unitId]->at(wvId).second = READY;
+ } else if (curWave->ready(Wavefront::I_GLOBAL)) {
+ if (computeUnit->cedeSIMD(unitId, wvId)) {
+ continue;
+ }
+
+ readyList[computeUnit->GlbMemUnitId()]->push_back(curWave);
+ waveStatusList[unitId]->at(wvId).second = READY;
+ } else if (curWave->ready(Wavefront::I_SHARED)) {
+ readyList[computeUnit->ShrMemUnitId()]->push_back(curWave);
+ waveStatusList[unitId]->at(wvId).second = READY;
+ } else if (curWave->ready(Wavefront::I_FLAT)) {
+ readyList[computeUnit->GlbMemUnitId()]->push_back(curWave);
+ waveStatusList[unitId]->at(wvId).second = READY;
+ } else if (curWave->ready(Wavefront::I_PRIVATE)) {
+ readyList[computeUnit->GlbMemUnitId()]->push_back(curWave);
+ waveStatusList[unitId]->at(wvId).second = READY;
+ }
+ }
+ }
+}
+
+void
+ScoreboardCheckStage::regStats()
+{
+}
diff --git a/src/gpu-compute/scoreboard_check_stage.hh b/src/gpu-compute/scoreboard_check_stage.hh
new file mode 100644
index 000000000..099597afb
--- /dev/null
+++ b/src/gpu-compute/scoreboard_check_stage.hh
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#ifndef __SCOREBOARD_CHECK_STAGE_HH__
+#define __SCOREBOARD_CHECK_STAGE_HH__
+
+#include <cstdint>
+#include <string>
+#include <utility>
+#include <vector>
+
+class ComputeUnit;
+class Wavefront;
+
+struct ComputeUnitParams;
+
+enum WAVE_STATUS
+{
+ BLOCKED = 0,
+ READY
+};
+
+/*
+ * Scoreboard check stage.
+ * All wavefronts are analyzed to see if they are ready
+ * to be executed this cycle. Both structural and data
+ * hazards are considered while marking a wave "ready"
+ * for execution. After analysis, the ready waves are
+ * added to readyList.
+ */
+class ScoreboardCheckStage
+{
+ public:
+ ScoreboardCheckStage(const ComputeUnitParams* params);
+ ~ScoreboardCheckStage();
+ void init(ComputeUnit *cu);
+ void exec();
+
+ // Stats related variables and methods
+ const std::string& name() const { return _name; }
+ void regStats();
+
+ private:
+ void collectStatistics(Wavefront *curWave, int unitId);
+ void initStatistics();
+ ComputeUnit *computeUnit;
+ uint32_t numSIMDs;
+ uint32_t numMemUnits;
+ uint32_t numGlbMemPipes;
+ uint32_t numShrMemPipes;
+
+ // flag per vector SIMD unit that is set when there is at least one
+ // WF that has a vector ALU instruction as the oldest in its
+ // Instruction Buffer
+ std::vector<bool> *vectorAluInstAvail;
+ int lastGlbMemSimd;
+ int lastShrMemSimd;
+
+ int *glbMemInstAvail;
+ int *shrMemInstAvail;
+ // List of waves which are ready to be scheduled.
+ // Each execution resource has a ready list
+ std::vector<std::vector<Wavefront*>*> readyList;
+
+ // Stores the status of waves. A READY implies the
+ // wave is ready to be scheduled this cycle and
+ // is already present in the readyList
+ std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>*>
+ waveStatusList;
+
+ std::string _name;
+};
+
+#endif // __SCOREBOARD_CHECK_STAGE_HH__
diff --git a/src/gpu-compute/shader.cc b/src/gpu-compute/shader.cc
new file mode 100644
index 000000000..e8d7946ff
--- /dev/null
+++ b/src/gpu-compute/shader.cc
@@ -0,0 +1,412 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#include "gpu-compute/shader.hh"
+
+#include <limits>
+
+#include "arch/x86/linux/linux.hh"
+#include "base/chunk_generator.hh"
+#include "debug/GPUDisp.hh"
+#include "debug/GPUMem.hh"
+#include "debug/HSAIL.hh"
+#include "gpu-compute/dispatcher.hh"
+#include "gpu-compute/gpu_static_inst.hh"
+#include "gpu-compute/qstruct.hh"
+#include "gpu-compute/wavefront.hh"
+#include "mem/packet.hh"
+#include "mem/ruby/system/RubySystem.hh"
+#include "sim/sim_exit.hh"
+
+Shader::Shader(const Params *p) : SimObject(p),
+ clock(p->clk_domain->clockPeriod()), cpuThread(nullptr), gpuTc(nullptr),
+ cpuPointer(p->cpu_pointer), tickEvent(this), timingSim(p->timing),
+ hsail_mode(SIMT), impl_kern_boundary_sync(p->impl_kern_boundary_sync),
+ separate_acquire_release(p->separate_acquire_release), coissue_return(1),
+ trace_vgpr_all(1), n_cu((p->CUs).size()), n_wf(p->n_wf),
+ globalMemSize(p->globalmem), nextSchedCu(0), sa_n(0), tick_cnt(0),
+ box_tick_cnt(0), start_tick_cnt(0)
+{
+
+ cuList.resize(n_cu);
+
+ for (int i = 0; i < n_cu; ++i) {
+ cuList[i] = p->CUs[i];
+ assert(i == cuList[i]->cu_id);
+ cuList[i]->shader = this;
+ }
+}
+
+Addr
+Shader::mmap(int length)
+{
+
+ Addr start;
+
+ // round up length to the next page
+ length = roundUp(length, TheISA::PageBytes);
+
+ if (X86Linux64::mmapGrowsDown()) {
+ DPRINTF(HSAIL, "GROWS DOWN");
+        start = gpuTc->getProcessPtr()->mmap_end - length;
+ gpuTc->getProcessPtr()->mmap_end = start;
+ } else {
+ DPRINTF(HSAIL, "GROWS UP");
+ start = gpuTc->getProcessPtr()->mmap_end;
+ gpuTc->getProcessPtr()->mmap_end += length;
+
+ // assertion to make sure we don't overwrite the stack (it grows down)
+ assert(gpuTc->getProcessPtr()->mmap_end <
+ gpuTc->getProcessPtr()->stack_base -
+ gpuTc->getProcessPtr()->max_stack_size);
+
+ }
+
+    DPRINTF(HSAIL, "Shader::mmap start = %#x, length = %#x\n", start, length);
+
+    gpuTc->getProcessPtr()->allocateMem(start, length);
+
+ return start;
+}
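
Annotation: a hedged worked example of the arithmetic above (all numbers are made up, assuming TheISA::PageBytes == 4096): a request for 6000 bytes is rounded up to 8192, and in the grows-down case the region is carved out just below the old mmap_end.

    #include <cassert>
    #include <cstdint>

    // Round v up to the next multiple of a power-of-two alignment,
    // mirroring the roundUp() call used above.
    static inline uint64_t roundUpTo(uint64_t v, uint64_t align)
    {
        return (v + align - 1) & ~(align - 1);
    }

    int main()
    {
        const uint64_t pageBytes = 4096;            // assumed page size
        uint64_t mmapEnd = 0x7f0000000000ULL;       // made-up current mmap_end
        uint64_t length = roundUpTo(6000, pageBytes);

        uint64_t start = mmapEnd - length;          // grows-down case
        mmapEnd = start;                            // new top of the mmap region
        (void)mmapEnd;                              // silence unused warnings

        assert(length == 8192 && start == 0x7effffffe000ULL);
        return 0;
    }
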
+
+void
+Shader::init()
+{
+ // grab the threadContext of the thread running on the CPU
+ assert(cpuPointer);
+ gpuTc = cpuPointer->getContext(0);
+ assert(gpuTc);
+}
+
+Shader::~Shader()
+{
+ for (int j = 0; j < n_cu; ++j)
+ delete cuList[j];
+}
+
+void
+Shader::updateThreadContext(int tid) {
+ // thread context of the thread which dispatched work
+ assert(cpuPointer);
+ gpuTc = cpuPointer->getContext(tid);
+ assert(gpuTc);
+}
+
+void
+Shader::hostWakeUp(BaseCPU *cpu) {
+ if (cpuPointer == cpu) {
+ if (gpuTc->status() == ThreadContext::Suspended)
+ cpu->activateContext(gpuTc->threadId());
+ } else {
+        // Make sure both the dispatcher and the shader are trying to
+        // wake up the same host. Hack here to enable kernel launch
+        // from multiple CPUs.
+ panic("Dispatcher wants to wakeup a different host");
+ }
+}
+
+Shader*
+ShaderParams::create()
+{
+ return new Shader(this);
+}
+
+void
+Shader::exec()
+{
+ tick_cnt = curTick();
+ box_tick_cnt = curTick() - start_tick_cnt;
+
+ // apply any scheduled adds
+ for (int i = 0; i < sa_n; ++i) {
+ if (sa_when[i] <= tick_cnt) {
+ *sa_val[i] += sa_x[i];
+ sa_val.erase(sa_val.begin() + i);
+ sa_x.erase(sa_x.begin() + i);
+ sa_when.erase(sa_when.begin() + i);
+ --sa_n;
+ --i;
+ }
+ }
+
+ // clock all of the cu's
+ for (int i = 0; i < n_cu; ++i)
+ cuList[i]->exec();
+}
+
+bool
+Shader::dispatch_workgroups(NDRange *ndr)
+{
+ bool scheduledSomething = false;
+ int cuCount = 0;
+ int curCu = nextSchedCu;
+
+ while (cuCount < n_cu) {
+        // Every time we try a CU, update nextSchedCu
+ nextSchedCu = (nextSchedCu + 1) % n_cu;
+
+        // dispatch a workgroup iff the following two conditions are met:
+        // (a) wg_disp_rem is true - there are unassigned workgroups
+        //     in the grid
+        // (b) cuList[curCu] has enough free slots for this workgroup
+ if (ndr->wg_disp_rem && cuList[curCu]->ReadyWorkgroup(ndr)) {
+ scheduledSomething = true;
+ DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d\n", curCu);
+
+ // ticks() member function translates cycles to simulation ticks.
+ if (!tickEvent.scheduled()) {
+ schedule(tickEvent, curTick() + this->ticks(1));
+ }
+
+ cuList[curCu]->StartWorkgroup(ndr);
+ ndr->wgId[0]++;
+ ndr->globalWgId++;
+ if (ndr->wgId[0] * ndr->q.wgSize[0] >= ndr->q.gdSize[0]) {
+ ndr->wgId[0] = 0;
+ ndr->wgId[1]++;
+
+ if (ndr->wgId[1] * ndr->q.wgSize[1] >= ndr->q.gdSize[1]) {
+ ndr->wgId[1] = 0;
+ ndr->wgId[2]++;
+
+ if (ndr->wgId[2] * ndr->q.wgSize[2] >= ndr->q.gdSize[2]) {
+ ndr->wg_disp_rem = false;
+ break;
+ }
+ }
+ }
+ }
+
+ ++cuCount;
+ curCu = nextSchedCu;
+ }
+
+ return scheduledSomething;
+}
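
Annotation: the nested wgId updates above implement an odometer-style walk over the 3D grid: dimension 0 advances first, and when wgId[d] * wgSize[d] reaches gdSize[d] that dimension wraps and the next one increments. A standalone sketch of the same stepping (all field names and sizes here are made up):

    #include <cstdio>

    // Odometer-style walk over a 3D grid of workgroups, mirroring the
    // wgId update above.
    struct GridSketch
    {
        int wgId[3]   = {0, 0, 0};    // current workgroup index per dim
        int wgSize[3] = {64, 1, 1};   // work-items per workgroup
        int gdSize[3] = {256, 2, 1};  // work-items per grid dimension
        bool dispRem  = true;         // workgroups still left to dispatch

        // Advance to the next workgroup; clear dispRem once the last
        // workgroup in the grid has been consumed.
        void step()
        {
            if (++wgId[0] * wgSize[0] >= gdSize[0]) {
                wgId[0] = 0;
                if (++wgId[1] * wgSize[1] >= gdSize[1]) {
                    wgId[1] = 0;
                    if (++wgId[2] * wgSize[2] >= gdSize[2])
                        dispRem = false;
                }
            }
        }
    };

    int main()
    {
        GridSketch g;
        int dispatched = 0;
        while (g.dispRem) { g.step(); ++dispatched; }
        std::printf("dispatched %d workgroups\n", dispatched);  // prints 8
        return 0;
    }
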
+
+void
+Shader::handshake(GpuDispatcher *_dispatcher)
+{
+ dispatcher = _dispatcher;
+}
+
+void
+Shader::doFunctionalAccess(RequestPtr req, MemCmd cmd, void *data,
+ bool suppress_func_errors, int cu_id)
+{
+ unsigned block_size = RubySystem::getBlockSizeBytes();
+ unsigned size = req->getSize();
+
+ Addr tmp_addr;
+ BaseTLB::Mode trans_mode;
+
+ if (cmd == MemCmd::ReadReq) {
+ trans_mode = BaseTLB::Read;
+ } else if (cmd == MemCmd::WriteReq) {
+ trans_mode = BaseTLB::Write;
+ } else {
+        fatal("unexpected MemCmd\n");
+ }
+
+ tmp_addr = req->getVaddr();
+ Addr split_addr = roundDown(tmp_addr + size - 1, block_size);
+
+ assert(split_addr <= tmp_addr || split_addr - tmp_addr < block_size);
+
+ // Misaligned access
+ if (split_addr > tmp_addr) {
+ RequestPtr req1, req2;
+ req->splitOnVaddr(split_addr, req1, req2);
+
+
+ PacketPtr pkt1 = new Packet(req2, cmd);
+ PacketPtr pkt2 = new Packet(req1, cmd);
+
+ functionalTLBAccess(pkt1, cu_id, trans_mode);
+ functionalTLBAccess(pkt2, cu_id, trans_mode);
+
+ PacketPtr new_pkt1 = new Packet(pkt1->req, cmd);
+ PacketPtr new_pkt2 = new Packet(pkt2->req, cmd);
+
+ new_pkt1->dataStatic(data);
+ new_pkt2->dataStatic((uint8_t*)data + req1->getSize());
+
+ if (suppress_func_errors) {
+ new_pkt1->setSuppressFuncError();
+ new_pkt2->setSuppressFuncError();
+ }
+
+ // fixme: this should be cuList[cu_id] if cu_id != n_cu
+ // The latter requires a memPort in the dispatcher
+ cuList[0]->memPort[0]->sendFunctional(new_pkt1);
+ cuList[0]->memPort[0]->sendFunctional(new_pkt2);
+
+ delete new_pkt1;
+ delete new_pkt2;
+ delete pkt1;
+ delete pkt2;
+ } else {
+ PacketPtr pkt = new Packet(req, cmd);
+ functionalTLBAccess(pkt, cu_id, trans_mode);
+ PacketPtr new_pkt = new Packet(pkt->req, cmd);
+ new_pkt->dataStatic(data);
+
+ if (suppress_func_errors) {
+ new_pkt->setSuppressFuncError();
+        }
+
+ // fixme: this should be cuList[cu_id] if cu_id != n_cu
+ // The latter requires a memPort in the dispatcher
+ cuList[0]->memPort[0]->sendFunctional(new_pkt);
+
+ delete new_pkt;
+ delete pkt;
+ }
+}
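
Annotation: the split test above works on the last byte of the access: split_addr is the start of the cache block that contains byte vaddr + size - 1, so split_addr > vaddr exactly when the access straddles a block boundary. A small hedged sketch of that check, with a made-up block size and addresses:

    #include <cassert>
    #include <cstdint>

    // roundDown to a power-of-two alignment, as used above.
    static inline uint64_t roundDownTo(uint64_t v, uint64_t align)
    {
        return v & ~(align - 1);
    }

    int main()
    {
        const uint64_t blockSize = 64;  // assumed Ruby block size in bytes
        uint64_t vaddr = 0x103c;        // starts 4 bytes below a block boundary
        uint64_t size = 8;

        uint64_t splitAddr = roundDownTo(vaddr + size - 1, blockSize);  // 0x1040
        bool straddles = splitAddr > vaddr;  // true: issue two packets

        // first piece covers [vaddr, splitAddr), second [splitAddr, vaddr + size)
        assert(straddles && splitAddr - vaddr == 4);
        return 0;
    }
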
+
+bool
+Shader::busy()
+{
+ for (int i_cu = 0; i_cu < n_cu; ++i_cu) {
+ if (!cuList[i_cu]->isDone()) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+void
+Shader::ScheduleAdd(uint32_t *val, Tick when, int x)
+{
+ sa_val.push_back(val);
+ sa_when.push_back(tick_cnt + when);
+ sa_x.push_back(x);
+ ++sa_n;
+}
+
+Shader::TickEvent::TickEvent(Shader *_shader)
+ : Event(CPU_Tick_Pri), shader(_shader)
+{
+}
+
+
+void
+Shader::TickEvent::process()
+{
+ if (shader->busy()) {
+ shader->exec();
+ shader->schedule(this, curTick() + shader->ticks(1));
+ }
+}
+
+const char*
+Shader::TickEvent::description() const
+{
+ return "Shader tick";
+}
+
+void
+Shader::AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
+ MemCmd cmd, bool suppress_func_errors)
+{
+ uint8_t *data_buf = (uint8_t*)ptr;
+
+ for (ChunkGenerator gen(address, size, RubySystem::getBlockSizeBytes());
+ !gen.done(); gen.next()) {
+ Request *req = new Request(0, gen.addr(), gen.size(), 0,
+ cuList[0]->masterId(), 0, 0, 0);
+
+ doFunctionalAccess(req, cmd, data_buf, suppress_func_errors, cu_id);
+ data_buf += gen.size();
+ delete req;
+ }
+}
+
+void
+Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id)
+{
+ AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, false);
+}
+
+void
+Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
+ bool suppress_func_errors)
+{
+ AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, suppress_func_errors);
+}
+
+void
+Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id)
+{
+ AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq, false);
+}
+
+void
+Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
+ bool suppress_func_errors)
+{
+ AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq,
+ suppress_func_errors);
+}
+
+/*
+ * Send a packet through the appropriate TLB functional port.
+ * If cu_id=n_cu, then this is the dispatcher's TLB.
+ * Otherwise it's the TLB of the cu_id compute unit.
+ */
+void
+Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode)
+{
+ // update senderState. Need to know the gpuTc and the TLB mode
+ pkt->senderState =
+ new TheISA::GpuTLB::TranslationState(mode, gpuTc, false);
+
+ if (cu_id == n_cu) {
+ dispatcher->tlbPort->sendFunctional(pkt);
+ } else {
+        // Even when the perLaneTLB flag is turned on it is OK to send
+        // all accesses through lane 0, since the lane # is not known
+        // here. This isn't important since these are functional accesses.
+ cuList[cu_id]->tlbPort[0]->sendFunctional(pkt);
+ }
+
+ /* safe_cast the senderState */
+ TheISA::GpuTLB::TranslationState *sender_state =
+ safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
+
+ delete sender_state->tlbEntry;
+ delete pkt->senderState;
+}
diff --git a/src/gpu-compute/shader.hh b/src/gpu-compute/shader.hh
new file mode 100644
index 000000000..91ea8aae0
--- /dev/null
+++ b/src/gpu-compute/shader.hh
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#ifndef __SHADER_HH__
+#define __SHADER_HH__
+
+#include <functional>
+#include <string>
+
+#include "arch/isa.hh"
+#include "arch/isa_traits.hh"
+#include "base/types.hh"
+#include "cpu/simple/atomic.hh"
+#include "cpu/simple/timing.hh"
+#include "cpu/simple_thread.hh"
+#include "cpu/thread_context.hh"
+#include "cpu/thread_state.hh"
+#include "enums/MemOpType.hh"
+#include "enums/MemType.hh"
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/gpu_tlb.hh"
+#include "gpu-compute/lds_state.hh"
+#include "gpu-compute/qstruct.hh"
+#include "mem/page_table.hh"
+#include "mem/port.hh"
+#include "mem/request.hh"
+#include "params/Shader.hh"
+#include "sim/faults.hh"
+#include "sim/process.hh"
+#include "sim/sim_object.hh"
+
+class BaseTLB;
+class GpuDispatcher;
+
+namespace TheISA
+{
+ class GpuTLB;
+}
+
+static const int LDS_SIZE = 65536;
+
+// Class Shader: This describes a single shader instance. Most
+// configurations will only have a single shader.
+
+class Shader : public SimObject
+{
+ protected:
+ // Shader's clock period in terms of number of ticks of curTime,
+ // aka global simulation clock
+ Tick clock;
+
+ public:
+ typedef ShaderParams Params;
+    enum hsail_mode_e {SIMT, VECTOR_SCALAR};
+
+    // clock-related functions; map to and from
+    // simulation ticks and shader clock cycles.
+ Tick frequency() const { return SimClock::Frequency / clock; }
+
+ Tick ticks(int numCycles) const { return (Tick)clock * numCycles; }
+
+ Tick getClock() const { return clock; }
+ Tick curCycle() const { return curTick() / clock; }
+ Tick tickToCycles(Tick val) const { return val / clock;}
+
+
+ SimpleThread *cpuThread;
+ ThreadContext *gpuTc;
+ BaseCPU *cpuPointer;
+
+ class TickEvent : public Event
+ {
+ private:
+ Shader *shader;
+
+ public:
+ TickEvent(Shader*);
+ void process();
+ const char* description() const;
+ };
+
+ TickEvent tickEvent;
+
+ // is this simulation going to be timing mode in the memory?
+ bool timingSim;
+ hsail_mode_e hsail_mode;
+
+ // If set, issue acq packet @ kernel launch
+ int impl_kern_boundary_sync;
+ // If set, generate a separate packet for acquire/release on
+ // ld_acquire/st_release/atomic operations
+ int separate_acquire_release;
+ // If set, fetch returns may be coissued with instructions
+ int coissue_return;
+ // If set, always dump all 64 gprs to trace
+ int trace_vgpr_all;
+    // Number of compute units (CUs) in the shader
+ int n_cu;
+ // Number of wavefront slots per cu
+ int n_wf;
+ // The size of global memory
+ int globalMemSize;
+
+ /*
+ * Bytes/work-item for call instruction
+ * The number of arguments for an hsail function will
+ * vary. We simply determine the maximum # of arguments
+ * required by any hsail function up front before the
+ * simulation (during parsing of the Brig) and record
+ * that number here.
+ */
+ int funcargs_size;
+
+    // Tracks which CU the round-robin dispatcher should try to schedule next
+ int nextSchedCu;
+
+ // Size of scheduled add queue
+ uint32_t sa_n;
+
+    // Pointers to the values to be incremented
+ std::vector<uint32_t*> sa_val;
+ // When to do the increment
+ std::vector<uint64_t> sa_when;
+ // Amount to increment by
+ std::vector<int32_t> sa_x;
+
+ // List of Compute Units (CU's)
+ std::vector<ComputeUnit*> cuList;
+
+ uint64_t tick_cnt;
+ uint64_t box_tick_cnt;
+ uint64_t start_tick_cnt;
+
+ GpuDispatcher *dispatcher;
+
+ Shader(const Params *p);
+ ~Shader();
+ virtual void init();
+
+ // Run shader
+ void exec();
+
+ // Check to see if shader is busy
+ bool busy();
+
+ // Schedule a 32-bit value to be incremented some time in the future
+ void ScheduleAdd(uint32_t *val, Tick when, int x);
+ bool processTimingPacket(PacketPtr pkt);
+
+ void AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
+ MemCmd cmd, bool suppress_func_errors);
+
+ void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id);
+
+ void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id,
+ bool suppress_func_errors);
+
+ void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id);
+
+ void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id,
+ bool suppress_func_errors);
+
+ void doFunctionalAccess(RequestPtr req, MemCmd cmd, void *data,
+ bool suppress_func_errors, int cu_id);
+
+ void
+ registerCU(int cu_id, ComputeUnit *compute_unit)
+ {
+ cuList[cu_id] = compute_unit;
+ }
+
+ void handshake(GpuDispatcher *dispatcher);
+ bool dispatch_workgroups(NDRange *ndr);
+ Addr mmap(int length);
+ void functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode);
+ void updateThreadContext(int tid);
+ void hostWakeUp(BaseCPU *cpu);
+};
+
+#endif // __SHADER_HH__
diff --git a/src/gpu-compute/simple_pool_manager.cc b/src/gpu-compute/simple_pool_manager.cc
new file mode 100644
index 000000000..0e35ab9cc
--- /dev/null
+++ b/src/gpu-compute/simple_pool_manager.cc
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos
+ */
+
+#include "gpu-compute/simple_pool_manager.hh"
+
+#include "base/misc.hh"
+
+// return the min number of elements that the manager can reserve given
+// a request for "size" elements
+uint32_t
+SimplePoolManager::minAllocatedElements(uint32_t size)
+{
+ fatal_if(size <= 0 || size > poolSize(), "Illegal VGPR region size=%d\n",
+ size);
+
+ return size % minAllocation() > 0 ?
+ (minAllocation() - (size % minAllocation())) + size : size;
+}
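
Annotation: a hedged worked example of the rounding above, assuming minAllocation() == 4 registers: a request for size == 10 gives 10 % 4 == 2, so (4 - 2) + 10 == 12 registers are reserved; a request for size == 8 is already a multiple of 4 and is returned unchanged.
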
+
+std::string
+SimplePoolManager::printRegion()
+{
+ std::string _cout;
+ if (_reservedGroups == 0)
+ _cout = "VRF is empty\n";
+ else if (_reservedGroups > 0) {
+ uint32_t reservedEntries = _reservedGroups * _regionSize;
+ _cout = "VRF reserves " + std::to_string(reservedEntries) + " VGPRs\n";
+ }
+
+ return _cout;
+}
+
+bool
+SimplePoolManager::canAllocate(uint32_t numRegions, uint32_t size)
+{
+ assert(numRegions * minAllocatedElements(size) <= poolSize());
+
+ return _reservedGroups == 0;
+}
+
+void
+SimplePoolManager::freeRegion(uint32_t firstIdx, uint32_t lastIdx)
+{
+ assert(_reservedGroups > 0);
+ --_reservedGroups;
+
+ if (!_reservedGroups)
+ _nxtFreeIdx = 0;
+}
+
+uint32_t
+SimplePoolManager::allocateRegion(const uint32_t size,
+ uint32_t *reservedPoolSize)
+{
+ uint32_t actualSize = minAllocatedElements(size);
+ uint32_t startIdx = _nxtFreeIdx;
+ _nxtFreeIdx += actualSize;
+ _regionSize = actualSize;
+ assert(_nxtFreeIdx < poolSize());
+ *reservedPoolSize = actualSize;
+ ++_reservedGroups;
+
+ return startIdx;
+}
+
+uint32_t
+SimplePoolManager::regionSize(std::pair<uint32_t, uint32_t> &region)
+{
+ bool wrapAround = (region.first > region.second);
+ if (!wrapAround) {
+ return region.second - region.first + 1;
+ } else {
+ return region.second + poolSize() - region.first + 1;
+ }
+}
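
Annotation: a hedged worked example of the wrap-around case above, assuming poolSize() == 256: a non-wrapping region (16, 31) spans 31 - 16 + 1 == 16 registers, while a wrapped region (248, 7) spans 7 + 256 - 248 + 1 == 16 registers as well.
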
diff --git a/src/gpu-compute/simple_pool_manager.hh b/src/gpu-compute/simple_pool_manager.hh
new file mode 100644
index 000000000..1d4174da8
--- /dev/null
+++ b/src/gpu-compute/simple_pool_manager.hh
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos
+ */
+
+#ifndef __SIMPLE_POOL_MANAGER_HH__
+#define __SIMPLE_POOL_MANAGER_HH__
+
+#include <cassert>
+#include <cstdint>
+
+#include "gpu-compute/pool_manager.hh"
+
+// Simple Pool Manager: allows one region per pool. No region merging is
+// supported.
+class SimplePoolManager : public PoolManager
+{
+ public:
+ SimplePoolManager(uint32_t minAlloc, uint32_t poolSize)
+ : PoolManager(minAlloc, poolSize), _regionSize(0), _nxtFreeIdx(0),
+ _reservedGroups(0)
+ {
+ }
+
+ uint32_t minAllocatedElements(uint32_t size);
+ std::string printRegion();
+ bool canAllocate(uint32_t numRegions, uint32_t size);
+ uint32_t allocateRegion(const uint32_t size, uint32_t *reservedPoolSize);
+ void freeRegion(uint32_t firstIdx, uint32_t lastIdx);
+ uint32_t regionSize(std::pair<uint32_t,uint32_t> &region);
+
+ private:
+ // actual size of a region (normalized to the minimum size that can
+ // be reserved)
+ uint32_t _regionSize;
+ // next index to allocate a region
+ uint8_t _nxtFreeIdx;
+ // number of groups that reserve a region
+ uint32_t _reservedGroups;
+};
+
+#endif // __SIMPLE_POOL_MANAGER_HH__
diff --git a/src/gpu-compute/tlb_coalescer.cc b/src/gpu-compute/tlb_coalescer.cc
new file mode 100644
index 000000000..835d7b740
--- /dev/null
+++ b/src/gpu-compute/tlb_coalescer.cc
@@ -0,0 +1,583 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Lisa Hsu
+ */
+
+#include "gpu-compute/tlb_coalescer.hh"
+
+#include <cstring>
+
+#include "debug/GPUTLB.hh"
+
+TLBCoalescer::TLBCoalescer(const Params *p) : MemObject(p),
+ clock(p->clk_domain->clockPeriod()), TLBProbesPerCycle(p->probesPerCycle),
+ coalescingWindow(p->coalescingWindow),
+ disableCoalescing(p->disableCoalescing), probeTLBEvent(this),
+ cleanupEvent(this)
+{
+ // create the slave ports based on the number of connected ports
+ for (size_t i = 0; i < p->port_slave_connection_count; ++i) {
+ cpuSidePort.push_back(new CpuSidePort(csprintf("%s-port%d", name(), i),
+ this, i));
+ }
+
+ // create the master ports based on the number of connected ports
+ for (size_t i = 0; i < p->port_master_connection_count; ++i) {
+ memSidePort.push_back(new MemSidePort(csprintf("%s-port%d", name(), i),
+ this, i));
+ }
+}
+
+BaseSlavePort&
+TLBCoalescer::getSlavePort(const std::string &if_name, PortID idx)
+{
+ if (if_name == "slave") {
+ if (idx >= static_cast<PortID>(cpuSidePort.size())) {
+ panic("TLBCoalescer::getSlavePort: unknown index %d\n", idx);
+ }
+
+ return *cpuSidePort[idx];
+ } else {
+ panic("TLBCoalescer::getSlavePort: unknown port %s\n", if_name);
+ }
+}
+
+BaseMasterPort&
+TLBCoalescer::getMasterPort(const std::string &if_name, PortID idx)
+{
+ if (if_name == "master") {
+ if (idx >= static_cast<PortID>(memSidePort.size())) {
+ panic("TLBCoalescer::getMasterPort: unknown index %d\n", idx);
+ }
+
+ return *memSidePort[idx];
+ } else {
+ panic("TLBCoalescer::getMasterPort: unknown port %s\n", if_name);
+ }
+}
+
+/*
+ * This method returns true if the <incoming_pkt>
+ * can be coalesced with <coalesced_pkt> and false otherwise.
+ * A given set of rules is checked.
+ * The rules can potentially be modified based on the TLB level.
+ */
+bool
+TLBCoalescer::canCoalesce(PacketPtr incoming_pkt, PacketPtr coalesced_pkt)
+{
+ if (disableCoalescing)
+ return false;
+
+ TheISA::GpuTLB::TranslationState *incoming_state =
+ safe_cast<TheISA::GpuTLB::TranslationState*>(incoming_pkt->senderState);
+
+ TheISA::GpuTLB::TranslationState *coalesced_state =
+ safe_cast<TheISA::GpuTLB::TranslationState*>(coalesced_pkt->senderState);
+
+ // Rule 1: Coalesce requests only if they
+ // fall within the same virtual page
+ Addr incoming_virt_page_addr = roundDown(incoming_pkt->req->getVaddr(),
+ TheISA::PageBytes);
+
+ Addr coalesced_virt_page_addr = roundDown(coalesced_pkt->req->getVaddr(),
+ TheISA::PageBytes);
+
+ if (incoming_virt_page_addr != coalesced_virt_page_addr)
+ return false;
+
+    // Rule 2: Coalesce requests only if they
+    // share a TLB mode, i.e., they are both read
+    // or both write requests.
+ BaseTLB::Mode incoming_mode = incoming_state->tlbMode;
+ BaseTLB::Mode coalesced_mode = coalesced_state->tlbMode;
+
+ if (incoming_mode != coalesced_mode)
+ return false;
+
+ // when we can coalesce a packet update the reqCnt
+ // that is the number of packets represented by
+ // this coalesced packet
+ if (!incoming_state->prefetch)
+ coalesced_state->reqCnt.back() += incoming_state->reqCnt.back();
+
+ return true;
+}
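
Annotation: both rules above boil down to cheap comparisons against the lead packet of a coalesced group. A standalone, hedged sketch of the eligibility test (simplified stand-in types; the real code also accumulates reqCnt as shown above):

    #include <cstdint>

    enum class TlbMode { Read, Write };

    struct ReqSketch
    {
        uint64_t vaddr;  // virtual address of the translation request
        TlbMode mode;    // read or write translation
    };

    // A request may join an existing coalesced group only if it touches the
    // same virtual page and has the same TLB mode (Rules 1 and 2 above).
    // pageBytes is assumed to be a power of two.
    static bool
    canCoalesceSketch(const ReqSketch &incoming, const ReqSketch &lead,
                      uint64_t pageBytes, bool coalescingDisabled)
    {
        if (coalescingDisabled)
            return false;

        const uint64_t pageMask = ~(pageBytes - 1);
        if ((incoming.vaddr & pageMask) != (lead.vaddr & pageMask))
            return false;

        return incoming.mode == lead.mode;
    }
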
+
+/*
+ * We need to update the physical addresses of all the translation requests
+ * that were coalesced into the one that just returned.
+ */
+void
+TLBCoalescer::updatePhysAddresses(PacketPtr pkt)
+{
+ Addr virt_page_addr = roundDown(pkt->req->getVaddr(), TheISA::PageBytes);
+
+ DPRINTF(GPUTLB, "Update phys. addr. for %d coalesced reqs for page %#x\n",
+ issuedTranslationsTable[virt_page_addr].size(), virt_page_addr);
+
+ TheISA::GpuTLB::TranslationState *sender_state =
+ safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
+
+ TheISA::GpuTlbEntry *tlb_entry = sender_state->tlbEntry;
+ assert(tlb_entry);
+ Addr first_entry_vaddr = tlb_entry->vaddr;
+ Addr first_entry_paddr = tlb_entry->paddr;
+ int page_size = tlb_entry->size();
+ bool uncacheable = tlb_entry->uncacheable;
+ int first_hit_level = sender_state->hitLevel;
+ bool valid = tlb_entry->valid;
+
+ // Get the physical page address of the translated request
+ // Using the page_size specified in the TLBEntry allows us
+ // to support different page sizes.
+ Addr phys_page_paddr = pkt->req->getPaddr();
+ phys_page_paddr &= ~(page_size - 1);
+
+ for (int i = 0; i < issuedTranslationsTable[virt_page_addr].size(); ++i) {
+ PacketPtr local_pkt = issuedTranslationsTable[virt_page_addr][i];
+ TheISA::GpuTLB::TranslationState *sender_state =
+ safe_cast<TheISA::GpuTLB::TranslationState*>(
+ local_pkt->senderState);
+
+ // we are sending the packet back, so pop the reqCnt associated
+        // with this level in the TLB hierarchy
+ if (!sender_state->prefetch)
+ sender_state->reqCnt.pop_back();
+
+ /*
+ * Only the first packet from this coalesced request has been
+ * translated. Grab the translated phys. page addr and update the
+ * physical addresses of the remaining packets with the appropriate
+ * page offsets.
+ */
+ if (i) {
+ Addr paddr = phys_page_paddr;
+ paddr |= (local_pkt->req->getVaddr() & (page_size - 1));
+ local_pkt->req->setPaddr(paddr);
+
+ if (uncacheable)
+ local_pkt->req->setFlags(Request::UNCACHEABLE);
+
+ // update senderState->tlbEntry, so we can insert
+            // the correct TLBEntry in the TLBs above.
+ sender_state->tlbEntry =
+ new TheISA::GpuTlbEntry(0, first_entry_vaddr, first_entry_paddr,
+ valid);
+
+ // update the hitLevel for all uncoalesced reqs
+ // so that each packet knows where it hit
+ // (used for statistics in the CUs)
+ sender_state->hitLevel = first_hit_level;
+ }
+
+ SlavePort *return_port = sender_state->ports.back();
+ sender_state->ports.pop_back();
+
+ // Translation is done - Convert to a response pkt if necessary and
+ // send the translation back
+ if (local_pkt->isRequest()) {
+ local_pkt->makeTimingResponse();
+ }
+
+ return_port->sendTimingResp(local_pkt);
+ }
+
+ // schedule clean up for end of this cycle
+ // This is a maximum priority event and must be on
+ // the same cycle as GPUTLB cleanup event to prevent
+ // race conditions with an IssueProbeEvent caused by
+ // MemSidePort::recvReqRetry
+ cleanupQueue.push(virt_page_addr);
+
+ if (!cleanupEvent.scheduled())
+ schedule(cleanupEvent, curTick());
+}
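
Annotation: only the lead packet of a coalesced group carried a translation, so every other packet receives the translated physical page number combined with its own page offset. A hedged worked example with made-up addresses: with a 4 KiB page whose translated physical page address is 0x40000000, a coalesced packet whose virtual address ends in offset 0xa74 keeps that offset and is assigned physical address 0x40000a74.
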
+
+// Receive translation requests, create a coalesced request,
+// and send them to the TLB (TLBProbesPerCycle)
+bool
+TLBCoalescer::CpuSidePort::recvTimingReq(PacketPtr pkt)
+{
+ // first packet of a coalesced request
+ PacketPtr first_packet = nullptr;
+ // true if we are able to do coalescing
+ bool didCoalesce = false;
+ // number of coalesced reqs for a given window
+ int coalescedReq_cnt = 0;
+
+ TheISA::GpuTLB::TranslationState *sender_state =
+ safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
+
+ // push back the port to remember the path back
+ sender_state->ports.push_back(this);
+
+ bool update_stats = !sender_state->prefetch;
+
+ if (update_stats) {
+ // if reqCnt is empty then this packet does not represent
+        // multiple uncoalesced reqs (pkts) but just a single pkt.
+ // If it does though then the reqCnt for each level in the
+ // hierarchy accumulates the total number of reqs this packet
+ // represents
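+ // E.g. (hypothetical values), a packet that already carries
+ // reqCnt = {4} represents four uncoalesced CU requests, so 4 is
+ // pushed again for this coalescer level.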
+ int req_cnt = 1;
+
+ if (!sender_state->reqCnt.empty())
+ req_cnt = sender_state->reqCnt.back();
+
+ sender_state->reqCnt.push_back(req_cnt);
+
+ // update statistics
+ coalescer->uncoalescedAccesses++;
+ req_cnt = sender_state->reqCnt.back();
+ DPRINTF(GPUTLB, "receiving pkt w/ req_cnt %d\n", req_cnt);
+ coalescer->queuingCycles -= (curTick() * req_cnt);
+ coalescer->localqueuingCycles -= curTick();
+ }
+
+ // FIXME if you want to coalesce not based on the issueTime
+ // of the packets (i.e., from the compute unit's perspective)
+ // but based on when they reached this coalescer then
+ // remove the following if statement and use curTick() or
+ // coalescingWindow for the tick_index.
+ if (!sender_state->issueTime)
+ sender_state->issueTime = curTick();
+
+ // The tick index is used as a key to the coalescerFIFO hashmap.
+ // It is shared by all candidates that fall within the
+ // given coalescingWindow.
+ int64_t tick_index = sender_state->issueTime / coalescer->coalescingWindow;
+
+ if (coalescer->coalescerFIFO.count(tick_index)) {
+ coalescedReq_cnt = coalescer->coalescerFIFO[tick_index].size();
+ }
+
+ // see if we can coalesce the incoming pkt with another
+ // coalesced request with the same tick_index
+ for (int i = 0; i < coalescedReq_cnt; ++i) {
+ first_packet = coalescer->coalescerFIFO[tick_index][i][0];
+
+ if (coalescer->canCoalesce(pkt, first_packet)) {
+ coalescer->coalescerFIFO[tick_index][i].push_back(pkt);
+
+ DPRINTF(GPUTLB, "Coalesced req %i w/ tick_index %d has %d reqs\n",
+ i, tick_index,
+ coalescer->coalescerFIFO[tick_index][i].size());
+
+ didCoalesce = true;
+ break;
+ }
+ }
+
+ // if this is the first request for this tick_index
+ // or we did not manage to coalesce, update stats
+ // and make necessary allocations.
+ if (!coalescedReq_cnt || !didCoalesce) {
+ if (update_stats)
+ coalescer->coalescedAccesses++;
+
+ std::vector<PacketPtr> new_array;
+ new_array.push_back(pkt);
+ coalescer->coalescerFIFO[tick_index].push_back(new_array);
+
+ DPRINTF(GPUTLB, "coalescerFIFO[%d] now has %d coalesced reqs after "
+ "push\n", tick_index,
+ coalescer->coalescerFIFO[tick_index].size());
+ }
+
+ // schedule probeTLBEvent next cycle to send the
+ // coalesced requests to the TLB
+ if (!coalescer->probeTLBEvent.scheduled()) {
+ coalescer->schedule(coalescer->probeTLBEvent,
+ curTick() + coalescer->ticks(1));
+ }
+
+ return true;
+}
+
+void
+TLBCoalescer::CpuSidePort::recvReqRetry()
+{
+ assert(false);
+}
+
+void
+TLBCoalescer::CpuSidePort::recvFunctional(PacketPtr pkt)
+{
+ TheISA::GpuTLB::TranslationState *sender_state =
+ safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
+
+ bool update_stats = !sender_state->prefetch;
+
+ if (update_stats)
+ coalescer->uncoalescedAccesses++;
+
+ // If there is a pending timing request for this virtual address
+ // print a warning message. This is a temporary caveat of
+ // the current simulator where atomic and timing requests can
+ // coexist. FIXME remove this check/warning in the future.
+ Addr virt_page_addr = roundDown(pkt->req->getVaddr(), TheISA::PageBytes);
+ int map_count = coalescer->issuedTranslationsTable.count(virt_page_addr);
+
+ if (map_count) {
+ DPRINTF(GPUTLB, "Warning! Functional access to addr %#x sees timing "
+ "req. pending\n", virt_page_addr);
+ }
+
+ coalescer->memSidePort[0]->sendFunctional(pkt);
+}
+
+AddrRangeList
+TLBCoalescer::CpuSidePort::getAddrRanges() const
+{
+ // currently not checked by the master
+ AddrRangeList ranges;
+
+ return ranges;
+}
+
+bool
+TLBCoalescer::MemSidePort::recvTimingResp(PacketPtr pkt)
+{
+ // a translation completed and returned
+ coalescer->updatePhysAddresses(pkt);
+
+ return true;
+}
+
+void
+TLBCoalescer::MemSidePort::recvReqRetry()
+{
+ // we've received a retry. Schedule a probeTLBEvent
+ if (!coalescer->probeTLBEvent.scheduled())
+ coalescer->schedule(coalescer->probeTLBEvent,
+ curTick() + coalescer->ticks(1));
+}
+
+void
+TLBCoalescer::MemSidePort::recvFunctional(PacketPtr pkt)
+{
+ fatal("Memory side recvFunctional() not implemented in TLB coalescer.\n");
+}
+
+TLBCoalescer::IssueProbeEvent::IssueProbeEvent(TLBCoalescer * _coalescer)
+ : Event(CPU_Tick_Pri), coalescer(_coalescer)
+{
+}
+
+const char*
+TLBCoalescer::IssueProbeEvent::description() const
+{
+ return "Probe the TLB below";
+}
+
+/*
+ * Here we scan the coalescer FIFO and issue the max
+ * number of permitted probes to the TLB below. We
+ * permit bypassing of coalesced requests for the same
+ * tick_index.
+ *
+ * We do not access the next tick_index unless we've
+ * drained the previous one. The coalesced requests
+ * that are successfully sent are moved to the
+ * issuedTranslationsTable (the table which keeps
+ * track of the outstanding reqs).
+ */
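+// For instance, with TLBProbesPerCycle = 2 and three coalesced requests
+// buffered under the oldest tick_index, at most two are sent per
+// invocation of this event; the rest stay in coalescerFIFO until the
+// event is scheduled again (the numbers here are illustrative only).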
+void
+TLBCoalescer::IssueProbeEvent::process()
+{
+ // number of TLB probes sent so far
+ int sent_probes = 0;
+ // rejected denotes a blocking event. It is set to true either when the
+ // recvTimingReq of the TLB below returns false or when there is
+ // another outstanding request for the same virt. page.
+ bool rejected = false;
+
+ DPRINTF(GPUTLB, "triggered TLBCoalescer IssueProbeEvent\n");
+
+ for (auto iter = coalescer->coalescerFIFO.begin();
+ iter != coalescer->coalescerFIFO.end() && !rejected; ) {
+ int coalescedReq_cnt = iter->second.size();
+ int i = 0;
+ int vector_index = 0;
+
+ DPRINTF(GPUTLB, "coalescedReq_cnt is %d for tick_index %d\n",
+ coalescedReq_cnt, iter->first);
+
+ while (i < coalescedReq_cnt) {
+ ++i;
+ PacketPtr first_packet = iter->second[vector_index][0];
+
+ // compute virtual page address for this request
+ Addr virt_page_addr = roundDown(first_packet->req->getVaddr(),
+ TheISA::PageBytes);
+
+ // is there another outstanding request for the same page addr?
+ int pending_reqs =
+ coalescer->issuedTranslationsTable.count(virt_page_addr);
+
+ if (pending_reqs) {
+ DPRINTF(GPUTLB, "Cannot issue - There are pending reqs for "
+ "page %#x\n", virt_page_addr);
+
+ ++vector_index;
+ rejected = true;
+
+ continue;
+ }
+
+ // send the coalesced request for virt_page_addr
+ if (!coalescer->memSidePort[0]->sendTimingReq(first_packet)) {
+ DPRINTF(GPUTLB, "Failed to send TLB request for page %#x",
+ virt_page_addr);
+
+ // No need for a retries queue since we are already buffering
+ // the coalesced request in coalescerFIFO.
+ rejected = true;
+ ++vector_index;
+ } else {
+ TheISA::GpuTLB::TranslationState *tmp_sender_state =
+ safe_cast<TheISA::GpuTLB::TranslationState*>
+ (first_packet->senderState);
+
+ bool update_stats = !tmp_sender_state->prefetch;
+
+ if (update_stats) {
+ // req_cnt is total number of packets represented
+ // by the one we just sent counting all the way from
+ // the top of TLB hiearchy (i.e., from the CU)
+ int req_cnt = tmp_sender_state->reqCnt.back();
+ coalescer->queuingCycles += (curTick() * req_cnt);
+
+ DPRINTF(GPUTLB, "%s sending pkt w/ req_cnt %d\n",
+ coalescer->name(), req_cnt);
+
+ // pkt_cnt is number of packets we coalesced into the one
+ // we just sent but only at this coalescer level
+ int pkt_cnt = iter->second[vector_index].size();
+ coalescer->localqueuingCycles += (curTick() * pkt_cnt);
+ }
+
+ DPRINTF(GPUTLB, "Successfully sent TLB request for page %#x",
+ virt_page_addr);
+
+ // copy coalescedReq to issuedTranslationsTable
+ coalescer->issuedTranslationsTable[virt_page_addr]
+ = iter->second[vector_index];
+
+ // erase the entry of this coalesced req
+ iter->second.erase(iter->second.begin() + vector_index);
+
+ if (iter->second.empty())
+ assert(i == coalescedReq_cnt);
+
+ sent_probes++;
+ if (sent_probes == coalescer->TLBProbesPerCycle)
+ return;
+ }
+ }
+
+ // if there are no more coalesced reqs for this tick_index
+ // erase the hash_map entry using the current iterator
+ if (iter->second.empty()) {
+ coalescer->coalescerFIFO.erase(iter++);
+ } else {
+ ++iter;
+ }
+ }
+}
+
+TLBCoalescer::CleanupEvent::CleanupEvent(TLBCoalescer* _coalescer)
+ : Event(Maximum_Pri), coalescer(_coalescer)
+{
+}
+
+const char*
+TLBCoalescer::CleanupEvent::description() const
+{
+ return "Cleanup issuedTranslationsTable hashmap";
+}
+
+void
+TLBCoalescer::CleanupEvent::process()
+{
+ while (!coalescer->cleanupQueue.empty()) {
+ Addr cleanup_addr = coalescer->cleanupQueue.front();
+ coalescer->cleanupQueue.pop();
+ coalescer->issuedTranslationsTable.erase(cleanup_addr);
+
+ DPRINTF(GPUTLB, "Cleanup - Delete coalescer entry with key %#x\n",
+ cleanup_addr);
+ }
+}
+
+void
+TLBCoalescer::regStats()
+{
+ uncoalescedAccesses
+ .name(name() + ".uncoalesced_accesses")
+ .desc("Number of uncoalesced TLB accesses")
+ ;
+
+ coalescedAccesses
+ .name(name() + ".coalesced_accesses")
+ .desc("Number of coalesced TLB accesses")
+ ;
+
+ queuingCycles
+ .name(name() + ".queuing_cycles")
+ .desc("Number of cycles spent in queue")
+ ;
+
+ localqueuingCycles
+ .name(name() + ".local_queuing_cycles")
+ .desc("Number of cycles spent in queue for all incoming reqs")
+ ;
+
+ localLatency
+ .name(name() + ".local_latency")
+ .desc("Avg. latency over all incoming pkts")
+ ;
+
+ localLatency = localqueuingCycles / uncoalescedAccesses;
+}
+
+
+TLBCoalescer*
+TLBCoalescerParams::create()
+{
+ return new TLBCoalescer(this);
+}
+
diff --git a/src/gpu-compute/tlb_coalescer.hh b/src/gpu-compute/tlb_coalescer.hh
new file mode 100644
index 000000000..09210148b
--- /dev/null
+++ b/src/gpu-compute/tlb_coalescer.hh
@@ -0,0 +1,252 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Lisa Hsu
+ */
+
+#ifndef __TLB_COALESCER_HH__
+#define __TLB_COALESCER_HH__
+
+#include <list>
+#include <queue>
+#include <string>
+#include <vector>
+
+#include "arch/generic/tlb.hh"
+#include "arch/isa.hh"
+#include "arch/isa_traits.hh"
+#include "arch/x86/pagetable.hh"
+#include "arch/x86/regs/segment.hh"
+#include "base/misc.hh"
+#include "base/statistics.hh"
+#include "gpu-compute/gpu_tlb.hh"
+#include "mem/mem_object.hh"
+#include "mem/port.hh"
+#include "mem/request.hh"
+#include "params/TLBCoalescer.hh"
+
+class BaseTLB;
+class Packet;
+class ThreadContext;
+
+/**
+ * The TLBCoalescer is a MemObject sitting on the front side (CPUSide) of
+ * each TLB. It receives packets and issues coalesced requests to the
+ * TLB below it. It controls how requests are coalesced (the rules)
+ * and the permitted number of TLB probes per cycle (i.e., how many
+ * coalesced requests it feeds the TLB per cycle).
+ */
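+/*
+ * Rough flow of a request through this object, as implemented by the
+ * methods in tlb_coalescer.cc in this patch: CpuSidePort::recvTimingReq()
+ * buffers the packet in coalescerFIFO; IssueProbeEvent::process() sends
+ * coalesced requests to the TLB below; MemSidePort::recvTimingResp() calls
+ * updatePhysAddresses(), which fans the translation back out to the
+ * original slave ports; finally, CleanupEvent::process() erases the entry
+ * from issuedTranslationsTable.
+ */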
+class TLBCoalescer : public MemObject
+{
+ protected:
+ // TLB clock: will inherit clock from shader's clock period in terms
+ // of number of ticks of curTick() (i.e., the global simulation clock).
+ // The assignment of TLB clock from shader clock is done in the
+ // python config files.
+ int clock;
+
+ public:
+ typedef TLBCoalescerParams Params;
+ TLBCoalescer(const Params *p);
+ ~TLBCoalescer() { }
+
+ // Number of TLB probes per cycle. Parameterizable - default 2.
+ int TLBProbesPerCycle;
+
+ // Consider coalescing across that many ticks.
+ // Parameterizable - default 1.
+ int coalescingWindow;
+
+ // Each coalesced request consists of multiple packets
+ // that all fall within the same virtual page
+ typedef std::vector<PacketPtr> coalescedReq;
+
+ // disables coalescing when true
+ bool disableCoalescing;
+
+ /*
+ * This is a hash map with <tick_index> as a key.
+ * It contains a vector of coalescedReqs per <tick_index>.
+ * Requests are buffered here until they can be issued to
+ * the TLB, at which point they are copied to the
+ * issuedTranslationsTable hash map.
+ *
+ * In terms of coalescing, we coalesce requests in a given
+ * window of x cycles by using tick_index = issueTime/x as a
+ * key, where x = coalescingWindow. issueTime is the issueTime
+ * of the pkt from the ComputeUnit's perspective, but another
+ * option is to change it to curTick(), so we coalesce based
+ * on the receive time.
+ */
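+ /*
+ * For example (illustrative numbers only): if coalescingWindow were
+ * 4 ticks, packets with issueTime 100, 101 and 103 would all map to
+ * tick_index = 100 / 4 = 25 and become coalescing candidates, while a
+ * packet issued at tick 104 would start a new entry at index 26.
+ */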
+ typedef std::unordered_map<int64_t, std::vector<coalescedReq>> CoalescingFIFO;
+
+ CoalescingFIFO coalescerFIFO;
+
+ /*
+ * issuedTranslationsTable: a hash_map indexed by virtual page
+ * address. Each hash_map entry has a vector of PacketPtr associated
+ * with it denoting the different packets that share an outstanding
+ * coalesced translation request for the same virtual page.
+ *
+ * The rules that determine which requests we can coalesce are
+ * specified in the canCoalesce() method.
+ */
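+ /*
+ * For example, once a probe for virtual page 0x7f000 has been sent,
+ * issuedTranslationsTable[0x7f000] holds every packet of that coalesced
+ * request until updatePhysAddresses() responds to them and the
+ * CleanupEvent erases the entry (the address is illustrative only).
+ */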
+ typedef std::unordered_map<Addr, coalescedReq> CoalescingTable;
+
+ CoalescingTable issuedTranslationsTable;
+
+ // number of packets the coalescer receives
+ Stats::Scalar uncoalescedAccesses;
+ // number packets the coalescer send to the TLB
+ Stats::Scalar coalescedAccesses;
+
+ // Number of cycles the coalesced requests spend waiting in
+ // coalescerFIFO. For each packet the coalescer receives we take into
+ // account the number of all uncoalesced requests this pkt "represents"
+ Stats::Scalar queuingCycles;
+
+ // On average, how much time does a request counted in
+ // uncoalescedAccesses spend waiting before it reaches
+ // the TLB?
+ Stats::Scalar localqueuingCycles;
+ // localqueuingCycles/uncoalescedAccesses
+ Stats::Formula localLatency;
+
+ bool canCoalesce(PacketPtr pkt1, PacketPtr pkt2);
+ void updatePhysAddresses(PacketPtr pkt);
+ void regStats();
+
+ // Clock related functions. Maps to-and-from
+ // Simulation ticks and object clocks.
+ Tick frequency() const { return SimClock::Frequency / clock; }
+ Tick ticks(int numCycles) const { return (Tick)clock * numCycles; }
+ Tick curCycle() const { return curTick() / clock; }
+ Tick tickToCycles(Tick val) const { return val / clock; }
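+ // E.g., if clock were 500 ticks per cycle, ticks(4) would return 2000
+ // and tickToCycles(2000) would return 4 (values are illustrative only).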
+
+ class CpuSidePort : public SlavePort
+ {
+ public:
+ CpuSidePort(const std::string &_name, TLBCoalescer *tlb_coalescer,
+ PortID _index)
+ : SlavePort(_name, tlb_coalescer), coalescer(tlb_coalescer),
+ index(_index) { }
+
+ protected:
+ TLBCoalescer *coalescer;
+ int index;
+
+ virtual bool recvTimingReq(PacketPtr pkt);
+ virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
+ virtual void recvFunctional(PacketPtr pkt);
+ virtual void recvRangeChange() { }
+ virtual void recvReqRetry();
+
+ virtual void
+ recvRespRetry()
+ {
+ fatal("recvRespRetry() is not implemented in the TLB coalescer.\n");
+ }
+
+ virtual AddrRangeList getAddrRanges() const;
+ };
+
+ class MemSidePort : public MasterPort
+ {
+ public:
+ MemSidePort(const std::string &_name, TLBCoalescer *tlb_coalescer,
+ PortID _index)
+ : MasterPort(_name, tlb_coalescer), coalescer(tlb_coalescer),
+ index(_index) { }
+
+ std::deque<PacketPtr> retries;
+
+ protected:
+ TLBCoalescer *coalescer;
+ int index;
+
+ virtual bool recvTimingResp(PacketPtr pkt);
+ virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
+ virtual void recvFunctional(PacketPtr pkt);
+ virtual void recvRangeChange() { }
+ virtual void recvReqRetry();
+
+ virtual void
+ recvRespRetry()
+ {
+ fatal("recvRespRetry() not implemented in TLB coalescer");
+ }
+ };
+
+ // Coalescer slave ports on the CPU side
+ std::vector<CpuSidePort*> cpuSidePort;
+ // Coalescer master ports on the memory side
+ std::vector<MemSidePort*> memSidePort;
+
+ BaseMasterPort& getMasterPort(const std::string &if_name, PortID idx);
+ BaseSlavePort& getSlavePort(const std::string &if_name, PortID idx);
+
+ class IssueProbeEvent : public Event
+ {
+ private:
+ TLBCoalescer *coalescer;
+
+ public:
+ IssueProbeEvent(TLBCoalescer *_coalescer);
+ void process();
+ const char *description() const;
+ };
+
+ // this event issues the TLB probes
+ IssueProbeEvent probeTLBEvent;
+
+ // the cleanupEvent is scheduled after a TLBEvent triggers
+ // in order to free memory and do the required clean-up
+ class CleanupEvent : public Event
+ {
+ private:
+ TLBCoalescer *coalescer;
+
+ public:
+ CleanupEvent(TLBCoalescer *_coalescer);
+ void process();
+ const char* description() const;
+ };
+
+ // schedule cleanup
+ CleanupEvent cleanupEvent;
+
+ // this FIFO queue keeps track of the virt. page
+ // addresses that are pending cleanup
+ std::queue<Addr> cleanupQueue;
+};
+
+#endif // __TLB_COALESCER_HH__
diff --git a/src/gpu-compute/vector_register_file.cc b/src/gpu-compute/vector_register_file.cc
new file mode 100644
index 000000000..8b7dc0691
--- /dev/null
+++ b/src/gpu-compute/vector_register_file.cc
@@ -0,0 +1,251 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos
+ */
+
+#include "gpu-compute/vector_register_file.hh"
+
+#include <string>
+
+#include "base/misc.hh"
+#include "gpu-compute/code_enums.hh"
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/shader.hh"
+#include "gpu-compute/simple_pool_manager.hh"
+#include "gpu-compute/wavefront.hh"
+#include "params/VectorRegisterFile.hh"
+
+VectorRegisterFile::VectorRegisterFile(const VectorRegisterFileParams *p)
+ : SimObject(p),
+ manager(new SimplePoolManager(p->min_alloc, p->num_regs_per_simd)),
+ simdId(p->simd_id), numRegsPerSimd(p->num_regs_per_simd),
+ vgprState(new VecRegisterState())
+{
+ fatal_if(numRegsPerSimd % 2, "VRF size is illegal\n");
+ fatal_if(simdId < 0, "Illegal SIMD id for VRF");
+
+ fatal_if(numRegsPerSimd % p->min_alloc, "Min VGPR region allocation is not "
+ "multiple of VRF size\n");
+
+ busy.clear();
+ busy.resize(numRegsPerSimd, 0);
+ nxtBusy.clear();
+ nxtBusy.resize(numRegsPerSimd, 0);
+
+ vgprState->init(numRegsPerSimd);
+}
+
+void
+VectorRegisterFile::setParent(ComputeUnit *_computeUnit)
+{
+ computeUnit = _computeUnit;
+ vgprState->setParent(computeUnit);
+}
+
+uint8_t
+VectorRegisterFile::regNxtBusy(int idx, uint32_t operandSize) const
+{
+ uint8_t status = nxtBusy.at(idx);
+
+ if (operandSize > 4) {
+ status = status | (nxtBusy.at((idx + 1) % numRegs()));
+ }
+
+ return status;
+}
+
+uint8_t
+VectorRegisterFile::regBusy(int idx, uint32_t operandSize) const
+{
+ uint8_t status = busy.at(idx);
+
+ if (operandSize > 4) {
+ status = status | (busy.at((idx + 1) % numRegs()));
+ }
+
+ return status;
+}
+
+void
+VectorRegisterFile::preMarkReg(int regIdx, uint32_t operandSize, uint8_t value)
+{
+ nxtBusy.at(regIdx) = value;
+
+ if (operandSize > 4) {
+ nxtBusy.at((regIdx + 1) % numRegs()) = value;
+ }
+}
+
+void
+VectorRegisterFile::markReg(int regIdx, uint32_t operandSize, uint8_t value)
+{
+ busy.at(regIdx) = value;
+
+ if (operandSize > 4) {
+ busy.at((regIdx + 1) % numRegs()) = value;
+ }
+}
+
+bool
+VectorRegisterFile::operandsReady(Wavefront *w, GPUDynInstPtr ii) const
+{
+ for (int i = 0; i < ii->getNumOperands(); ++i) {
+ if (ii->isVectorRegister(i)) {
+ uint32_t vgprIdx = ii->getRegisterIndex(i);
+ uint32_t pVgpr = w->remap(vgprIdx, ii->getOperandSize(i), 1);
+
+ if (regBusy(pVgpr, ii->getOperandSize(i)) == 1) {
+ if (ii->isDstOperand(i)) {
+ w->numTimesBlockedDueWAXDependencies++;
+ } else if (ii->isSrcOperand(i)) {
+ w->numTimesBlockedDueRAWDependencies++;
+ }
+
+ return false;
+ }
+
+ if (regNxtBusy(pVgpr, ii->getOperandSize(i)) == 1) {
+ if (ii->isDstOperand(i)) {
+ w->numTimesBlockedDueWAXDependencies++;
+ } else if (ii->isSrcOperand(i)) {
+ w->numTimesBlockedDueRAWDependencies++;
+ }
+
+ return false;
+ }
+ }
+ }
+
+ return true;
+}
+
+void
+VectorRegisterFile::exec(GPUDynInstPtr ii, Wavefront *w)
+{
+ bool loadInstr = IS_OT_READ(ii->opType());
+ bool atomicInstr = IS_OT_ATOMIC(ii->opType());
+
+ bool loadNoArgInstr = loadInstr && !ii->isArgLoad();
+
+ // iterate over all register destination operands
+ for (int i = 0; i < ii->getNumOperands(); ++i) {
+ if (ii->isVectorRegister(i) && ii->isDstOperand(i)) {
+ uint32_t physReg = w->remap(ii->getRegisterIndex(i),
+ ii->getOperandSize(i), 1);
+
+ // mark the destination vector register as busy
+ markReg(physReg, ii->getOperandSize(i), 1);
+ // clear the in-flight status of the destination vector register
+ preMarkReg(physReg, ii->getOperandSize(i), 0);
+
+ // FIXME: if we ever model correct timing behavior
+ // for load argument instructions then we should not
+ // set the destination register as busy now but when
+ // the data returns. Loads and Atomics should free
+ // their destination registers when the data returns,
+ // not now
+ if (!atomicInstr && !loadNoArgInstr) {
+ uint32_t pipeLen = ii->getOperandSize(i) <= 4 ?
+ computeUnit->spBypassLength() :
+ computeUnit->dpBypassLength();
+
+ // schedule an event for marking the register as ready
+ computeUnit->registerEvent(w->simdId, physReg,
+ ii->getOperandSize(i),
+ computeUnit->shader->tick_cnt +
+ computeUnit->shader->ticks(pipeLen),
+ 0);
+ }
+ }
+ }
+}
+
+int
+VectorRegisterFile::exec(uint64_t dynamic_id, Wavefront *w,
+ std::vector<uint32_t> &regVec, uint32_t operandSize,
+ uint64_t timestamp)
+{
+ int delay = 0;
+
+ panic_if(regVec.size() <= 0, "Illegal VGPR vector size=%d\n",
+ regVec.size());
+
+ for (int i = 0; i < regVec.size(); ++i) {
+ // mark the destination VGPR as free when the timestamp expires
+ computeUnit->registerEvent(w->simdId, regVec[i], operandSize,
+ computeUnit->shader->tick_cnt + timestamp +
+ computeUnit->shader->ticks(delay), 0);
+ }
+
+ return delay;
+}
+
+void
+VectorRegisterFile::updateResources(Wavefront *w, GPUDynInstPtr ii)
+{
+ // iterate over all register destination operands
+ for (int i = 0; i < ii->getNumOperands(); ++i) {
+ if (ii->isVectorRegister(i) && ii->isDstOperand(i)) {
+ uint32_t physReg = w->remap(ii->getRegisterIndex(i),
+ ii->getOperandSize(i), 1);
+ // set the in-flight status of the destination vector register
+ preMarkReg(physReg, ii->getOperandSize(i), 1);
+ }
+ }
+}
+
+bool
+VectorRegisterFile::vrfOperandAccessReady(uint64_t dynamic_id, Wavefront *w,
+ GPUDynInstPtr ii,
+ VrfAccessType accessType)
+{
+ bool ready = true;
+
+ return ready;
+}
+
+bool
+VectorRegisterFile::vrfOperandAccessReady(Wavefront *w, GPUDynInstPtr ii,
+ VrfAccessType accessType)
+{
+ bool ready = true;
+
+ return ready;
+}
+
+VectorRegisterFile*
+VectorRegisterFileParams::create()
+{
+ return new VectorRegisterFile(this);
+}
diff --git a/src/gpu-compute/vector_register_file.hh b/src/gpu-compute/vector_register_file.hh
new file mode 100644
index 000000000..1cb011a1e
--- /dev/null
+++ b/src/gpu-compute/vector_register_file.hh
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos
+ */
+
+#ifndef __VECTOR_REGISTER_FILE_HH__
+#define __VECTOR_REGISTER_FILE_HH__
+
+#include <list>
+
+#include "base/statistics.hh"
+#include "base/types.hh"
+#include "gpu-compute/vector_register_state.hh"
+#include "sim/sim_object.hh"
+
+class ComputeUnit;
+class Shader;
+class SimplePoolManager;
+class Wavefront;
+
+struct VectorRegisterFileParams;
+
+enum class VrfAccessType : uint8_t
+{
+ READ = 0x01,
+ WRITE = 0x02,
+ RD_WR = READ | WRITE
+};
+
+// Vector Register File
+class VectorRegisterFile : public SimObject
+{
+ public:
+ VectorRegisterFile(const VectorRegisterFileParams *p);
+
+ void setParent(ComputeUnit *_computeUnit);
+
+ // Read a register
+ template<typename T>
+ T
+ read(int regIdx, int threadId=0)
+ {
+ T p0 = vgprState->read<T>(regIdx, threadId);
+
+ return p0;
+ }
+
+ // Write a register
+ template<typename T>
+ void
+ write(int regIdx, T value, int threadId=0)
+ {
+ vgprState->write<T>(regIdx, value, threadId);
+ }
+
+ uint8_t regBusy(int idx, uint32_t operandSize) const;
+ uint8_t regNxtBusy(int idx, uint32_t operandSize) const;
+
+ int numRegs() const { return numRegsPerSimd; }
+
+ void markReg(int regIdx, uint32_t operandSize, uint8_t value);
+ void preMarkReg(int regIdx, uint32_t operandSize, uint8_t value);
+
+ virtual void exec(GPUDynInstPtr ii, Wavefront *w);
+
+ virtual int exec(uint64_t dynamic_id, Wavefront *w,
+ std::vector<uint32_t> &regVec, uint32_t operandSize,
+ uint64_t timestamp);
+
+ bool operandsReady(Wavefront *w, GPUDynInstPtr ii) const;
+ virtual void updateEvents() { }
+ virtual void updateResources(Wavefront *w, GPUDynInstPtr ii);
+
+ virtual bool
+ isReadConflict(int memWfId, int exeWfId) const
+ {
+ return false;
+ }
+
+ virtual bool
+ isWriteConflict(int memWfId, int exeWfId) const
+ {
+ return false;
+ }
+
+ virtual bool vrfOperandAccessReady(uint64_t dynamic_id, Wavefront *w,
+ GPUDynInstPtr ii,
+ VrfAccessType accessType);
+
+ virtual bool vrfOperandAccessReady(Wavefront *w, GPUDynInstPtr ii,
+ VrfAccessType accessType);
+
+ SimplePoolManager *manager;
+
+ protected:
+ ComputeUnit* computeUnit;
+ int simdId;
+
+ // flag indicating if a register is busy
+ std::vector<uint8_t> busy;
+ // flag indicating if a register will be busy (by instructions
+ // in the SIMD pipeline)
+ std::vector<uint8_t> nxtBusy;
+
+ // number of registers (bank size) per SIMD unit (bank)
+ int numRegsPerSimd;
+
+ // vector register state
+ VecRegisterState *vgprState;
+};
+
+#endif // __VECTOR_REGISTER_FILE_HH__
diff --git a/src/gpu-compute/vector_register_state.cc b/src/gpu-compute/vector_register_state.cc
new file mode 100644
index 000000000..f231b0579
--- /dev/null
+++ b/src/gpu-compute/vector_register_state.cc
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos
+ */
+
+#include "gpu-compute/vector_register_state.hh"
+
+#include "gpu-compute/compute_unit.hh"
+
+VecRegisterState::VecRegisterState() : computeUnit(nullptr)
+{
+ s_reg.clear();
+ d_reg.clear();
+}
+
+void
+VecRegisterState::setParent(ComputeUnit *_computeUnit)
+{
+ computeUnit = _computeUnit;
+ _name = computeUnit->name() + ".VecRegState";
+}
+
+void
+VecRegisterState::init(uint32_t _size)
+{
+ s_reg.resize(_size);
+ d_reg.resize(_size);
+}
diff --git a/src/gpu-compute/vector_register_state.hh b/src/gpu-compute/vector_register_state.hh
new file mode 100644
index 000000000..a233b9acc
--- /dev/null
+++ b/src/gpu-compute/vector_register_state.hh
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos
+ */
+
+#ifndef __VECTOR_REGISTER_STATE_HH__
+#define __VECTOR_REGISTER_STATE_HH__
+
+#include <array>
+#include <cassert>
+#include <string>
+#include <vector>
+
+#include "gpu-compute/misc.hh"
+
+class ComputeUnit;
+
+// Vector Register State per SIMD unit (contents of the vector
+// registers in the VRF of the SIMD)
+class VecRegisterState
+{
+ public:
+ VecRegisterState();
+ void init(uint32_t _size);
+
+ const std::string& name() const { return _name; }
+ void setParent(ComputeUnit *_computeUnit);
+ void regStats() { }
+
+ // Access methods
+ template<typename T>
+ T
+ read(int regIdx, int threadId=0) {
+ T *p0;
+ assert(sizeof(T) == 4 || sizeof(T) == 8);
+ if (sizeof(T) == 4) {
+ p0 = (T*)(&s_reg[regIdx][threadId]);
+ } else {
+ p0 = (T*)(&d_reg[regIdx][threadId]);
+ }
+
+ return *p0;
+ }
+
+ template<typename T>
+ void
+ write(unsigned int regIdx, T value, int threadId=0) {
+ T *p0;
+ assert(sizeof(T) == 4 || sizeof(T) == 8);
+ if (sizeof(T) == 4) {
+ p0 = (T*)(&s_reg[regIdx][threadId]);
+ } else {
+ p0 = (T*)(&d_reg[regIdx][threadId]);
+ }
+
+ *p0 = value;
+ }
+
+ // (Single Precision) Vector Register File size.
+ int regSize() { return s_reg.size(); }
+
+ private:
+ ComputeUnit *computeUnit;
+ std::string _name;
+ // 32-bit Single Precision Vector Register State
+ std::vector<std::array<uint32_t, VSZ>> s_reg;
+ // 64-bit Double Precision Vector Register State
+ std::vector<std::array<uint64_t, VSZ>> d_reg;
+};
+
+#endif // __VECTOR_REGISTER_STATE_HH__
diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc
new file mode 100644
index 000000000..0aa033db1
--- /dev/null
+++ b/src/gpu-compute/wavefront.cc
@@ -0,0 +1,925 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Lisa Hsu
+ */
+
+#include "gpu-compute/wavefront.hh"
+
+#include "debug/GPUExec.hh"
+#include "debug/WavefrontStack.hh"
+#include "gpu-compute/code_enums.hh"
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/shader.hh"
+#include "gpu-compute/vector_register_file.hh"
+
+Wavefront*
+WavefrontParams::create()
+{
+ return new Wavefront(this);
+}
+
+Wavefront::Wavefront(const Params *p)
+ : SimObject(p), callArgMem(nullptr)
+{
+ last_trace = 0;
+ simdId = p->simdId;
+ wfSlotId = p->wf_slot_id;
+
+ status = S_STOPPED;
+ reservedVectorRegs = 0;
+ startVgprIndex = 0;
+ outstanding_reqs = 0;
+ mem_reqs_in_pipe = 0;
+ outstanding_reqs_wr_gm = 0;
+ outstanding_reqs_wr_lm = 0;
+ outstanding_reqs_rd_gm = 0;
+ outstanding_reqs_rd_lm = 0;
+ rd_lm_reqs_in_pipe = 0;
+ rd_gm_reqs_in_pipe = 0;
+ wr_lm_reqs_in_pipe = 0;
+ wr_gm_reqs_in_pipe = 0;
+
+ barrier_cnt = 0;
+ old_barrier_cnt = 0;
+ stalledAtBarrier = false;
+
+ mem_trace_busy = 0;
+ old_vgpr_tcnt = 0xffffffffffffffffll;
+ old_dgpr_tcnt = 0xffffffffffffffffll;
+
+ pendingFetch = false;
+ dropFetch = false;
+ condRegState = new ConditionRegisterState();
+ maxSpVgprs = 0;
+ maxDpVgprs = 0;
+}
+
+void
+Wavefront::regStats()
+{
+ srcRegOpDist
+ .init(0, 4, 2)
+ .name(name() + ".src_reg_operand_dist")
+ .desc("number of executed instructions with N source register operands")
+ ;
+
+ dstRegOpDist
+ .init(0, 3, 2)
+ .name(name() + ".dst_reg_operand_dist")
+ .desc("number of executed instructions with N destination register "
+ "operands")
+ ;
+
+ // FIXME: the name of the WF needs to be unique
+ numTimesBlockedDueWAXDependencies
+ .name(name() + ".timesBlockedDueWAXDependencies")
+ .desc("number of times the wf's instructions are blocked due to WAW "
+ "or WAR dependencies")
+ ;
+
+ // FIXME: the name of the WF needs to be unique
+ numTimesBlockedDueRAWDependencies
+ .name(name() + ".timesBlockedDueRAWDependencies")
+ .desc("number of times the wf's instructions are blocked due to RAW "
+ "dependencies")
+ ;
+
+ // FIXME: the name of the WF needs to be unique
+ numTimesBlockedDueVrfPortAvail
+ .name(name() + ".timesBlockedDueVrfPortAvail")
+ .desc("number of times instructions are blocked due to VRF port "
+ "availability")
+ ;
+}
+
+void
+Wavefront::init()
+{
+ reservedVectorRegs = 0;
+ startVgprIndex = 0;
+}
+
+void
+Wavefront::resizeRegFiles(int num_cregs, int num_sregs, int num_dregs)
+{
+ condRegState->init(num_cregs);
+ maxSpVgprs = num_sregs;
+ maxDpVgprs = num_dregs;
+}
+
+Wavefront::~Wavefront()
+{
+ if (callArgMem)
+ delete callArgMem;
+}
+
+void
+Wavefront::start(uint64_t _wfDynId, uint64_t _base_ptr)
+{
+ wfDynId = _wfDynId;
+ base_ptr = _base_ptr;
+ status = S_RUNNING;
+}
+
+bool
+Wavefront::isGmInstruction(GPUDynInstPtr ii)
+{
+ if (IS_OT_READ_PM(ii->opType()) || IS_OT_WRITE_PM(ii->opType()) ||
+ IS_OT_ATOMIC_PM(ii->opType())) {
+ return true;
+ }
+
+ if (IS_OT_READ_GM(ii->opType()) || IS_OT_WRITE_GM(ii->opType()) ||
+ IS_OT_ATOMIC_GM(ii->opType())) {
+
+ return true;
+ }
+
+ if (IS_OT_FLAT(ii->opType())) {
+ return true;
+ }
+
+ return false;
+}
+
+bool
+Wavefront::isLmInstruction(GPUDynInstPtr ii)
+{
+ if (IS_OT_READ_LM(ii->opType()) || IS_OT_WRITE_LM(ii->opType()) ||
+ IS_OT_ATOMIC_LM(ii->opType())) {
+ return true;
+ }
+
+ return false;
+}
+
+bool
+Wavefront::isOldestInstALU()
+{
+ assert(!instructionBuffer.empty());
+ GPUDynInstPtr ii = instructionBuffer.front();
+
+ if (status != S_STOPPED && (ii->opType() == Enums::OT_NOP ||
+ ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH ||
+ ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) ||
+ ii->opType() == Enums::OT_KERN_READ)) {
+ return true;
+ }
+
+ return false;
+}
+
+bool
+Wavefront::isOldestInstBarrier()
+{
+ assert(!instructionBuffer.empty());
+ GPUDynInstPtr ii = instructionBuffer.front();
+
+ if (status != S_STOPPED && ii->opType() == Enums::OT_BARRIER) {
+ return true;
+ }
+
+ return false;
+}
+
+bool
+Wavefront::isOldestInstGMem()
+{
+ assert(!instructionBuffer.empty());
+ GPUDynInstPtr ii = instructionBuffer.front();
+
+ if (status != S_STOPPED && (IS_OT_READ_GM(ii->opType()) ||
+ IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()))) {
+
+ return true;
+ }
+
+ return false;
+}
+
+bool
+Wavefront::isOldestInstLMem()
+{
+ assert(!instructionBuffer.empty());
+ GPUDynInstPtr ii = instructionBuffer.front();
+
+ if (status != S_STOPPED && (IS_OT_READ_LM(ii->opType()) ||
+ IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()))) {
+
+ return true;
+ }
+
+ return false;
+}
+
+bool
+Wavefront::isOldestInstPrivMem()
+{
+ assert(!instructionBuffer.empty());
+ GPUDynInstPtr ii = instructionBuffer.front();
+
+ if (status != S_STOPPED && (IS_OT_READ_PM(ii->opType()) ||
+ IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()))) {
+
+ return true;
+ }
+
+ return false;
+}
+
+bool
+Wavefront::isOldestInstFlatMem()
+{
+ assert(!instructionBuffer.empty());
+ GPUDynInstPtr ii = instructionBuffer.front();
+
+ if (status != S_STOPPED && IS_OT_FLAT(ii->opType())) {
+
+ return true;
+ }
+
+ return false;
+}
+
+// Return true if the Wavefront's instruction
+// buffer has branch instruction.
+bool
+Wavefront::instructionBufferHasBranch()
+{
+ for (auto it : instructionBuffer) {
+ GPUDynInstPtr ii = it;
+
+ if (ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+// Remap HSAIL register to physical VGPR.
+// HSAIL register = virtual register assigned to an operand by the HLC compiler
+uint32_t
+Wavefront::remap(uint32_t vgprIndex, uint32_t size, uint8_t mode)
+{
+ assert((vgprIndex < reservedVectorRegs) && (reservedVectorRegs > 0));
+ // add the offset from where the VGPRs of the wavefront have been assigned
+ uint32_t physicalVgprIndex = startVgprIndex + vgprIndex;
+ // HSAIL double precision (DP) register: calculate the physical VGPR index
+ // assuming that DP registers are placed after SP ones in the VRF. The DP
+ // and SP VGPR name spaces in HSAIL mode are separate so we need to adjust
+ // the DP VGPR index before mapping it to the physical VRF address space
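+ // Worked example with made-up values: if startVgprIndex = 32 and
+ // maxSpVgprs = 8, a double-precision operand with vgprIndex = 3 maps to
+ // physicalVgprIndex = 32 + 8 + 2 * 3 = 46 before the modulo below
+ // (assuming the wavefront has reserved enough vector registers).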
+ if (mode == 1 && size > 4) {
+ physicalVgprIndex = startVgprIndex + maxSpVgprs + (2 * vgprIndex);
+ }
+
+ assert((startVgprIndex <= physicalVgprIndex) &&
+ (startVgprIndex + reservedVectorRegs - 1) >= physicalVgprIndex);
+
+ // calculate absolute physical VGPR index
+ return physicalVgprIndex % computeUnit->vrf[simdId]->numRegs();
+}
+
+// Return true if this wavefront is ready
+// to execute an instruction of the specified type.
+int
+Wavefront::ready(itype_e type)
+{
+ // Check to make sure wave is running
+ if (status == S_STOPPED || status == S_RETURNING ||
+ instructionBuffer.empty()) {
+ return 0;
+ }
+
+ // Is the wave waiting at a barrier
+ if (stalledAtBarrier) {
+ if (!computeUnit->AllAtBarrier(barrier_id,barrier_cnt,
+ computeUnit->getRefCounter(dispatchid, wg_id))) {
+ // Are all threads at barrier?
+ return 0;
+ }
+ old_barrier_cnt = barrier_cnt;
+ stalledAtBarrier = false;
+ }
+
+ // Read instruction
+ GPUDynInstPtr ii = instructionBuffer.front();
+
+ bool ready_inst M5_VAR_USED = false;
+ bool glbMemBusRdy = false;
+ bool glbMemIssueRdy = false;
+ if (type == I_GLOBAL || type == I_FLAT || type == I_PRIVATE) {
+ for (int j=0; j < computeUnit->numGlbMemUnits; ++j) {
+ if (computeUnit->vrfToGlobalMemPipeBus[j].prerdy())
+ glbMemBusRdy = true;
+ if (computeUnit->wfWait[j].prerdy())
+ glbMemIssueRdy = true;
+ }
+ }
+ bool locMemBusRdy = false;
+ bool locMemIssueRdy = false;
+ if (type == I_SHARED) {
+ for (int j=0; j < computeUnit->numLocMemUnits; ++j) {
+ if (computeUnit->vrfToLocalMemPipeBus[j].prerdy())
+ locMemBusRdy = true;
+ if (computeUnit->wfWait[j].prerdy())
+ locMemIssueRdy = true;
+ }
+ }
+
+ // The following code is very error prone and the entire process for
+ // checking readiness will be fixed eventually. In the meantime, let's
+ // make sure that we do not silently let an instruction type slip
+ // through this logic and always return not ready.
+ if (!(ii->opType() == Enums::OT_BARRIER || ii->opType() == Enums::OT_NOP ||
+ ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH ||
+ ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) ||
+ ii->opType() == Enums::OT_KERN_READ ||
+ ii->opType() == Enums::OT_ARG ||
+ IS_OT_READ_GM(ii->opType()) || IS_OT_WRITE_GM(ii->opType()) ||
+ IS_OT_ATOMIC_GM(ii->opType()) || IS_OT_READ_LM(ii->opType()) ||
+ IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()) ||
+ IS_OT_READ_PM(ii->opType()) || IS_OT_WRITE_PM(ii->opType()) ||
+ IS_OT_ATOMIC_PM(ii->opType()) || IS_OT_FLAT(ii->opType()))) {
+ panic("next instruction: %s is of unknown type\n", ii->disassemble());
+ }
+
+ DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Checking Read for Inst : %s\n",
+ computeUnit->cu_id, simdId, wfSlotId, ii->disassemble());
+
+ if (type == I_ALU && ii->opType() == Enums::OT_BARRIER) {
+ // Here for ALU instruction (barrier)
+ if (!computeUnit->wfWait[simdId].prerdy()) {
+ // Is wave slot free?
+ return 0;
+ }
+
+ // Are there in pipe or outstanding memory requests?
+ if ((outstanding_reqs + mem_reqs_in_pipe) > 0) {
+ return 0;
+ }
+
+ ready_inst = true;
+ } else if (type == I_ALU && ii->opType() == Enums::OT_NOP) {
+ // Here for ALU instruction (nop)
+ if (!computeUnit->wfWait[simdId].prerdy()) {
+ // Is wave slot free?
+ return 0;
+ }
+
+ ready_inst = true;
+ } else if (type == I_ALU && ii->opType() == Enums::OT_RET) {
+ // Here for ALU instruction (return)
+ if (!computeUnit->wfWait[simdId].prerdy()) {
+ // Is wave slot free?
+ return 0;
+ }
+
+ // Are there in pipe or outstanding memory requests?
+ if ((outstanding_reqs + mem_reqs_in_pipe) > 0) {
+ return 0;
+ }
+
+ ready_inst = true;
+ } else if (type == I_ALU && (ii->opType() == Enums::OT_BRANCH ||
+ ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) ||
+ ii->opType() == Enums::OT_KERN_READ ||
+ ii->opType() == Enums::OT_ARG)) {
+ // Here for ALU instruction (all others)
+ if (!computeUnit->wfWait[simdId].prerdy()) {
+ // Is alu slot free?
+ return 0;
+ }
+ if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
+ VrfAccessType::RD_WR)) {
+ return 0;
+ }
+
+ if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
+ return 0;
+ }
+ ready_inst = true;
+ } else if (type == I_GLOBAL && (IS_OT_READ_GM(ii->opType()) ||
+ IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()))) {
+ // Here Global memory instruction
+ if (IS_OT_READ_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType())) {
+ // Are there in pipe or outstanding global memory write requests?
+ if ((outstanding_reqs_wr_gm + wr_gm_reqs_in_pipe) > 0) {
+ return 0;
+ }
+ }
+
+ if (IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()) ||
+ IS_OT_HIST_GM(ii->opType())) {
+ // Are there in pipe or outstanding global memory read requests?
+ if ((outstanding_reqs_rd_gm + rd_gm_reqs_in_pipe) > 0)
+ return 0;
+ }
+
+ if (!glbMemIssueRdy) {
+ // Is WV issue slot free?
+ return 0;
+ }
+
+ if (!glbMemBusRdy) {
+ // Is there an available VRF->Global memory read bus?
+ return 0;
+ }
+
+ if (!computeUnit->globalMemoryPipe.
+ isGMReqFIFOWrRdy(rd_gm_reqs_in_pipe + wr_gm_reqs_in_pipe)) {
+ // Can we insert a new request to the Global Mem Request FIFO?
+ return 0;
+ }
+ // can we schedule source & destination operands on the VRF?
+ if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
+ VrfAccessType::RD_WR)) {
+ return 0;
+ }
+ if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
+ return 0;
+ }
+ ready_inst = true;
+ } else if (type == I_SHARED && (IS_OT_READ_LM(ii->opType()) ||
+ IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()))) {
+ // Here for Shared memory instruction
+ if (IS_OT_READ_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType())) {
+ if ((outstanding_reqs_wr_lm + wr_lm_reqs_in_pipe) > 0) {
+ return 0;
+ }
+ }
+
+ if (IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()) ||
+ IS_OT_HIST_LM(ii->opType())) {
+ if ((outstanding_reqs_rd_lm + rd_lm_reqs_in_pipe) > 0) {
+ return 0;
+ }
+ }
+
+ if (!locMemBusRdy) {
+ // Is there an available VRF->LDS read bus?
+ return 0;
+ }
+ if (!locMemIssueRdy) {
+ // Is wave slot free?
+ return 0;
+ }
+
+ if (!computeUnit->localMemoryPipe.
+ isLMReqFIFOWrRdy(rd_lm_reqs_in_pipe + wr_lm_reqs_in_pipe)) {
+ // Can we insert a new request to the LDS Request FIFO?
+ return 0;
+ }
+ // can we schedule source & destination operands on the VRF?
+ if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
+ VrfAccessType::RD_WR)) {
+ return 0;
+ }
+ if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
+ return 0;
+ }
+ ready_inst = true;
+ } else if (type == I_PRIVATE && (IS_OT_READ_PM(ii->opType()) ||
+ IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()))) {
+ // Here for Private memory instruction ------------------------ //
+ if (IS_OT_READ_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType())) {
+ if ((outstanding_reqs_wr_gm + wr_gm_reqs_in_pipe) > 0) {
+ return 0;
+ }
+ }
+
+ if (IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()) ||
+ IS_OT_HIST_PM(ii->opType())) {
+ if ((outstanding_reqs_rd_gm + rd_gm_reqs_in_pipe) > 0) {
+ return 0;
+ }
+ }
+
+ if (!glbMemBusRdy) {
+ // Is there an available VRF->Global memory read bus?
+ return 0;
+ }
+
+ if (!glbMemIssueRdy) {
+ // Is wave slot free?
+ return 0;
+ }
+
+ if (!computeUnit->globalMemoryPipe.
+ isGMReqFIFOWrRdy(rd_gm_reqs_in_pipe + wr_gm_reqs_in_pipe)) {
+ // Can we insert a new request to the Global Mem Request FIFO?
+ return 0;
+ }
+ // can we schedule source & destination operands on the VRF?
+ if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
+ VrfAccessType::RD_WR)) {
+ return 0;
+ }
+ if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
+ return 0;
+ }
+ ready_inst = true;
+ } else if (type == I_FLAT && IS_OT_FLAT(ii->opType())) {
+ if (!glbMemBusRdy) {
+ // Is there an available VRF->Global memory read bus?
+ return 0;
+ }
+
+ if (!locMemBusRdy) {
+ // Is there an available VRF->LDS read bus?
+ return 0;
+ }
+
+ if (!glbMemIssueRdy) {
+ // Is wave slot free?
+ return 0;
+ }
+
+ if (!locMemIssueRdy) {
+ return 0;
+ }
+ if (!computeUnit->globalMemoryPipe.
+ isGMReqFIFOWrRdy(rd_gm_reqs_in_pipe + wr_gm_reqs_in_pipe)) {
+ // Can we insert a new request to the Global Mem Request FIFO?
+ return 0;
+ }
+
+ if (!computeUnit->localMemoryPipe.
+ isLMReqFIFOWrRdy(rd_lm_reqs_in_pipe + wr_lm_reqs_in_pipe)) {
+ // Can we insert a new request to the LDS Request FIFO?
+ return 0;
+ }
+ // can we schedule source & destination operands on the VRF?
+ if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
+ VrfAccessType::RD_WR)) {
+ return 0;
+ }
+ // are all the operands ready? (RAW, WAW and WAR dependencies met?)
+ if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
+ return 0;
+ }
+ ready_inst = true;
+ } else {
+ return 0;
+ }
+
+ assert(ready_inst);
+
+ DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n", computeUnit->cu_id,
+ simdId, wfSlotId, ii->disassemble());
+
+ return 1;
+}
+
+void
+Wavefront::updateResources()
+{
+ // Get current instruction
+ GPUDynInstPtr ii = instructionBuffer.front();
+ assert(ii);
+ computeUnit->vrf[simdId]->updateResources(this, ii);
+ // Single precision ALU or Branch or Return or Special instruction
+ if (ii->opType() == Enums::OT_ALU || ii->opType() == Enums::OT_SPECIAL ||
+ ii->opType() == Enums::OT_BRANCH || IS_OT_LDAS(ii->opType()) ||
+ // FIXME: Kernel argument loads are currently treated as ALU operations
+ // since we don't send memory packets at execution. If we fix that then
+ // we should map them to one of the memory pipelines
+ ii->opType()==Enums::OT_KERN_READ ||
+ ii->opType()==Enums::OT_ARG ||
+ ii->opType()==Enums::OT_RET) {
+ computeUnit->aluPipe[simdId].preset(computeUnit->shader->
+ ticks(computeUnit->spBypassLength()));
+ // this is to enforce a fixed number of cycles per issue slot per SIMD
+ computeUnit->wfWait[simdId].preset(computeUnit->shader->
+ ticks(computeUnit->issuePeriod));
+ } else if (ii->opType() == Enums::OT_BARRIER) {
+ computeUnit->wfWait[simdId].preset(computeUnit->shader->
+ ticks(computeUnit->issuePeriod));
+ } else if (ii->opType() == Enums::OT_FLAT_READ) {
+ assert(Enums::SC_NONE != ii->executedAs());
+ mem_reqs_in_pipe++;
+ rd_gm_reqs_in_pipe++;
+ if (Enums::SC_SHARED == ii->executedAs()) {
+ computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
+ preset(computeUnit->shader->ticks(4));
+ computeUnit->wfWait[computeUnit->ShrMemUnitId()].
+ preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ } else {
+ computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
+ preset(computeUnit->shader->ticks(4));
+ computeUnit->wfWait[computeUnit->GlbMemUnitId()].
+ preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ }
+ } else if (ii->opType() == Enums::OT_FLAT_WRITE) {
+ assert(Enums::SC_NONE != ii->executedAs());
+ mem_reqs_in_pipe++;
+ wr_gm_reqs_in_pipe++;
+ if (Enums::SC_SHARED == ii->executedAs()) {
+ computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
+ preset(computeUnit->shader->ticks(8));
+ computeUnit->wfWait[computeUnit->ShrMemUnitId()].
+ preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ } else {
+ computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
+ preset(computeUnit->shader->ticks(8));
+ computeUnit->wfWait[computeUnit->GlbMemUnitId()].
+ preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ }
+ } else if (IS_OT_READ_GM(ii->opType())) {
+ mem_reqs_in_pipe++;
+ rd_gm_reqs_in_pipe++;
+ computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
+ preset(computeUnit->shader->ticks(4));
+ computeUnit->wfWait[computeUnit->GlbMemUnitId()].
+ preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ } else if (IS_OT_WRITE_GM(ii->opType())) {
+ mem_reqs_in_pipe++;
+ wr_gm_reqs_in_pipe++;
+ computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
+ preset(computeUnit->shader->ticks(8));
+ computeUnit->wfWait[computeUnit->GlbMemUnitId()].
+ preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ } else if (IS_OT_ATOMIC_GM(ii->opType())) {
+ mem_reqs_in_pipe++;
+ wr_gm_reqs_in_pipe++;
+ rd_gm_reqs_in_pipe++;
+ computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
+ preset(computeUnit->shader->ticks(8));
+ computeUnit->wfWait[computeUnit->GlbMemUnitId()].
+ preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ } else if (IS_OT_READ_LM(ii->opType())) {
+ mem_reqs_in_pipe++;
+ rd_lm_reqs_in_pipe++;
+ computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
+ preset(computeUnit->shader->ticks(4));
+ computeUnit->wfWait[computeUnit->ShrMemUnitId()].
+ preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ } else if (IS_OT_WRITE_LM(ii->opType())) {
+ mem_reqs_in_pipe++;
+ wr_lm_reqs_in_pipe++;
+ computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
+ preset(computeUnit->shader->ticks(8));
+ computeUnit->wfWait[computeUnit->ShrMemUnitId()].
+ preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ } else if (IS_OT_ATOMIC_LM(ii->opType())) {
+ mem_reqs_in_pipe++;
+ wr_lm_reqs_in_pipe++;
+ rd_lm_reqs_in_pipe++;
+ computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
+ preset(computeUnit->shader->ticks(8));
+ computeUnit->wfWait[computeUnit->ShrMemUnitId()].
+ preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ } else if (IS_OT_READ_PM(ii->opType())) {
+ mem_reqs_in_pipe++;
+ rd_gm_reqs_in_pipe++;
+ computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
+ preset(computeUnit->shader->ticks(4));
+ computeUnit->wfWait[computeUnit->GlbMemUnitId()].
+ preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ } else if (IS_OT_WRITE_PM(ii->opType())) {
+ mem_reqs_in_pipe++;
+ wr_gm_reqs_in_pipe++;
+ computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
+ preset(computeUnit->shader->ticks(8));
+ computeUnit->wfWait[computeUnit->GlbMemUnitId()].
+ preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ } else if (IS_OT_ATOMIC_PM(ii->opType())) {
+ mem_reqs_in_pipe++;
+ wr_gm_reqs_in_pipe++;
+ rd_gm_reqs_in_pipe++;
+ computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
+ preset(computeUnit->shader->ticks(8));
+ computeUnit->wfWait[computeUnit->GlbMemUnitId()].
+ preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ }
+}
+
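+// Execute the oldest instruction in this wavefront's instruction buffer: run
+// the instruction and its VRF accesses, advance the PC (popping the
+// reconvergence stack when the reconvergence PC is reached), sample
+// per-instruction statistics, and reserve the execution resources the
+// instruction will occupy (ALU pipe, memory pipeline buses, issue slot).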
+void
+Wavefront::exec()
+{
+ // ---- Exit if wavefront is inactive ----------------------------- //
+
+ if (status == S_STOPPED || status == S_RETURNING ||
+ instructionBuffer.empty()) {
+ return;
+ }
+
+ // Get current instruction
+
+ GPUDynInstPtr ii = instructionBuffer.front();
+
+ const uint32_t old_pc = pc();
+ DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
+ "(pc: %i)\n", computeUnit->cu_id, simdId, wfSlotId, wfDynId,
+ ii->disassemble(), old_pc);
+ ii->execute();
+ // access the VRF
+ computeUnit->vrf[simdId]->exec(ii, this);
+ srcRegOpDist.sample(ii->numSrcRegOperands());
+ dstRegOpDist.sample(ii->numDstRegOperands());
+ computeUnit->numInstrExecuted++;
+ computeUnit->execRateDist.sample(computeUnit->totalCycles.value() -
+ computeUnit->lastExecCycle[simdId]);
+ computeUnit->lastExecCycle[simdId] = computeUnit->totalCycles.value();
+ if (pc() == old_pc) {
+ uint32_t new_pc = old_pc + 1;
+ // PC not modified by instruction, proceed to next or pop frame
+ pc(new_pc);
+ if (new_pc == rpc()) {
+ popFromReconvergenceStack();
+ discardFetch();
+ } else {
+ instructionBuffer.pop_front();
+ }
+ }
+
+ if (computeUnit->shader->hsail_mode == Shader::SIMT) {
+ const int num_active_lanes = execMask().count();
+ computeUnit->controlFlowDivergenceDist.sample(num_active_lanes);
+ computeUnit->numVecOpsExecuted += num_active_lanes;
+ if (isGmInstruction(ii)) {
+ computeUnit->activeLanesPerGMemInstrDist.sample(num_active_lanes);
+ } else if (isLmInstruction(ii)) {
+ computeUnit->activeLanesPerLMemInstrDist.sample(num_active_lanes);
+ }
+ }
+
+ // ---- Update Vector ALU pipeline and other resources ------------------ //
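+ // Reads reserve their VRF-to-memory pipeline bus for 4 ticks, writes and
+ // atomics for 8 ticks, and every instruction holds its execution unit's
+ // issue slot for issuePeriod ticks; these mirror the preset() reservations
+ // made in updateResources().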
+ // Single precision ALU or Branch or Return or Special instruction
+ if (ii->opType() == Enums::OT_ALU || ii->opType() == Enums::OT_SPECIAL ||
+ ii->opType() == Enums::OT_BRANCH || IS_OT_LDAS(ii->opType()) ||
+ // FIXME: Kernel argument loads are currently treated as ALU operations
+ // since we don't send memory packets at execution. If we fix that then
+ // we should map them to one of the memory pipelines
+ ii->opType() == Enums::OT_KERN_READ ||
+ ii->opType() == Enums::OT_ARG ||
+ ii->opType() == Enums::OT_RET) {
+ computeUnit->aluPipe[simdId].set(computeUnit->shader->
+ ticks(computeUnit->spBypassLength()));
+
+ // this is to enforce a fixed number of cycles per issue slot per SIMD
+ computeUnit->wfWait[simdId].set(computeUnit->shader->
+ ticks(computeUnit->issuePeriod));
+ } else if (ii->opType() == Enums::OT_BARRIER) {
+ computeUnit->wfWait[simdId].set(computeUnit->shader->
+ ticks(computeUnit->issuePeriod));
+ } else if (ii->opType() == Enums::OT_FLAT_READ) {
+ assert(Enums::SC_NONE != ii->executedAs());
+
+ if (Enums::SC_SHARED == ii->executedAs()) {
+ computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
+ set(computeUnit->shader->ticks(4));
+ computeUnit->wfWait[computeUnit->ShrMemUnitId()].
+ set(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ } else {
+ computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
+ set(computeUnit->shader->ticks(4));
+ computeUnit->wfWait[computeUnit->GlbMemUnitId()].
+ set(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ }
+ } else if (ii->opType() == Enums::OT_FLAT_WRITE) {
+ assert(Enums::SC_NONE != ii->executedAs());
+ if (Enums::SC_SHARED == ii->executedAs()) {
+ computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
+ set(computeUnit->shader->ticks(8));
+ computeUnit->wfWait[computeUnit->ShrMemUnitId()].
+ set(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ } else {
+ computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
+ set(computeUnit->shader->ticks(8));
+ computeUnit->wfWait[computeUnit->GlbMemUnitId()].
+ set(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ }
+ } else if (IS_OT_READ_GM(ii->opType())) {
+ computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
+ set(computeUnit->shader->ticks(4));
+ computeUnit->wfWait[computeUnit->GlbMemUnitId()].
+ set(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ } else if (IS_OT_WRITE_GM(ii->opType())) {
+ computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
+ set(computeUnit->shader->ticks(8));
+ computeUnit->wfWait[computeUnit->GlbMemUnitId()].
+ set(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ } else if (IS_OT_ATOMIC_GM(ii->opType())) {
+ computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
+ set(computeUnit->shader->ticks(8));
+ computeUnit->wfWait[computeUnit->GlbMemUnitId()].
+ set(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ } else if (IS_OT_READ_LM(ii->opType())) {
+ computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
+ set(computeUnit->shader->ticks(4));
+ computeUnit->wfWait[computeUnit->ShrMemUnitId()].
+ set(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ } else if (IS_OT_WRITE_LM(ii->opType())) {
+ computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
+ set(computeUnit->shader->ticks(8));
+ computeUnit->wfWait[computeUnit->ShrMemUnitId()].
+ set(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ } else if (IS_OT_ATOMIC_LM(ii->opType())) {
+ computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
+ set(computeUnit->shader->ticks(8));
+ computeUnit->wfWait[computeUnit->ShrMemUnitId()].
+ set(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ }
+}
+
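+// A lane is still waiting if it has joined fewer barriers than the
+// wavefront's current maximum barrier count.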
+bool
+Wavefront::waitingAtBarrier(int lane)
+{
+ return bar_cnt[lane] < max_bar_cnt;
+}
+
+void
+Wavefront::pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
+ const VectorMask& mask)
+{
+ assert(mask.count());
+ reconvergenceStack.emplace(new ReconvergenceStackEntry(pc, rpc, mask));
+}
+
+void
+Wavefront::popFromReconvergenceStack()
+{
+ assert(!reconvergenceStack.empty());
+
+ DPRINTF(WavefrontStack, "[%2d, %2d, %2d, %2d] %s %3i => ",
+ computeUnit->cu_id, simdId, wfSlotId, wfDynId,
+ execMask().to_string<char, std::string::traits_type,
+ std::string::allocator_type>().c_str(), pc());
+
+ reconvergenceStack.pop();
+
+ DPRINTF(WavefrontStack, "%3i %s\n", pc(),
+ execMask().to_string<char, std::string::traits_type,
+ std::string::allocator_type>().c_str());
+
+}
+
+void
+Wavefront::discardFetch()
+{
+ instructionBuffer.clear();
+ // if a fetch request is already in flight, mark its response to be dropped
+ dropFetch |= pendingFetch;
+}
+
+uint32_t
+Wavefront::pc() const
+{
+ return reconvergenceStack.top()->pc;
+}
+
+uint32_t
+Wavefront::rpc() const
+{
+ return reconvergenceStack.top()->rpc;
+}
+
+VectorMask
+Wavefront::execMask() const
+{
+ return reconvergenceStack.top()->execMask;
+}
+
+bool
+Wavefront::execMask(int lane) const
+{
+ return reconvergenceStack.top()->execMask[lane];
+}
+
+
+void
+Wavefront::pc(uint32_t new_pc)
+{
+ reconvergenceStack.top()->pc = new_pc;
+}
diff --git a/src/gpu-compute/wavefront.hh b/src/gpu-compute/wavefront.hh
new file mode 100644
index 000000000..0abab8e83
--- /dev/null
+++ b/src/gpu-compute/wavefront.hh
@@ -0,0 +1,368 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Lisa Hsu
+ */
+
+#ifndef __WAVEFRONT_HH__
+#define __WAVEFRONT_HH__
+
+#include <cassert>
+#include <deque>
+#include <memory>
+#include <stack>
+#include <vector>
+
+#include "base/misc.hh"
+#include "base/types.hh"
+#include "gpu-compute/condition_register_state.hh"
+#include "gpu-compute/lds_state.hh"
+#include "gpu-compute/misc.hh"
+#include "params/Wavefront.hh"
+#include "sim/sim_object.hh"
+
+static const int MAX_NUM_INSTS_PER_WF = 12;
+
+/*
+ * Arguments for the hsail call opcode are user defined and variable length.
+ * The hardware/finalizer can support arguments in hardware or use memory to
+ * pass arguments. For now, let's assume that an unlimited number of arguments
+ * are supported in hardware (the compiler inlines functions whenever it can
+ * anyway, so unless someone is interested in the implications of linking/
+ * library functions, I think this is a reasonable assumption given the typical
+ * size of an OpenCL kernel).
+ *
+ * Note that call args are different from kernel arguments:
+ * * All work-items in a kernel refer to the same set of kernel arguments
+ * * Each work-item has its own set of call args. So a call argument at
+ *   address 0x4 is different for work-item 0 and work-item 1.
+ *
+ * The table below shows an example of how we organize the call arguments in
+ * the CallArgMem class.
+ *
+ * int foo(int arg1, double arg2)
+ * ___________________________________________________
+ * | 0: return.0 | 4: return.1 | ... | 252: return.63 |
+ * |---------------------------------------------------|
+ * | 256: arg1.0 | 260: arg1.1 | ... | 508: arg1.63 |
+ * |---------------------------------------------------|
+ * | 512: arg2.0 | 520: arg2.1 | ... | 1016: arg2.63 |
+ * ___________________________________________________
+ */
+class CallArgMem
+{
+ public:
+ // pointer to buffer for storing function arguments
+ uint8_t *mem;
+ // size of function args
+ int funcArgsSizePerItem;
+
+ template<typename CType>
+ int
+ getLaneOffset(int lane, int addr)
+ {
+ return addr * VSZ + sizeof(CType) * lane;
+ }
+
+ CallArgMem(int func_args_size_per_item)
+ : funcArgsSizePerItem(func_args_size_per_item)
+ {
+ mem = (uint8_t*)malloc(funcArgsSizePerItem * VSZ);
+ }
+
+ ~CallArgMem()
+ {
+ free(mem);
+ }
+
+ template<typename CType>
+ uint8_t*
+ getLaneAddr(int lane, int addr)
+ {
+ return mem + getLaneOffset<CType>(lane, addr);
+ }
+
+ template<typename CType>
+ void
+ setLaneAddr(int lane, int addr, CType val)
+ {
+ *((CType*)(mem + getLaneOffset<CType>(lane, addr))) = val;
+ }
+};
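+
+/*
+ * A minimal usage sketch of the layout above (illustrative only; it assumes
+ * VSZ == 64 and the int foo(int arg1, double arg2) signature from the table
+ * and is not part of the model):
+ *
+ *     CallArgMem args(4 + 4 + 8);       // return + arg1 + arg2, per item
+ *     args.setLaneAddr<int>(5, 4, 42);  // arg1 for lane 5 is stored at
+ *                                       //   4 * VSZ + 4 * 5 = 276
+ *     int v = *(int*)args.getLaneAddr<int>(5, 4);
+ *
+ * getLaneOffset() implements the packing shown in the table: all VSZ lanes
+ * of one argument are stored contiguously before the next argument begins.
+ */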
+
+/**
+ * A reconvergence stack entry conveys the necessary state to implement
+ * control flow divergence.
+ */
+class ReconvergenceStackEntry {
+
+ public:
+ ReconvergenceStackEntry(uint32_t new_pc, uint32_t new_rpc,
+ VectorMask new_mask) : pc(new_pc), rpc(new_rpc),
+ execMask(new_mask) {
+ }
+
+ /**
+ * PC of current instruction.
+ */
+ uint32_t pc;
+ /**
+ * PC of the immediate post-dominator instruction, i.e., the value of
+ * @a pc for the first instruction that will be executed by the wavefront
+ * when a reconvergence point is reached.
+ */
+ uint32_t rpc;
+ /**
+ * Execution mask.
+ */
+ VectorMask execMask;
+};
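+
+/*
+ * Illustrative sketch (hypothetical PCs and masks, not part of the model): a
+ * common way for a divergent branch whose immediate post-dominator sits at
+ * PC 20 to use this stack is
+ *
+ *     // reconvergence point, executed by all active lanes once both
+ *     // divergent paths have drained
+ *     wf->pushToReconvergenceStack(20, wf->rpc(), takenMask | notTakenMask);
+ *     wf->pushToReconvergenceStack(fallthrough_pc, 20, notTakenMask);
+ *     wf->pushToReconvergenceStack(taken_pc, 20, takenMask);
+ *
+ * Wavefront::exec() pops the top entry once the wavefront's PC reaches that
+ * entry's rpc, resuming execution with the state of the entry underneath.
+ */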
+
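+/**
+ * A wavefront is the compute unit's unit of scheduling: a group of up to VSZ
+ * work-items (lanes) that execute in lockstep on one SIMD unit, sharing a
+ * single program counter and an execution mask that is managed through the
+ * reconvergence stack above.
+ */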
+class Wavefront : public SimObject
+{
+ public:
+ enum itype_e {I_ALU, I_GLOBAL, I_SHARED, I_FLAT, I_PRIVATE};
+ enum status_e {S_STOPPED, S_RETURNING, S_RUNNING};
+
+ // Base pointer for array of instruction pointers
+ uint64_t base_ptr;
+
+ uint32_t old_barrier_cnt;
+ uint32_t barrier_cnt;
+ uint32_t barrier_id;
+ uint32_t barrier_slots;
+ status_e status;
+ // HW slot id where the WF is mapped to inside a SIMD unit
+ int wfSlotId;
+ int kern_id;
+ // SIMD unit where the WF has been scheduled
+ int simdId;
+ // pointer to parent CU
+ ComputeUnit *computeUnit;
+
+ std::deque<GPUDynInstPtr> instructionBuffer;
+
+ bool pendingFetch;
+ bool dropFetch;
+
+ // Condition Register State (for HSAIL simulations only)
+ class ConditionRegisterState *condRegState;
+ // number of single precision VGPRs required by WF
+ uint32_t maxSpVgprs;
+ // number of double precision VGPRs required by WF
+ uint32_t maxDpVgprs;
+ // map virtual to physical vector register
+ uint32_t remap(uint32_t vgprIndex, uint32_t size, uint8_t mode=0);
+ void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
+ bool isGmInstruction(GPUDynInstPtr ii);
+ bool isLmInstruction(GPUDynInstPtr ii);
+ bool isOldestInstGMem();
+ bool isOldestInstLMem();
+ bool isOldestInstPrivMem();
+ bool isOldestInstFlatMem();
+ bool isOldestInstALU();
+ bool isOldestInstBarrier();
+ // used for passing the spill address to the GPUDynInst
+ uint64_t last_addr[VSZ];
+ uint32_t workitemid[3][VSZ];
+ uint32_t workitemFlatId[VSZ];
+ uint32_t workgroupid[3];
+ uint32_t workgroupsz[3];
+ uint32_t gridsz[3];
+ uint32_t wg_id;
+ uint32_t wg_sz;
+ uint32_t dynwaveid;
+ uint32_t maxdynwaveid;
+ uint32_t dispatchid;
+ // outstanding global+local memory requests
+ uint32_t outstanding_reqs;
+ // memory requests between the scoreboard and execute stages
+ // that have not yet been executed
+ uint32_t mem_reqs_in_pipe;
+ // outstanding global memory write requests
+ uint32_t outstanding_reqs_wr_gm;
+ // outstanding local memory write requests
+ uint32_t outstanding_reqs_wr_lm;
+ // outstanding global memory read requests
+ uint32_t outstanding_reqs_rd_gm;
+ // outstanding local memory read requests
+ uint32_t outstanding_reqs_rd_lm;
+ uint32_t rd_lm_reqs_in_pipe;
+ uint32_t rd_gm_reqs_in_pipe;
+ uint32_t wr_lm_reqs_in_pipe;
+ uint32_t wr_gm_reqs_in_pipe;
+
+ int mem_trace_busy;
+ uint64_t last_trace;
+ // number of vector registers reserved by WF
+ int reservedVectorRegs;
+ // Index into the Vector Register File's namespace where the WF's registers
+ // will live while the WF is executed
+ uint32_t startVgprIndex;
+
+ // Old value of destination gpr (for trace)
+ uint32_t old_vgpr[VSZ];
+ // Id of destination gpr (for trace)
+ uint32_t old_vgpr_id;
+ // Tick count of last old_vgpr copy
+ uint64_t old_vgpr_tcnt;
+
+ // Old value of destination gpr (for trace)
+ uint64_t old_dgpr[VSZ];
+ // Id of destination gpr (for trace)
+ uint32_t old_dgpr_id;
+ // Tick count of last old_dgpr copy
+ uint64_t old_dgpr_tcnt;
+
+ // Execution mask at wavefront start
+ VectorMask init_mask;
+
+ // number of barriers this WF has joined
+ int bar_cnt[VSZ];
+ int max_bar_cnt;
+ // Flag to stall a wave on barrier
+ bool stalledAtBarrier;
+
+ // a pointer to the fraction of the LDS allocated
+ // to this workgroup (thus this wavefront)
+ LdsChunk *ldsChunk;
+
+ // A pointer to the spill area
+ Addr spillBase;
+ // The size of the spill area
+ uint32_t spillSizePerItem;
+ // The vector width of the spill area
+ uint32_t spillWidth;
+
+ // A pointer to the private memory area
+ Addr privBase;
+ // The size of the private memory area
+ uint32_t privSizePerItem;
+
+ // A pointer to the read-only memory area
+ Addr roBase;
+ // size of the read-only memory area
+ uint32_t roSize;
+
+ // pointer to buffer for storing kernel arguments
+ uint8_t *kernelArgs;
+ // unique WF id over all WFs executed across all CUs
+ uint64_t wfDynId;
+
+ // number of times instruction issue for this wavefront is blocked
+ // due to VRF port availability
+ Stats::Scalar numTimesBlockedDueVrfPortAvail;
+ // number of times an instruction of a WF is blocked from being issued
+ // due to WAR and WAW dependencies
+ Stats::Scalar numTimesBlockedDueWAXDependencies;
+ // number of times an instruction of a WF is blocked from being issued
+ // due to RAW dependencies
+ Stats::Scalar numTimesBlockedDueRAWDependencies;
+ // distribution of executed instructions based on their register
+ // operands; this is used to highlight the load on the VRF
+ Stats::Distribution srcRegOpDist;
+ Stats::Distribution dstRegOpDist;
+
+ // Functions to operate on call argument memory
+ // argument memory for hsail call instruction
+ CallArgMem *callArgMem;
+ void
+ initCallArgMem(int func_args_size_per_item)
+ {
+ callArgMem = new CallArgMem(func_args_size_per_item);
+ }
+
+ template<typename CType>
+ CType
+ readCallArgMem(int lane, int addr)
+ {
+ return *((CType*)(callArgMem->getLaneAddr<CType>(lane, addr)));
+ }
+
+ template<typename CType>
+ void
+ writeCallArgMem(int lane, int addr, CType val)
+ {
+ callArgMem->setLaneAddr<CType>(lane, addr, val);
+ }
+
+ typedef WavefrontParams Params;
+ Wavefront(const Params *p);
+ ~Wavefront();
+ virtual void init();
+
+ void
+ setParent(ComputeUnit *cu)
+ {
+ computeUnit = cu;
+ }
+
+ void start(uint64_t _wfDynId, uint64_t _base_ptr);
+
+ void exec();
+ void updateResources();
+ int ready(itype_e type);
+ bool instructionBufferHasBranch();
+ void regStats();
+ VectorMask get_pred() { return execMask() & init_mask; }
+
+ bool waitingAtBarrier(int lane);
+
+ void pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
+ const VectorMask& exec_mask);
+
+ void popFromReconvergenceStack();
+
+ uint32_t pc() const;
+
+ uint32_t rpc() const;
+
+ VectorMask execMask() const;
+
+ bool execMask(int lane) const;
+
+ void pc(uint32_t new_pc);
+
+ void discardFetch();
+
+ private:
+ /**
+ * Stack containing Control Flow Graph nodes (i.e., kernel instructions)
+ * to be visited by the wavefront, and the associated execution masks. The
+ * reconvergence stack grows every time the wavefront reaches a divergence
+ * point (branch instruction), and shrinks every time the wavefront
+ * reaches a reconvergence point (immediate post-dominator instruction).
+ */
+ std::stack<std::unique_ptr<ReconvergenceStackEntry>> reconvergenceStack;
+};
+
+#endif // __WAVEFRONT_HH__