From 1a7d3f9fcb76a68540dd948f91413533a383bfde Mon Sep 17 00:00:00 2001 From: Tony Gutierrez Date: Tue, 19 Jan 2016 14:28:22 -0500 Subject: gpu-compute: AMD's baseline GPU model --- src/gpu-compute/GPU.py | 310 +++++ src/gpu-compute/LdsState.py | 51 + src/gpu-compute/SConscript | 99 ++ src/gpu-compute/X86GPUTLB.py | 77 ++ src/gpu-compute/brig_object.cc | 474 +++++++ src/gpu-compute/brig_object.hh | 134 ++ src/gpu-compute/cl_driver.cc | 272 ++++ src/gpu-compute/cl_driver.hh | 77 ++ src/gpu-compute/cl_event.hh | 51 + src/gpu-compute/code_enums.hh | 116 ++ src/gpu-compute/compute_unit.cc | 1817 +++++++++++++++++++++++++++ src/gpu-compute/compute_unit.hh | 767 +++++++++++ src/gpu-compute/condition_register_state.cc | 83 ++ src/gpu-compute/condition_register_state.hh | 101 ++ src/gpu-compute/dispatcher.cc | 394 ++++++ src/gpu-compute/dispatcher.hh | 163 +++ src/gpu-compute/exec_stage.cc | 203 +++ src/gpu-compute/exec_stage.hh | 129 ++ src/gpu-compute/fetch_stage.cc | 106 ++ src/gpu-compute/fetch_stage.hh | 78 ++ src/gpu-compute/fetch_unit.cc | 293 +++++ src/gpu-compute/fetch_unit.hh | 89 ++ src/gpu-compute/global_memory_pipeline.cc | 242 ++++ src/gpu-compute/global_memory_pipeline.hh | 123 ++ src/gpu-compute/gpu_dyn_inst.cc | 198 +++ src/gpu-compute/gpu_dyn_inst.hh | 464 +++++++ src/gpu-compute/gpu_exec_context.cc | 53 + src/gpu-compute/gpu_exec_context.hh | 54 + src/gpu-compute/gpu_static_inst.cc | 42 + src/gpu-compute/gpu_static_inst.hh | 166 +++ src/gpu-compute/gpu_tlb.cc | 1801 ++++++++++++++++++++++++++ src/gpu-compute/gpu_tlb.hh | 465 +++++++ src/gpu-compute/hsa_code.hh | 101 ++ src/gpu-compute/hsa_kernel_info.hh | 79 ++ src/gpu-compute/hsa_object.cc | 76 ++ src/gpu-compute/hsa_object.hh | 74 ++ src/gpu-compute/hsail_code.cc | 453 +++++++ src/gpu-compute/hsail_code.hh | 447 +++++++ src/gpu-compute/kernel_cfg.cc | 296 +++++ src/gpu-compute/kernel_cfg.hh | 133 ++ src/gpu-compute/lds_state.cc | 341 +++++ src/gpu-compute/lds_state.hh | 512 ++++++++ 
src/gpu-compute/local_memory_pipeline.cc | 200 +++ src/gpu-compute/local_memory_pipeline.hh | 98 ++ src/gpu-compute/misc.hh | 162 +++ src/gpu-compute/ndrange.hh | 70 ++ src/gpu-compute/of_scheduling_policy.cc | 76 ++ src/gpu-compute/of_scheduling_policy.hh | 61 + src/gpu-compute/pool_manager.cc | 42 + src/gpu-compute/pool_manager.hh | 66 + src/gpu-compute/qstruct.hh | 201 +++ src/gpu-compute/rr_scheduling_policy.cc | 67 + src/gpu-compute/rr_scheduling_policy.hh | 65 + src/gpu-compute/schedule_stage.cc | 151 +++ src/gpu-compute/schedule_stage.hh | 95 ++ src/gpu-compute/scheduler.cc | 71 ++ src/gpu-compute/scheduler.hh | 63 + src/gpu-compute/scheduling_policy.hh | 57 + src/gpu-compute/scoreboard_check_stage.cc | 173 +++ src/gpu-compute/scoreboard_check_stage.hh | 106 ++ src/gpu-compute/shader.cc | 412 ++++++ src/gpu-compute/shader.hh | 212 ++++ src/gpu-compute/simple_pool_manager.cc | 108 ++ src/gpu-compute/simple_pool_manager.hh | 72 ++ src/gpu-compute/tlb_coalescer.cc | 583 +++++++++ src/gpu-compute/tlb_coalescer.hh | 252 ++++ src/gpu-compute/vector_register_file.cc | 251 ++++ src/gpu-compute/vector_register_file.hh | 142 +++ src/gpu-compute/vector_register_state.cc | 58 + src/gpu-compute/vector_register_state.hh | 101 ++ src/gpu-compute/wavefront.cc | 925 ++++++++++++++ src/gpu-compute/wavefront.hh | 368 ++++++ 72 files changed, 17312 insertions(+) create mode 100644 src/gpu-compute/GPU.py create mode 100644 src/gpu-compute/LdsState.py create mode 100644 src/gpu-compute/SConscript create mode 100644 src/gpu-compute/X86GPUTLB.py create mode 100644 src/gpu-compute/brig_object.cc create mode 100644 src/gpu-compute/brig_object.hh create mode 100644 src/gpu-compute/cl_driver.cc create mode 100644 src/gpu-compute/cl_driver.hh create mode 100644 src/gpu-compute/cl_event.hh create mode 100644 src/gpu-compute/code_enums.hh create mode 100644 src/gpu-compute/compute_unit.cc create mode 100644 src/gpu-compute/compute_unit.hh create mode 100644 
src/gpu-compute/condition_register_state.cc create mode 100644 src/gpu-compute/condition_register_state.hh create mode 100644 src/gpu-compute/dispatcher.cc create mode 100644 src/gpu-compute/dispatcher.hh create mode 100644 src/gpu-compute/exec_stage.cc create mode 100644 src/gpu-compute/exec_stage.hh create mode 100644 src/gpu-compute/fetch_stage.cc create mode 100644 src/gpu-compute/fetch_stage.hh create mode 100644 src/gpu-compute/fetch_unit.cc create mode 100644 src/gpu-compute/fetch_unit.hh create mode 100644 src/gpu-compute/global_memory_pipeline.cc create mode 100644 src/gpu-compute/global_memory_pipeline.hh create mode 100644 src/gpu-compute/gpu_dyn_inst.cc create mode 100644 src/gpu-compute/gpu_dyn_inst.hh create mode 100644 src/gpu-compute/gpu_exec_context.cc create mode 100644 src/gpu-compute/gpu_exec_context.hh create mode 100644 src/gpu-compute/gpu_static_inst.cc create mode 100644 src/gpu-compute/gpu_static_inst.hh create mode 100644 src/gpu-compute/gpu_tlb.cc create mode 100644 src/gpu-compute/gpu_tlb.hh create mode 100644 src/gpu-compute/hsa_code.hh create mode 100644 src/gpu-compute/hsa_kernel_info.hh create mode 100644 src/gpu-compute/hsa_object.cc create mode 100644 src/gpu-compute/hsa_object.hh create mode 100644 src/gpu-compute/hsail_code.cc create mode 100644 src/gpu-compute/hsail_code.hh create mode 100644 src/gpu-compute/kernel_cfg.cc create mode 100644 src/gpu-compute/kernel_cfg.hh create mode 100644 src/gpu-compute/lds_state.cc create mode 100644 src/gpu-compute/lds_state.hh create mode 100644 src/gpu-compute/local_memory_pipeline.cc create mode 100644 src/gpu-compute/local_memory_pipeline.hh create mode 100644 src/gpu-compute/misc.hh create mode 100644 src/gpu-compute/ndrange.hh create mode 100644 src/gpu-compute/of_scheduling_policy.cc create mode 100644 src/gpu-compute/of_scheduling_policy.hh create mode 100644 src/gpu-compute/pool_manager.cc create mode 100644 src/gpu-compute/pool_manager.hh create mode 100644 
src/gpu-compute/qstruct.hh create mode 100644 src/gpu-compute/rr_scheduling_policy.cc create mode 100644 src/gpu-compute/rr_scheduling_policy.hh create mode 100644 src/gpu-compute/schedule_stage.cc create mode 100644 src/gpu-compute/schedule_stage.hh create mode 100644 src/gpu-compute/scheduler.cc create mode 100644 src/gpu-compute/scheduler.hh create mode 100644 src/gpu-compute/scheduling_policy.hh create mode 100644 src/gpu-compute/scoreboard_check_stage.cc create mode 100644 src/gpu-compute/scoreboard_check_stage.hh create mode 100644 src/gpu-compute/shader.cc create mode 100644 src/gpu-compute/shader.hh create mode 100644 src/gpu-compute/simple_pool_manager.cc create mode 100644 src/gpu-compute/simple_pool_manager.hh create mode 100644 src/gpu-compute/tlb_coalescer.cc create mode 100644 src/gpu-compute/tlb_coalescer.hh create mode 100644 src/gpu-compute/vector_register_file.cc create mode 100644 src/gpu-compute/vector_register_file.hh create mode 100644 src/gpu-compute/vector_register_state.cc create mode 100644 src/gpu-compute/vector_register_state.hh create mode 100644 src/gpu-compute/wavefront.cc create mode 100644 src/gpu-compute/wavefront.hh (limited to 'src/gpu-compute') diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py new file mode 100644 index 000000000..bd95f6335 --- /dev/null +++ b/src/gpu-compute/GPU.py @@ -0,0 +1,310 @@ +# +# Copyright (c) 2015 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
+# +# Author: Steve Reinhardt +# + +from ClockedObject import ClockedObject +from Device import DmaDevice +from m5.defines import buildEnv +from m5.params import * +from m5.proxy import * +from m5.SimObject import SimObject +from MemObject import MemObject +from Process import EmulatedDriver +from Bridge import Bridge +from LdsState import LdsState + +class PrefetchType(Enum): vals = [ + 'PF_CU', + 'PF_PHASE', + 'PF_WF', + 'PF_STRIDE', + 'PF_END', + ] + +class VectorRegisterFile(SimObject): + type = 'VectorRegisterFile' + cxx_class = 'VectorRegisterFile' + cxx_header = 'gpu-compute/vector_register_file.hh' + + simd_id = Param.Int(0, 'SIMD ID associated with this VRF') + num_regs_per_simd = Param.Int(2048, 'number of vector registers per SIMD') + min_alloc = Param.Int(4, 'min number of VGPRs allocated per WF') + +class Wavefront(SimObject): + type = 'Wavefront' + cxx_class = 'Wavefront' + cxx_header = 'gpu-compute/wavefront.hh' + + simdId = Param.Int('SIMD id (0-ComputeUnit.num_SIMDs)') + wf_slot_id = Param.Int('wavefront id (0-ComputeUnit.max_wfs)') + +class ComputeUnit(MemObject): + type = 'ComputeUnit' + cxx_class = 'ComputeUnit' + cxx_header = 'gpu-compute/compute_unit.hh' + + wavefronts = VectorParam.Wavefront('Number of wavefronts') + wfSize = Param.Int(64, 'Wavefront size (in work items)') + num_SIMDs = Param.Int(4, 'number of SIMD units per CU') + + spbypass_pipe_length = Param.Int(4, 'vector ALU Single Precision bypass '\ + 'latency') + + dpbypass_pipe_length = Param.Int(8, 'vector ALU Double Precision bypass '\ + 'latency') + + issue_period = Param.Int(4, 'number of cycles per issue period') + num_global_mem_pipes = Param.Int(1,'number of global memory pipes per CU') + num_shared_mem_pipes = Param.Int(1,'number of shared memory pipes per CU') + n_wf = Param.Int(1, 'Number of wavefront slots per SIMD') + mem_req_latency = Param.Int(9, "Latency for request from the cu to ruby. 
"\ + "Represents the pipeline to reach the TCP and "\ + "specified in GPU clock cycles") + mem_resp_latency = Param.Int(9, "Latency for responses from ruby to the "\ + "cu. Represents the pipeline between the TCP "\ + "and cu as well as TCP data array access. "\ + "Specified in GPU clock cycles") + system = Param.System(Parent.any, "system object") + cu_id = Param.Int('CU id') + vrf_to_coalescer_bus_width = Param.Int(32, "VRF->Coalescer data bus width "\ + "in bytes") + coalescer_to_vrf_bus_width = Param.Int(32, "Coalescer->VRF data bus width "\ + "in bytes") + + memory_port = VectorMasterPort("Port to the memory system") + translation_port = VectorMasterPort('Port to the TLB hierarchy') + sqc_port = MasterPort("Port to the SQC (I-cache") + sqc_tlb_port = MasterPort("Port to the TLB for the SQC (I-cache)") + perLaneTLB = Param.Bool(False, "enable per-lane TLB") + prefetch_depth = Param.Int(0, "Number of prefetches triggered at a time"\ + "(0 turns off prefetching)") + prefetch_stride = Param.Int(1, "Fixed Prefetch Stride (1 means next-page)") + prefetch_prev_type = Param.PrefetchType('PF_PHASE', "Prefetch the stride "\ + "from last mem req in lane of "\ + "CU|Phase|Wavefront") + execPolicy = Param.String("OLDEST-FIRST", "WF execution selection policy"); + xactCasMode = Param.Bool(False, "Behavior of xact_cas_load magic instr."); + debugSegFault = Param.Bool(False, "enable debugging GPU seg faults") + functionalTLB = Param.Bool(False, "Assume TLB causes no delay") + + localMemBarrier = Param.Bool(False, "Assume Barriers do not wait on "\ + "kernel end") + + countPages = Param.Bool(False, "Generate per-CU file of all pages touched "\ + "and how many times") + global_mem_queue_size = Param.Int(256, "Number of entries in the global " + "memory pipeline's queues") + local_mem_queue_size = Param.Int(256, "Number of entries in the local " + "memory pipeline's queues") + ldsBus = Bridge() # the bridge between the CU and its LDS + ldsPort = MasterPort("The port that goes to 
the LDS") + localDataStore = Param.LdsState("the LDS for this CU") + + vector_register_file = VectorParam.VectorRegisterFile("Vector register "\ + "file") + +class Shader(ClockedObject): + type = 'Shader' + cxx_class = 'Shader' + cxx_header = 'gpu-compute/shader.hh' + + CUs = VectorParam.ComputeUnit('Number of compute units') + n_wf = Param.Int(1, 'Number of wavefront slots per SIMD') + impl_kern_boundary_sync = Param.Bool(True, """Insert acq/rel packets into + ruby at kernel boundaries""") + separate_acquire_release = Param.Bool(False, + """Do ld_acquire/st_release generate separate requests for the + acquire and release?""") + globalmem = Param.MemorySize('64kB', 'Memory size') + timing = Param.Bool(False, 'timing memory accesses') + + cpu_pointer = Param.BaseCPU(NULL, "pointer to base CPU") + translation = Param.Bool(False, "address translation"); + +class ClDriver(EmulatedDriver): + type = 'ClDriver' + cxx_header = 'gpu-compute/cl_driver.hh' + codefile = VectorParam.String('code file name(s)') + +class GpuDispatcher(DmaDevice): + type = 'GpuDispatcher' + cxx_header = 'gpu-compute/dispatcher.hh' + # put at 8GB line for now + pio_addr = Param.Addr(0x200000000, "Device Address") + pio_latency = Param.Latency('1ns', "Programmed IO latency") + shader_pointer = Param.Shader('pointer to shader') + translation_port = MasterPort('Port to the dispatcher TLB') + cpu = Param.BaseCPU("CPU to wake up on kernel completion") + + cl_driver = Param.ClDriver('pointer to driver') + +class OpType(Enum): vals = [ + 'OT_NULL', + 'OT_ALU', + 'OT_SPECIAL', + 'OT_GLOBAL_READ', + 'OT_GLOBAL_WRITE', + 'OT_GLOBAL_ATOMIC', + 'OT_GLOBAL_HIST', + 'OT_GLOBAL_LDAS', + 'OT_SHARED_READ', + 'OT_SHARED_WRITE', + 'OT_SHARED_ATOMIC', + 'OT_SHARED_HIST', + 'OT_SHARED_LDAS', + 'OT_PRIVATE_READ', + 'OT_PRIVATE_WRITE', + 'OT_PRIVATE_ATOMIC', + 'OT_PRIVATE_HIST', + 'OT_PRIVATE_LDAS', + 'OT_SPILL_READ', + 'OT_SPILL_WRITE', + 'OT_SPILL_ATOMIC', + 'OT_SPILL_HIST', + 'OT_SPILL_LDAS', + 'OT_READONLY_READ', + 
'OT_READONLY_WRITE', + 'OT_READONLY_ATOMIC', + 'OT_READONLY_HIST', + 'OT_READONLY_LDAS', + 'OT_FLAT_READ', + 'OT_FLAT_WRITE', + 'OT_FLAT_ATOMIC', + 'OT_FLAT_HIST', + 'OT_FLAT_LDAS', + 'OT_KERN_READ', + 'OT_BRANCH', + + # note: Only the OT_BOTH_MEMFENCE seems to be supported in the 1.0F version + # of the compiler. + 'OT_SHARED_MEMFENCE', + 'OT_GLOBAL_MEMFENCE', + 'OT_BOTH_MEMFENCE', + + 'OT_BARRIER', + 'OT_PRINT', + 'OT_RET', + 'OT_NOP', + 'OT_ARG' + ] + +class MemType(Enum): vals = [ + 'M_U8', + 'M_U16', + 'M_U32', + 'M_U64', + 'M_S8', + 'M_S16', + 'M_S32', + 'M_S64', + 'M_F16', + 'M_F32', + 'M_F64', + ] + +class MemOpType(Enum): vals = [ + 'MO_LD', + 'MO_ST', + 'MO_LDAS', + 'MO_LDA', + 'MO_AAND', + 'MO_AOR', + 'MO_AXOR', + 'MO_ACAS', + 'MO_AEXCH', + 'MO_AADD', + 'MO_ASUB', + 'MO_AINC', + 'MO_ADEC', + 'MO_AMAX', + 'MO_AMIN', + 'MO_ANRAND', + 'MO_ANROR', + 'MO_ANRXOR', + 'MO_ANRCAS', + 'MO_ANREXCH', + 'MO_ANRADD', + 'MO_ANRSUB', + 'MO_ANRINC', + 'MO_ANRDEC', + 'MO_ANRMAX', + 'MO_ANRMIN', + 'MO_HAND', + 'MO_HOR', + 'MO_HXOR', + 'MO_HCAS', + 'MO_HEXCH', + 'MO_HADD', + 'MO_HSUB', + 'MO_HINC', + 'MO_HDEC', + 'MO_HMAX', + 'MO_HMIN', + 'MO_UNDEF' + ] + +class StorageClassType(Enum): vals = [ + 'SC_SPILL', + 'SC_GLOBAL', + 'SC_SHARED', + 'SC_PRIVATE', + 'SC_READONLY', + 'SC_KERNARG', + 'SC_NONE', + ] + +class RegisterType(Enum): vals = [ + 'RT_VECTOR', + 'RT_SCALAR', + 'RT_CONDITION', + 'RT_HARDWARE', + 'RT_NONE', + ] + +class GenericMemoryOrder(Enum): vals = [ + 'MEMORY_ORDER_NONE', + 'MEMORY_ORDER_RELAXED', + 'MEMORY_ORDER_SC_ACQUIRE', + 'MEMORY_ORDER_SC_RELEASE', + 'MEMORY_ORDER_SC_ACQUIRE_RELEASE', + ] + +class GenericMemoryScope(Enum): vals = [ + 'MEMORY_SCOPE_NONE', + 'MEMORY_SCOPE_WORKITEM', + 'MEMORY_SCOPE_WAVEFRONT', + 'MEMORY_SCOPE_WORKGROUP', + 'MEMORY_SCOPE_DEVICE', + 'MEMORY_SCOPE_SYSTEM', + ] diff --git a/src/gpu-compute/LdsState.py b/src/gpu-compute/LdsState.py new file mode 100644 index 000000000..6ea9f6427 --- /dev/null +++ b/src/gpu-compute/LdsState.py 
@@ -0,0 +1,51 @@ +# +# Copyright (c) 2015 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
+# +# Author: Joe Gross +# + +from m5.defines import buildEnv +from m5.params import * +from m5.proxy import * + +from MemObject import MemObject + +class LdsState(MemObject): + type = 'LdsState' + cxx_class = 'LdsState' + cxx_header = 'gpu-compute/lds_state.hh' + size = Param.Int(65536, 'the size of the LDS') + range = Param.AddrRange('64kB', "address space of the LDS") + bankConflictPenalty = Param.Int(1, 'penalty per LDS bank conflict when '\ + 'accessing data') + banks = Param.Int(32, 'Number of LDS banks') + cuPort = SlavePort("port that goes to the compute unit") diff --git a/src/gpu-compute/SConscript b/src/gpu-compute/SConscript new file mode 100644 index 000000000..2de96df24 --- /dev/null +++ b/src/gpu-compute/SConscript @@ -0,0 +1,99 @@ +# -*- mode:python -*- + +# +# Copyright (c) 2015 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +# Author: Anthony Gutierrez +# + +Import('*') + +if not env['BUILD_GPU']: + Return() + +SimObject('GPU.py') +SimObject('LdsState.py') +SimObject('X86GPUTLB.py') + +if env['TARGET_GPU_ISA'] == 'hsail': + Source('brig_object.cc') + Source('hsail_code.cc') + +Source('cl_driver.cc') +Source('compute_unit.cc') +Source('condition_register_state.cc') +Source('dispatcher.cc') +Source('exec_stage.cc') +Source('fetch_stage.cc') +Source('fetch_unit.cc') +Source('global_memory_pipeline.cc') +Source('gpu_dyn_inst.cc') +Source('gpu_exec_context.cc') +Source('gpu_static_inst.cc') +Source('gpu_tlb.cc') +Source('hsa_object.cc') +Source('kernel_cfg.cc') +Source('lds_state.cc') +Source('local_memory_pipeline.cc') +Source('of_scheduling_policy.cc') +Source('pool_manager.cc') +Source('rr_scheduling_policy.cc') +Source('schedule_stage.cc') +Source('scheduler.cc') +Source('scoreboard_check_stage.cc') +Source('shader.cc') +Source('simple_pool_manager.cc') +Source('tlb_coalescer.cc') +Source('vector_register_file.cc') +Source('vector_register_state.cc') +Source('wavefront.cc') + +DebugFlag('BRIG') +DebugFlag('GPUCoalescer') +DebugFlag('GPUDisp') +DebugFlag('GPUExec') +DebugFlag('GPUFetch') +DebugFlag('GPUHsailCFInfo') +DebugFlag('GPUMem') +DebugFlag('GPUPort') +DebugFlag('GPUPrefetch') +DebugFlag('GPUReg') +DebugFlag('GPUSync') +DebugFlag('GPUTLB') +DebugFlag('HSALoader') +DebugFlag('HSAIL') +DebugFlag('HSAILObject') +DebugFlag('Predictor') 
+DebugFlag('WavefrontStack') + +CompoundFlag('GPUALL', ['GPUCoalescer', 'GPUDisp', 'GPUExec', 'GPUFetch', + 'GPUMem', 'GPUPort', 'GPUSync', 'GPUTLB', 'HSAIL']) diff --git a/src/gpu-compute/X86GPUTLB.py b/src/gpu-compute/X86GPUTLB.py new file mode 100644 index 000000000..51f8e514e --- /dev/null +++ b/src/gpu-compute/X86GPUTLB.py @@ -0,0 +1,77 @@ +# +# Copyright (c) 2011-2015 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
+# +# Author: Lisa Hsu +# + +from m5.defines import buildEnv +from m5.params import * +from m5.proxy import * + +from m5.objects.MemObject import MemObject + +if buildEnv['FULL_SYSTEM']: + class X86PagetableWalker(MemObject): + type = 'X86PagetableWalker' + cxx_class = 'X86ISA::Walker' + port = SlavePort("Port for the hardware table walker") + system = Param.System(Parent.any, "system object") + +class X86GPUTLB(MemObject): + type = 'X86GPUTLB' + cxx_class = 'X86ISA::GpuTLB' + cxx_header = 'gpu-compute/gpu_tlb.hh' + size = Param.Int(64, "TLB size (number of entries)") + assoc = Param.Int(64, "TLB associativity") + + if buildEnv['FULL_SYSTEM']: + walker = Param.X86PagetableWalker(X86PagetableWalker(), + "page table walker") + + hitLatency = Param.Int(2, "Latency of a TLB hit") + missLatency1 = Param.Int(5, "Latency #1 of a TLB miss") + missLatency2 = Param.Int(100, "Latency #2 of a TLB miss") + maxOutstandingReqs = Param.Int(64, "# of maximum outstanding requests") + slave = VectorSlavePort("Port on side closer to CPU/CU") + master = VectorMasterPort("Port on side closer to memory") + allocationPolicy = Param.Bool(True, "Allocate on an access") + accessDistance = Param.Bool(False, "print accessDistance stats") + +class TLBCoalescer(MemObject): + type = 'TLBCoalescer' + cxx_class = 'TLBCoalescer' + cxx_header = 'gpu-compute/tlb_coalescer.hh' + probesPerCycle = Param.Int(2, "Number of TLB probes per cycle") + coalescingWindow = Param.Int(1, "Permit coalescing across that many ticks") + slave = VectorSlavePort("Port on side closer to CPU/CU") + master = VectorMasterPort("Port on side closer to memory") + disableCoalescing = Param.Bool(False,"Dispable Coalescing") diff --git a/src/gpu-compute/brig_object.cc b/src/gpu-compute/brig_object.cc new file mode 100644 index 000000000..7cc9b7cc4 --- /dev/null +++ b/src/gpu-compute/brig_object.cc @@ -0,0 +1,474 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Steve Reinhardt, Anthony Gutierrez + */ + +#include "gpu-compute/brig_object.hh" + +#include +#include +#include +#include + +#include +#include +#include + +#include "arch/hsail/Brig.h" +#include "base/misc.hh" +#include "base/trace.hh" +#include "debug/BRIG.hh" +#include "debug/HSAILObject.hh" +#include "debug/HSALoader.hh" + +using namespace Brig; + +std::vector> + HsaObject::tryFileFuncs = { BrigObject::tryFile }; + +extern int getBrigDataTypeBytes(BrigType16_t t); + +const char *BrigObject::sectionNames[] = +{ + "hsa_data", + "hsa_code", + "hsa_operand", + ".shstrtab" +}; + +const char *segmentNames[] = +{ + "none", + "flat", + "global", + "readonly", + "kernarg", + "group", + "private", + "spill", + "args" +}; + +const uint8_t* +BrigObject::getSectionOffset(enum SectionIndex sec, int offs) const +{ + // allow offs == size for dummy end pointers + assert(offs <= sectionInfo[sec].size); + + return sectionInfo[sec].ptr + offs; +} + +const char* +BrigObject::getString(int offs) const +{ + return (const char*)(getSectionOffset(DataSectionIndex, offs) + 4); +} + +const BrigBase* +BrigObject::getCodeSectionEntry(int offs) const +{ + return (const BrigBase*)getSectionOffset(CodeSectionIndex, offs); +} + +const BrigData* +BrigObject::getBrigBaseData(int offs) const +{ + return (Brig::BrigData*)(getSectionOffset(DataSectionIndex, offs)); +} + +const uint8_t* +BrigObject::getData(int offs) const +{ + return getSectionOffset(DataSectionIndex, offs); +} + +const BrigOperand* +BrigObject::getOperand(int offs) const +{ + return (const BrigOperand*)getSectionOffset(OperandsSectionIndex, offs); +} + +unsigned +BrigObject::getOperandPtr(int offs, int index) const +{ + unsigned *op_offs = (unsigned*)(getData(offs + 4 * (index + 1))); + + return *op_offs; +} + +const BrigInstBase* +BrigObject::getInst(int offs) const +{ + return (const BrigInstBase*)getSectionOffset(CodeSectionIndex, offs); +} + +HsaCode* +BrigObject::getKernel(const std::string &name) const +{ + 
return nullptr; +} + +HsaCode* +BrigObject::getFunction(const std::string &name) const +{ + for (int i = 0; i < functions.size(); ++i) { + if (functions[i]->name() == name) { + return functions[i]; + } + } + + return nullptr; +} + +void +BrigObject::processDirectives(const BrigBase *dirPtr, const BrigBase *endPtr, + StorageMap *storageMap) +{ + while (dirPtr < endPtr) { + if (!dirPtr->byteCount) { + fatal("Bad directive size 0\n"); + } + + // calculate next pointer now so we can override it if needed + const BrigBase *nextDirPtr = brigNext(dirPtr); + + DPRINTF(HSAILObject, "Code section entry kind: #%x, byte count: %d\n", + dirPtr->kind, dirPtr->byteCount); + + switch (dirPtr->kind) { + case BRIG_KIND_DIRECTIVE_FUNCTION: + { + const BrigDirectiveExecutable *p M5_VAR_USED = + reinterpret_cast(dirPtr); + + DPRINTF(HSAILObject,"DIRECTIVE_FUNCTION: %s offset: " + "%d next: %d\n", getString(p->name), + p->firstCodeBlockEntry, p->nextModuleEntry); + + if (p->firstCodeBlockEntry != p->nextModuleEntry) { + panic("Function calls are not fully supported yet!!: %s\n", + getString(p->name)); + + const char *name = getString(p->name); + + HsailCode *code_obj = nullptr; + + for (int i = 0; i < functions.size(); ++i) { + if (functions[i]->name() == name) { + code_obj = functions[i]; + break; + } + } + + if (!code_obj) { + // create new local storage map for kernel-local symbols + code_obj = new HsailCode(name, p, this, + new StorageMap(storageMap)); + functions.push_back(code_obj); + } else { + panic("Multiple definition of Function!!: %s\n", + getString(p->name)); + } + + } + nextDirPtr = getCodeSectionEntry(p->nextModuleEntry); + } + break; + + case BRIG_KIND_DIRECTIVE_KERNEL: + { + const BrigDirectiveExecutable *p = + reinterpret_cast(dirPtr); + + DPRINTF(HSAILObject,"DIRECTIVE_KERNEL: %s offset: %d count: " + "next: %d\n", getString(p->name), + p->firstCodeBlockEntry, p->nextModuleEntry); + + const char *name = getString(p->name); + + if (name[0] == '&') + name++; + + 
std::string str = name; + char *temp; + int len = str.length(); + + if (str[len - 1] >= 'a' && str[len - 1] <= 'z') { + temp = new char[str.size() + 1]; + std::copy(str.begin(), str.end() , temp); + temp[str.size()] = '\0'; + } else { + temp = new char[str.size()]; + std::copy(str.begin(), str.end() - 1 , temp); + temp[str.size() - 1 ] = '\0'; + } + + std::string kernel_name = temp; + delete[] temp; + + HsailCode *code_obj = nullptr; + + for (const auto &kernel : kernels) { + if (kernel->name() == kernel_name) { + code_obj = kernel; + break; + } + } + + if (!code_obj) { + // create new local storage map for kernel-local symbols + code_obj = new HsailCode(kernel_name, p, this, + new StorageMap(storageMap)); + + kernels.push_back(code_obj); + } + + nextDirPtr = getCodeSectionEntry(p->nextModuleEntry); + } + break; + + case BRIG_KIND_DIRECTIVE_VARIABLE: + { + const BrigDirectiveVariable *p = + reinterpret_cast(dirPtr); + + uint64_t readonlySize_old = + storageMap->getSize(BRIG_SEGMENT_READONLY); + + StorageElement* se = storageMap->addSymbol(p, this); + + DPRINTF(HSAILObject, "DIRECTIVE_VARIABLE, symbol %s\n", + getString(p->name)); + + if (p->segment == BRIG_SEGMENT_READONLY) { + // readonly memory has initialization data + uint8_t* readonlyData_old = readonlyData; + + readonlyData = + new uint8_t[storageMap->getSize(BRIG_SEGMENT_READONLY)]; + + if (p->init) { + if ((p->type == BRIG_TYPE_ROIMG) || + (p->type == BRIG_TYPE_WOIMG) || + (p->type == BRIG_TYPE_SAMP) || + (p->type == BRIG_TYPE_SIG32) || + (p->type == BRIG_TYPE_SIG64)) { + panic("Read only data type not supported: %s\n", + getString(p->name)); + } + + const BrigOperand *brigOp = getOperand(p->init); + assert(brigOp->kind == + BRIG_KIND_OPERAND_CONSTANT_BYTES); + + const Brig::BrigData *operand_data M5_VAR_USED = + getBrigBaseData(((BrigOperandConstantBytes*) + brigOp)->bytes); + + assert((operand_data->byteCount / 4) > 0); + + uint8_t *symbol_data = + (uint8_t*)getData(((BrigOperandConstantBytes*) + 
brigOp)->bytes + 4); + + // copy the old data and add the new data + if (readonlySize_old > 0) { + memcpy(readonlyData, readonlyData_old, + readonlySize_old); + } + + memcpy(readonlyData + se->offset, symbol_data, + se->size); + + delete[] readonlyData_old; + } + } + } + break; + + case BRIG_KIND_DIRECTIVE_LABEL: + { + const BrigDirectiveLabel M5_VAR_USED *p = + reinterpret_cast(dirPtr); + + panic("Label directives cannot be at the module level: %s\n", + getString(p->name)); + + } + break; + + case BRIG_KIND_DIRECTIVE_COMMENT: + { + const BrigDirectiveComment M5_VAR_USED *p = + reinterpret_cast(dirPtr); + + DPRINTF(HSAILObject, "DIRECTIVE_COMMENT: %s\n", + getString(p->name)); + } + break; + + case BRIG_KIND_DIRECTIVE_LOC: + { + DPRINTF(HSAILObject, "BRIG_DIRECTIVE_LOC\n"); + } + break; + + case BRIG_KIND_DIRECTIVE_MODULE: + { + const BrigDirectiveModule M5_VAR_USED *p = + reinterpret_cast(dirPtr); + + DPRINTF(HSAILObject, "BRIG_DIRECTIVE_MODULE: %s\n", + getString(p->name)); + } + break; + + case BRIG_KIND_DIRECTIVE_CONTROL: + { + DPRINTF(HSAILObject, "DIRECTIVE_CONTROL\n"); + } + break; + + case BRIG_KIND_DIRECTIVE_PRAGMA: + { + DPRINTF(HSAILObject, "DIRECTIVE_PRAGMA\n"); + } + break; + + case BRIG_KIND_DIRECTIVE_EXTENSION: + { + DPRINTF(HSAILObject, "DIRECTIVE_EXTENSION\n"); + } + break; + + case BRIG_KIND_DIRECTIVE_ARG_BLOCK_START: + { + DPRINTF(HSAILObject, "DIRECTIVE_ARG_BLOCK_START\n"); + } + break; + + case BRIG_KIND_DIRECTIVE_ARG_BLOCK_END: + { + DPRINTF(HSAILObject, "DIRECTIVE_ARG_BLOCK_END\n"); + } + break; + default: + if (dirPtr->kind >= BRIG_KIND_INST_BEGIN && + dirPtr->kind <= BRIG_KIND_INST_END) + break; + + if (dirPtr->kind >= BRIG_KIND_OPERAND_BEGIN && + dirPtr->kind <= BRIG_KIND_OPERAND_END) + break; + + warn("Unknown Brig directive kind: %d\n", dirPtr->kind); + break; + } + + dirPtr = nextDirPtr; + } +} + +HsaObject* +BrigObject::tryFile(const std::string &fname, int len, uint8_t *fileData) +{ + const char *brig_ident = "HSA BRIG"; + + if 
(memcmp(brig_ident, fileData, MODULE_IDENTIFICATION_LENGTH)) + return nullptr; + + return new BrigObject(fname, len, fileData); +} + +BrigObject::BrigObject(const std::string &fname, int len, uint8_t *fileData) + : HsaObject(fname), storageMap(new StorageMap()) +{ + const char *brig_ident = "HSA BRIG"; + BrigModuleHeader *mod_hdr = (BrigModuleHeader*)fileData; + + fatal_if(memcmp(brig_ident, mod_hdr, MODULE_IDENTIFICATION_LENGTH), + "%s is not a BRIG file\n", fname); + + if (mod_hdr->brigMajor != BRIG_VERSION_BRIG_MAJOR || + mod_hdr->brigMinor != BRIG_VERSION_BRIG_MINOR) { + fatal("%s: BRIG version mismatch, %d.%d != %d.%d\n", + fname, mod_hdr->brigMajor, mod_hdr->brigMinor, + BRIG_VERSION_BRIG_MAJOR, BRIG_VERSION_BRIG_MINOR); + } + + fatal_if(mod_hdr->sectionCount != NumSectionIndices, "%s: BRIG section " + "count (%d) != expected value (%d)\n", fname, + mod_hdr->sectionCount, NumSectionIndices); + + for (int i = 0; i < NumSectionIndices; ++i) { + sectionInfo[i].ptr = nullptr; + } + + uint64_t *sec_idx_table = (uint64_t*)(fileData + mod_hdr->sectionIndex); + for (int sec_idx = 0; sec_idx < mod_hdr->sectionCount; ++sec_idx) { + uint8_t *sec_hdr_byte_ptr = fileData + sec_idx_table[sec_idx]; + BrigSectionHeader *sec_hdr = (BrigSectionHeader*)sec_hdr_byte_ptr; + + // It doesn't look like cprintf supports string precision values, + // but if this breaks, the right answer is to fix that + DPRINTF(HSAILObject, "found section %.*s\n", sec_hdr->nameLength, + sec_hdr->name); + + sectionInfo[sec_idx].ptr = new uint8_t[sec_hdr->byteCount]; + memcpy(sectionInfo[sec_idx].ptr, sec_hdr_byte_ptr, sec_hdr->byteCount); + sectionInfo[sec_idx].size = sec_hdr->byteCount; + } + + BrigSectionHeader *code_hdr = + (BrigSectionHeader*)sectionInfo[CodeSectionIndex].ptr; + + DPRINTF(HSAILObject, "Code section hdr, count: %d, hdr count: %d, " + "name len: %d\n", code_hdr->byteCount, code_hdr->headerByteCount, + code_hdr->nameLength); + + // start at offset 4 to skip initial null entry (see 
Brig spec) + processDirectives(getCodeSectionEntry(code_hdr->headerByteCount), + getCodeSectionEntry(sectionInfo[CodeSectionIndex].size), + storageMap); + + delete[] fileData; + + DPRINTF(HSALoader, "BRIG object %s loaded.\n", fname); +} + +BrigObject::~BrigObject() +{ + for (int i = 0; i < NumSectionIndices; ++i) + if (sectionInfo[i].ptr) + delete[] sectionInfo[i].ptr; +} diff --git a/src/gpu-compute/brig_object.hh b/src/gpu-compute/brig_object.hh new file mode 100644 index 000000000..59a585914 --- /dev/null +++ b/src/gpu-compute/brig_object.hh @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt, Anthony Gutierrez + */ + +#ifndef __BRIG_OBJECT_HH__ +#define __BRIG_OBJECT_HH__ + +#include +#include +#include +#include + +#include "arch/hsail/Brig.h" +#include "gpu-compute/hsa_object.hh" +#include "gpu-compute/hsail_code.hh" + +class LabelMap; +class StorageMap; + +/* @class BrigObject + * this class implements the BRIG loader object, and + * is used when the simulator directly executes HSAIL. + * this class is responsible for extracting all + * information about kernels contained in BRIG format + * and converts them to HsailCode objects that are + * usable by the simulator and emulated runtime. 
+ */ + +class BrigObject final : public HsaObject +{ + public: + enum SectionIndex + { + DataSectionIndex, + CodeSectionIndex, + OperandsSectionIndex, + NumSectionIndices + }; + + static const char *sectionNames[]; + + struct SectionInfo + { + uint8_t *ptr; + int size; + }; + + static HsaObject* tryFile(const std::string &fname, int len, + uint8_t *fileData); + + SectionInfo sectionInfo[NumSectionIndices]; + const uint8_t *getSectionOffset(enum SectionIndex sec, int offs) const; + + std::vector kernels; + std::vector functions; + std::string kern_block_name; + + void processDirectives(const Brig::BrigBase *dirPtr, + const Brig::BrigBase *endPtr, + StorageMap *storageMap); + + BrigObject(const std::string &fname, int len, uint8_t *fileData); + ~BrigObject(); + + // eventually these will need to be per-kernel not per-object-file + StorageMap *storageMap; + LabelMap *labelMap; + + const char* getString(int offs) const; + const Brig::BrigData* getBrigBaseData(int offs) const; + const uint8_t* getData(int offs) const; + const Brig::BrigBase* getCodeSectionEntry(int offs) const; + const Brig::BrigOperand* getOperand(int offs) const; + unsigned getOperandPtr(int offs, int index) const; + const Brig::BrigInstBase* getInst(int offs) const; + + HsaCode* getKernel(const std::string &name) const override; + HsaCode* getFunction(const std::string &name) const override; + + int numKernels() const override { return kernels.size(); } + + HsaCode* getKernel(int i) const override { return kernels[i]; } + + // pointer to the current kernel/function we're processing, so elements + // under construction can reference it. kinda ugly, but easier + // than passing it all over for the few places it's needed. + mutable HsailCode *currentCode; +}; + +// Utility function to bump Brig item pointer to next element given +// item size in bytes. Really just an add but with lots of casting. 
+template +T* +brigNext(T *ptr) +{ + Brig::BrigBase *base_ptr = (Brig::BrigBase*)ptr; + int size = base_ptr->byteCount; + assert(size); + + return (T*)((uint8_t*)ptr + size); +} + +#endif // __BRIG_OBJECT_HH__ diff --git a/src/gpu-compute/cl_driver.cc b/src/gpu-compute/cl_driver.cc new file mode 100644 index 000000000..3b3291c03 --- /dev/null +++ b/src/gpu-compute/cl_driver.cc @@ -0,0 +1,272 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#include "gpu-compute/cl_driver.hh" + +#include "base/intmath.hh" +#include "cpu/thread_context.hh" +#include "gpu-compute/dispatcher.hh" +#include "gpu-compute/hsa_code.hh" +#include "gpu-compute/hsa_kernel_info.hh" +#include "gpu-compute/hsa_object.hh" +#include "params/ClDriver.hh" +#include "sim/process.hh" +#include "sim/syscall_emul_buf.hh" + +ClDriver::ClDriver(ClDriverParams *p) + : EmulatedDriver(p), hsaCode(0) +{ + for (const auto &codeFile : p->codefile) + codeFiles.push_back(&codeFile); + + maxFuncArgsSize = 0; + + for (int i = 0; i < codeFiles.size(); ++i) { + HsaObject *obj = HsaObject::createHsaObject(*codeFiles[i]); + + for (int k = 0; k < obj->numKernels(); ++k) { + assert(obj->getKernel(k)); + kernels.push_back(obj->getKernel(k)); + kernels.back()->setReadonlyData((uint8_t*)obj->readonlyData); + int kern_funcargs_size = kernels.back()->funcarg_size; + maxFuncArgsSize = maxFuncArgsSize < kern_funcargs_size ? 
+ kern_funcargs_size : maxFuncArgsSize; + } + } + + int name_offs = 0; + int code_offs = 0; + + for (int i = 0; i < kernels.size(); ++i) { + kernelInfo.push_back(HsaKernelInfo()); + HsaCode *k = kernels[i]; + + k->generateHsaKernelInfo(&kernelInfo[i]); + + kernelInfo[i].name_offs = name_offs; + kernelInfo[i].code_offs = code_offs; + + name_offs += k->name().size() + 1; + code_offs += k->numInsts() * sizeof(GPUStaticInst*); + } +} + +void +ClDriver::handshake(GpuDispatcher *_dispatcher) +{ + dispatcher = _dispatcher; + dispatcher->setFuncargsSize(maxFuncArgsSize); +} + +int +ClDriver::open(LiveProcess *p, ThreadContext *tc, int mode, int flags) +{ + int fd = p->allocFD(-1, filename, 0, 0, false); + FDEntry *fde = p->getFDEntry(fd); + fde->driver = this; + + return fd; +} + +int +ClDriver::ioctl(LiveProcess *process, ThreadContext *tc, unsigned req) +{ + int index = 2; + Addr buf_addr = process->getSyscallArg(tc, index); + + switch (req) { + case HSA_GET_SIZES: + { + TypedBufferArg sizes(buf_addr); + sizes->num_kernels = kernels.size(); + sizes->string_table_size = 0; + sizes->code_size = 0; + sizes->readonly_size = 0; + + if (kernels.size() > 0) { + // all kernels will share the same read-only memory + sizes->readonly_size = + kernels[0]->getSize(HsaCode::MemorySegment::READONLY); + // check our assumption + for (int i = 1; ireadonly_size == + kernels[i]->getSize(HsaCode::MemorySegment::READONLY)); + } + } + + for (int i = 0; i < kernels.size(); ++i) { + HsaCode *k = kernels[i]; + // add one for terminating '\0' + sizes->string_table_size += k->name().size() + 1; + sizes->code_size += k->numInsts() * sizeof(GPUStaticInst*); + } + + sizes.copyOut(tc->getMemProxy()); + } + break; + + case HSA_GET_KINFO: + { + TypedBufferArg + kinfo(buf_addr, sizeof(HsaKernelInfo) * kernels.size()); + + for (int i = 0; i < kernels.size(); ++i) { + HsaKernelInfo *ki = &kinfo[i]; + ki->name_offs = kernelInfo[i].name_offs; + ki->code_offs = kernelInfo[i].code_offs; + ki->sRegCount = 
kernelInfo[i].sRegCount; + ki->dRegCount = kernelInfo[i].dRegCount; + ki->cRegCount = kernelInfo[i].cRegCount; + ki->static_lds_size = kernelInfo[i].static_lds_size; + ki->private_mem_size = kernelInfo[i].private_mem_size; + ki->spill_mem_size = kernelInfo[i].spill_mem_size; + } + + kinfo.copyOut(tc->getMemProxy()); + } + break; + + case HSA_GET_STRINGS: + { + int string_table_size = 0; + for (int i = 0; i < kernels.size(); ++i) { + HsaCode *k = kernels[i]; + string_table_size += k->name().size() + 1; + } + + BufferArg buf(buf_addr, string_table_size); + char *bufp = (char*)buf.bufferPtr(); + + for (int i = 0; i < kernels.size(); ++i) { + HsaCode *k = kernels[i]; + const char *n = k->name().c_str(); + + // idiomatic string copy + while ((*bufp++ = *n++)); + } + + assert(bufp - (char *)buf.bufferPtr() == string_table_size); + + buf.copyOut(tc->getMemProxy()); + } + break; + + case HSA_GET_READONLY_DATA: + { + // we can pick any kernel --- they share the same + // readonly segment (this assumption is checked in GET_SIZES) + uint64_t size = + kernels.back()->getSize(HsaCode::MemorySegment::READONLY); + BufferArg data(buf_addr, size); + char *datap = (char *)data.bufferPtr(); + memcpy(datap, + kernels.back()->readonly_data, + size); + data.copyOut(tc->getMemProxy()); + } + break; + + case HSA_GET_CODE: + { + // set hsaCode pointer + hsaCode = buf_addr; + int code_size = 0; + + for (int i = 0; i < kernels.size(); ++i) { + HsaCode *k = kernels[i]; + code_size += k->numInsts() * sizeof(TheGpuISA::RawMachInst); + } + + TypedBufferArg buf(buf_addr, code_size); + TheGpuISA::RawMachInst *bufp = buf; + + int buf_idx = 0; + + for (int i = 0; i < kernels.size(); ++i) { + HsaCode *k = kernels[i]; + + for (int j = 0; j < k->numInsts(); ++j) { + bufp[buf_idx] = k->insts()->at(j); + ++buf_idx; + } + } + + buf.copyOut(tc->getMemProxy()); + } + break; + + case HSA_GET_CU_CNT: + { + BufferArg buf(buf_addr, sizeof(uint32_t)); + *((uint32_t*)buf.bufferPtr()) = dispatcher->getNumCUs(); + 
buf.copyOut(tc->getMemProxy()); + } + break; + + case HSA_GET_VSZ: + { + BufferArg buf(buf_addr, sizeof(uint32_t)); + *((uint32_t*)buf.bufferPtr()) = VSZ; + buf.copyOut(tc->getMemProxy()); + } + break; + + default: + fatal("ClDriver: bad ioctl %d\n", req); + } + + return 0; +} + +const char* +ClDriver::codeOffToKernelName(uint64_t code_ptr) +{ + assert(hsaCode); + uint32_t code_offs = code_ptr - hsaCode; + + for (int i = 0; i < kernels.size(); ++i) { + if (code_offs == kernelInfo[i].code_offs) { + return kernels[i]->name().c_str(); + } + } + + return nullptr; +} + +ClDriver* +ClDriverParams::create() +{ + return new ClDriver(this); +} diff --git a/src/gpu-compute/cl_driver.hh b/src/gpu-compute/cl_driver.hh new file mode 100644 index 000000000..03567bab5 --- /dev/null +++ b/src/gpu-compute/cl_driver.hh @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#ifndef __CL_DRIVER_HH__ +#define __CL_DRIVER_HH__ + +#include + +#include "gpu-compute/hsa_kernel_info.hh" +#include "sim/emul_driver.hh" + +class GpuDispatcher; +class HsaCode; +class LiveProcess; +class ThreadContext; + +struct ClDriverParams; + +class ClDriver final : public EmulatedDriver +{ + public: + ClDriver(ClDriverParams *p); + void handshake(GpuDispatcher *_dispatcher); + int open(LiveProcess *p, ThreadContext *tc, int mode, int flags); + int ioctl(LiveProcess *p, ThreadContext *tc, unsigned req); + const char* codeOffToKernelName(uint64_t code_ptr); + + private: + GpuDispatcher *dispatcher; + + std::vector codeFiles; + + // All the kernels we know about + std::vector kernels; + std::vector functions; + + std::vector kernelInfo; + + // maximum size necessary for function arguments + int maxFuncArgsSize; + // The host virtual address for the kernel code + uint64_t hsaCode; +}; + +#endif // __CL_DRIVER_HH__ diff --git a/src/gpu-compute/cl_event.hh b/src/gpu-compute/cl_event.hh new file mode 100644 index 000000000..75297a2d2 --- /dev/null +++ b/src/gpu-compute/cl_event.hh @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Authors: Marc Orr + */ + +#ifndef __GPU_CL_EVENT_HH__ +#define __GPU_CL_EVENT_HH__ + +struct HsaQueueEntry; + +class _cl_event { + public: + _cl_event() : done(false), hsaTaskPtr(nullptr), start(0), end(0) { } + + volatile bool done; + HsaQueueEntry *hsaTaskPtr; + uint64_t start; + uint64_t end; +}; + +#endif // __GPU_CL_EVENT_HH__ diff --git a/src/gpu-compute/code_enums.hh b/src/gpu-compute/code_enums.hh new file mode 100644 index 000000000..126cf6c50 --- /dev/null +++ b/src/gpu-compute/code_enums.hh @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#ifndef __CODE_ENUMS_HH__ +#define __CODE_ENUMS_HH__ + +#define IS_OT_GLOBAL(a) ((a)>=Enums::OT_GLOBAL_READ \ + && (a)<=Enums::OT_GLOBAL_LDAS) +#define IS_OT_SHARED(a) ((a)>=Enums::OT_SHARED_READ \ + && (a)<=Enums::OT_SHARED_LDAS) +#define IS_OT_PRIVATE(a) ((a)>=Enums::OT_PRIVATE_READ \ + && (a)<=Enums::OT_PRIVATE_LDAS) +#define IS_OT_SPILL(a) ((a)>=Enums::OT_SPILL_READ \ + && (a)<=Enums::OT_SPILL_LDAS) +#define IS_OT_READONLY(a) ((a)>=Enums::OT_READONLY_READ \ + && (a)<=Enums::OT_READONLY_LDAS) +#define IS_OT_FLAT(a) ((a)>=Enums::OT_FLAT_READ && (a)<=Enums::OT_FLAT_LDAS) + +#define IS_OT_LDAS(a) ((a)==Enums::OT_GLOBAL_LDAS||(a)==Enums::OT_SHARED_LDAS \ + ||(a)==Enums::OT_PRIVATE_LDAS||(a)==Enums::OT_SPILL_LDAS \ + ||(a)==Enums::OT_READONLY_LDAS||(a)==Enums::OT_FLAT_LDAS) + +#define IS_OT_READ(a) ((a)==Enums::OT_GLOBAL_READ||(a)==Enums::OT_SHARED_READ \ + ||(a)==Enums::OT_PRIVATE_READ||(a)==Enums::OT_SPILL_READ \ + ||(a)==Enums::OT_READONLY_READ||(a)==Enums::OT_FLAT_READ) + +#define IS_OT_READ_GM(a) \ + ((a)==Enums::OT_GLOBAL_READ||(a)==Enums::OT_SPILL_READ \ + ||(a)==Enums::OT_READONLY_READ) + +#define IS_OT_READ_LM(a) ((a)==Enums::OT_SHARED_READ) + +#define IS_OT_READ_RM(a) ((a)==Enums::OT_READONLY_READ) + +#define IS_OT_READ_PM(a) ((a)==Enums::OT_PRIVATE_READ) + +#define IS_OT_WRITE(a) \ + ((a)==Enums::OT_GLOBAL_WRITE||(a)==Enums::OT_SHARED_WRITE \ + 
||(a)==Enums::OT_PRIVATE_WRITE||(a)==Enums::OT_SPILL_WRITE \ + ||(a)==Enums::OT_READONLY_WRITE||(a)==Enums::OT_FLAT_WRITE) + +#define IS_OT_WRITE_GM(a) \ + ((a)==Enums::OT_GLOBAL_WRITE||(a)==Enums::OT_SPILL_WRITE \ + ||(a)==Enums::OT_READONLY_WRITE) + +#define IS_OT_WRITE_LM(a) ((a)==Enums::OT_SHARED_WRITE) + +#define IS_OT_WRITE_PM(a) ((a)==Enums::OT_PRIVATE_WRITE) + +#define IS_OT_ATOMIC(a) ((a)==Enums::OT_GLOBAL_ATOMIC \ + ||(a)==Enums::OT_SHARED_ATOMIC \ + ||(a)==Enums::OT_PRIVATE_ATOMIC \ + ||(a)==Enums::OT_SPILL_ATOMIC \ + ||(a)==Enums::OT_READONLY_ATOMIC \ + ||(a)==Enums::OT_FLAT_ATOMIC) + +#define IS_OT_ATOMIC_GM(a) ((a)==Enums::OT_GLOBAL_ATOMIC \ + ||(a)==Enums::OT_SPILL_ATOMIC \ + ||(a)==Enums::OT_READONLY_ATOMIC \ + ||(a)==Enums::OT_GLOBAL_MEMFENCE \ + ||(a)==Enums::OT_BOTH_MEMFENCE) + +#define IS_OT_ATOMIC_LM(a) ((a)==Enums::OT_SHARED_ATOMIC \ + ||(a)==Enums::OT_SHARED_MEMFENCE \ + ||(a)==Enums::OT_BOTH_MEMFENCE) + +#define IS_OT_ATOMIC_PM(a) ((a)==Enums::OT_PRIVATE_ATOMIC) + +#define IS_OT_HIST(a) ((a)==Enums::OT_GLOBAL_HIST \ + ||(a)==Enums::OT_SHARED_HIST \ + ||(a)==Enums::OT_PRIVATE_HIST \ + ||(a)==Enums::OT_SPILL_HIST \ + ||(a)==Enums::OT_READONLY_HIST \ + ||(a)==Enums::OT_FLAT_HIST) + +#define IS_OT_HIST_GM(a) ((a)==Enums::OT_GLOBAL_HIST \ + ||(a)==Enums::OT_SPILL_HIST \ + ||(a)==Enums::OT_READONLY_HIST) + +#define IS_OT_HIST_LM(a) ((a)==Enums::OT_SHARED_HIST) + +#define IS_OT_HIST_PM(a) ((a)==Enums::OT_PRIVATE_HIST) + +#endif // __CODE_ENUMS_HH__ diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc new file mode 100644 index 000000000..d3622007a --- /dev/null +++ b/src/gpu-compute/compute_unit.cc @@ -0,0 +1,1817 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: John Kalamatianos, Anthony Gutierrez + */ + +#include "gpu-compute/compute_unit.hh" + +#include "base/output.hh" +#include "debug/GPUDisp.hh" +#include "debug/GPUExec.hh" +#include "debug/GPUFetch.hh" +#include "debug/GPUMem.hh" +#include "debug/GPUPort.hh" +#include "debug/GPUPrefetch.hh" +#include "debug/GPUSync.hh" +#include "debug/GPUTLB.hh" +#include "gpu-compute/dispatcher.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/gpu_static_inst.hh" +#include "gpu-compute/ndrange.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/simple_pool_manager.hh" +#include "gpu-compute/vector_register_file.hh" +#include "gpu-compute/wavefront.hh" +#include "mem/page_table.hh" +#include "sim/process.hh" + +ComputeUnit::ComputeUnit(const Params *p) : MemObject(p), fetchStage(p), + scoreboardCheckStage(p), scheduleStage(p), execStage(p), + globalMemoryPipe(p), localMemoryPipe(p), rrNextMemID(0), rrNextALUWp(0), + cu_id(p->cu_id), vrf(p->vector_register_file), numSIMDs(p->num_SIMDs), + spBypassPipeLength(p->spbypass_pipe_length), + dpBypassPipeLength(p->dpbypass_pipe_length), + issuePeriod(p->issue_period), + numGlbMemUnits(p->num_global_mem_pipes), + numLocMemUnits(p->num_shared_mem_pipes), + perLaneTLB(p->perLaneTLB), prefetchDepth(p->prefetch_depth), + prefetchStride(p->prefetch_stride), prefetchType(p->prefetch_prev_type), + xact_cas_mode(p->xactCasMode), debugSegFault(p->debugSegFault), + functionalTLB(p->functionalTLB), localMemBarrier(p->localMemBarrier), + countPages(p->countPages), barrier_id(0), + vrfToCoalescerBusWidth(p->vrf_to_coalescer_bus_width), + coalescerToVrfBusWidth(p->coalescer_to_vrf_bus_width), + req_tick_latency(p->mem_req_latency * p->clk_domain->clockPeriod()), + resp_tick_latency(p->mem_resp_latency * p->clk_domain->clockPeriod()), + _masterId(p->system->getMasterId(name() + ".ComputeUnit")), + lds(*p->localDataStore), globalSeqNum(0), wavefrontSize(p->wfSize) +{ + // this check will be eliminated once we have 
wavefront size support added + fatal_if(p->wfSize != VSZ, "Wavefront size parameter does not match VSZ"); + // calculate how many cycles a vector load or store will need to transfer + // its data over the corresponding buses + numCyclesPerStoreTransfer = (uint32_t)ceil((double)(VSZ * sizeof(uint32_t)) + / (double)vrfToCoalescerBusWidth); + + numCyclesPerLoadTransfer = (VSZ * sizeof(uint32_t)) + / coalescerToVrfBusWidth; + + lastVaddrWF.resize(numSIMDs); + wfList.resize(numSIMDs); + + for (int j = 0; j < numSIMDs; ++j) { + lastVaddrWF[j].resize(p->n_wf); + + for (int i = 0; i < p->n_wf; ++i) { + lastVaddrWF[j][i].resize(VSZ); + + wfList[j].push_back(p->wavefronts[j * p->n_wf + i]); + wfList[j][i]->setParent(this); + + for (int k = 0; k < VSZ; ++k) { + lastVaddrWF[j][i][k] = 0; + } + } + } + + lastVaddrPhase.resize(numSIMDs); + + for (int i = 0; i < numSIMDs; ++i) { + lastVaddrPhase[i] = LastVaddrWave(); + } + + lastVaddrCU = LastVaddrWave(); + + lds.setParent(this); + + if (p->execPolicy == "OLDEST-FIRST") { + exec_policy = EXEC_POLICY::OLDEST; + } else if (p->execPolicy == "ROUND-ROBIN") { + exec_policy = EXEC_POLICY::RR; + } else { + fatal("Invalid WF execution policy (CU)\n"); + } + + memPort.resize(VSZ); + + // resize the tlbPort vectorArray + int tlbPort_width = perLaneTLB ? 
VSZ : 1; + tlbPort.resize(tlbPort_width); + + cuExitCallback = new CUExitCallback(this); + registerExitCallback(cuExitCallback); + + xactCasLoadMap.clear(); + lastExecCycle.resize(numSIMDs, 0); + + for (int i = 0; i < vrf.size(); ++i) { + vrf[i]->setParent(this); + } + + numVecRegsPerSimd = vrf[0]->numRegs(); +} + +ComputeUnit::~ComputeUnit() +{ + // Delete wavefront slots + + for (int j = 0; j < numSIMDs; ++j) + for (int i = 0; i < shader->n_wf; ++i) { + delete wfList[j][i]; + } + + readyList.clear(); + waveStatusList.clear(); + dispatchList.clear(); + vectorAluInstAvail.clear(); + delete cuExitCallback; + delete ldsPort; +} + +void +ComputeUnit::FillKernelState(Wavefront *w, NDRange *ndr) +{ + w->resizeRegFiles(ndr->q.cRegCount, ndr->q.sRegCount, ndr->q.dRegCount); + + w->workgroupsz[0] = ndr->q.wgSize[0]; + w->workgroupsz[1] = ndr->q.wgSize[1]; + w->workgroupsz[2] = ndr->q.wgSize[2]; + w->wg_sz = w->workgroupsz[0] * w->workgroupsz[1] * w->workgroupsz[2]; + w->gridsz[0] = ndr->q.gdSize[0]; + w->gridsz[1] = ndr->q.gdSize[1]; + w->gridsz[2] = ndr->q.gdSize[2]; + w->kernelArgs = ndr->q.args; + w->privSizePerItem = ndr->q.privMemPerItem; + w->spillSizePerItem = ndr->q.spillMemPerItem; + w->roBase = ndr->q.roMemStart; + w->roSize = ndr->q.roMemTotal; +} + +void +ComputeUnit::InitializeWFContext(WFContext *wfCtx, NDRange *ndr, int cnt, + int trueWgSize[], int trueWgSizeTotal, + LdsChunk *ldsChunk, uint64_t origSpillMemStart) +{ + wfCtx->cnt = cnt; + + VectorMask init_mask; + init_mask.reset(); + + for (int k = 0; k < VSZ; ++k) { + if (k + cnt * VSZ < trueWgSizeTotal) + init_mask[k] = 1; + } + + wfCtx->init_mask = init_mask.to_ullong(); + wfCtx->exec_mask = init_mask.to_ullong(); + + for (int i = 0; i < VSZ; ++i) { + wfCtx->bar_cnt[i] = 0; + } + + wfCtx->max_bar_cnt = 0; + wfCtx->old_barrier_cnt = 0; + wfCtx->barrier_cnt = 0; + + wfCtx->privBase = ndr->q.privMemStart; + ndr->q.privMemStart += ndr->q.privMemPerItem * VSZ; + + wfCtx->spillBase = ndr->q.spillMemStart; + 
ndr->q.spillMemStart += ndr->q.spillMemPerItem * VSZ; + + wfCtx->pc = 0; + wfCtx->rpc = UINT32_MAX; + + // set the wavefront context to have a pointer to this section of the LDS + wfCtx->ldsChunk = ldsChunk; + + // WG state + wfCtx->wg_id = ndr->globalWgId; + wfCtx->barrier_id = barrier_id; + + // Kernel wide state + wfCtx->ndr = ndr; +} + +void +ComputeUnit::updateEvents() { + + if (!timestampVec.empty()) { + uint32_t vecSize = timestampVec.size(); + uint32_t i = 0; + while (i < vecSize) { + if (timestampVec[i] <= shader->tick_cnt) { + std::pair regInfo = regIdxVec[i]; + vrf[regInfo.first]->markReg(regInfo.second, sizeof(uint32_t), + statusVec[i]); + timestampVec.erase(timestampVec.begin() + i); + regIdxVec.erase(regIdxVec.begin() + i); + statusVec.erase(statusVec.begin() + i); + --vecSize; + --i; + } + ++i; + } + } + + for (int i = 0; i< numSIMDs; ++i) { + vrf[i]->updateEvents(); + } +} + + +void +ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[], + int trueWgSizeTotal) +{ + static int _n_wave = 0; + int cnt = wfCtx->cnt; + NDRange *ndr = wfCtx->ndr; + + // Fill in Kernel state + FillKernelState(w, ndr); + + w->kern_id = ndr->dispatchId; + w->dynwaveid = cnt; + w->init_mask = wfCtx->init_mask; + + for (int k = 0; k < VSZ; ++k) { + w->workitemid[0][k] = (k+cnt*VSZ) % trueWgSize[0]; + w->workitemid[1][k] = ((k + cnt * VSZ) / trueWgSize[0]) % trueWgSize[1]; + w->workitemid[2][k] = (k + cnt * VSZ) / (trueWgSize[0] * trueWgSize[1]); + + w->workitemFlatId[k] = w->workitemid[2][k] * trueWgSize[0] * + trueWgSize[1] + w->workitemid[1][k] * trueWgSize[0] + + w->workitemid[0][k]; + } + + w->old_barrier_cnt = wfCtx->old_barrier_cnt; + w->barrier_cnt = wfCtx->barrier_cnt; + w->barrier_slots = divCeil(trueWgSizeTotal, VSZ); + + for (int i = 0; i < VSZ; ++i) { + w->bar_cnt[i] = wfCtx->bar_cnt[i]; + } + + w->max_bar_cnt = wfCtx->max_bar_cnt; + w->privBase = wfCtx->privBase; + w->spillBase = wfCtx->spillBase; + + w->pushToReconvergenceStack(wfCtx->pc, 
wfCtx->rpc, wfCtx->exec_mask);
+
+    // WG state
+    w->wg_id = wfCtx->wg_id;
+    w->dispatchid = wfCtx->ndr->dispatchId;
+    w->workgroupid[0] = w->wg_id % ndr->numWg[0];
+    w->workgroupid[1] = (w->wg_id / ndr->numWg[0]) % ndr->numWg[1];
+    w->workgroupid[2] = w->wg_id / (ndr->numWg[0] * ndr->numWg[1]);
+
+    w->barrier_id = wfCtx->barrier_id;
+    w->stalledAtBarrier = false;
+
+    // move this from the context into the actual wavefront
+    w->ldsChunk = wfCtx->ldsChunk;
+
+    int32_t refCount M5_VAR_USED =
+        lds.increaseRefCounter(w->dispatchid, w->wg_id);
+    DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n",
+            cu_id, w->wg_id, refCount);
+
+    w->instructionBuffer.clear();
+
+    if (w->pendingFetch)
+        w->dropFetch = true;
+
+    // is this the last wavefront in the workgroup
+    // if set the spillWidth to be the remaining work-items
+    // so that the vector access is correct
+    if ((cnt + 1) * VSZ >= trueWgSizeTotal) {
+        w->spillWidth = trueWgSizeTotal - (cnt * VSZ);
+    } else {
+        w->spillWidth = VSZ;
+    }
+
+    DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
+            "WF[%d][%d]\n", _n_wave, barrier_id, cu_id, w->simdId, w->wfSlotId);
+
+    w->start(++_n_wave, ndr->q.code_ptr);
+}
+
+void
+ComputeUnit::StartWorkgroup(NDRange *ndr)
+{
+    // reserve the LDS capacity allocated to the work group
+    // disambiguated by the dispatch ID and workgroup ID, which should be
+    // globally unique
+    LdsChunk *ldsChunk = lds.reserveSpace(ndr->dispatchId, ndr->globalWgId,
+                                          ndr->q.ldsSize);
+
+    // Send L1 cache acquire
+    // isKernel + isAcquire = Kernel Begin
+    if (shader->impl_kern_boundary_sync) {
+        // NOTE(review): the angle-bracket template argument was stripped in
+        // patch transit; GPUDynInstPtr is the shared-pointer alias this
+        // make_shared call must produce -- confirm against gpu_dyn_inst.hh.
+        GPUDynInstPtr gpuDynInst = std::make_shared<GPUDynInst>(nullptr,
+                                                                nullptr,
+                                                                nullptr, 0);
+
+        gpuDynInst->useContinuation = false;
+        gpuDynInst->memoryOrder = Enums::MEMORY_ORDER_SC_ACQUIRE;
+        gpuDynInst->scope = Enums::MEMORY_SCOPE_SYSTEM;
+        injectGlobalMemFence(gpuDynInst, true);
+    }
+
+    // Get true size of workgroup (after clamping to grid size)
+    int trueWgSize[3];
+    int trueWgSizeTotal = 1;
+
+    for (int d = 0; d < 3; ++d) {
+        trueWgSize[d] = std::min(ndr->q.wgSize[d], ndr->q.gdSize[d] -
+                                 ndr->wgId[d] * ndr->q.wgSize[d]);
+
+        trueWgSizeTotal *= trueWgSize[d];
+    }
+
+    uint64_t origSpillMemStart = ndr->q.spillMemStart;
+    // calculate the number of 32-bit vector registers required by wavefront
+    int vregDemand = ndr->q.sRegCount + (2 * ndr->q.dRegCount);
+    int cnt = 0;
+
+    // Assign WFs by spreading them across SIMDs, 1 WF per SIMD at a time
+    for (int m = 0; m < shader->n_wf * numSIMDs; ++m) {
+        Wavefront *w = wfList[m % numSIMDs][m / numSIMDs];
+        // Check if this wavefront slot is available:
+        // It must be stopped and not waiting
+        // for a release to complete S_RETURNING
+        if (w->status == Wavefront::S_STOPPED) {
+            // if we have scheduled all work items then stop
+            // scheduling wavefronts
+            if (cnt * VSZ >= trueWgSizeTotal)
+                break;
+
+            // reserve vector registers for the scheduled wavefront
+            assert(vectorRegsReserved[m % numSIMDs] <= numVecRegsPerSimd);
+            uint32_t normSize = 0;
+
+            w->startVgprIndex = vrf[m % numSIMDs]->manager->
+                allocateRegion(vregDemand, &normSize);
+
+            w->reservedVectorRegs = normSize;
+            vectorRegsReserved[m % numSIMDs] += w->reservedVectorRegs;
+
+            WFContext wfCtx;
+
+            InitializeWFContext(&wfCtx, ndr, cnt, trueWgSize, trueWgSizeTotal,
+                                ldsChunk, origSpillMemStart);
+
+            StartWF(w, &wfCtx, trueWgSize, trueWgSizeTotal);
+            ++cnt;
+        }
+    }
+    ++barrier_id;
+}
+
+int
+ComputeUnit::ReadyWorkgroup(NDRange *ndr)
+{
+    // Get true size of workgroup (after clamping to grid size)
+    int trueWgSize[3];
+    int trueWgSizeTotal = 1;
+
+    for (int d = 0; d < 3; ++d) {
+        trueWgSize[d] = std::min(ndr->q.wgSize[d], ndr->q.gdSize[d] -
+                                 ndr->wgId[d] * ndr->q.wgSize[d]);
+
+        trueWgSizeTotal *= trueWgSize[d];
+        DPRINTF(GPUDisp, "trueWgSize[%d] = %d\n", d, trueWgSize[d]);
+    }
+
+    DPRINTF(GPUDisp, "trueWgSizeTotal = %d\n", trueWgSizeTotal);
+
+    // calculate the number of 32-bit vector registers required by each
+    // work item of the work group
+    int vregDemandPerWI =
ndr->q.sRegCount + (2 * ndr->q.dRegCount);
+    bool vregAvail = true;
+    int numWfs = (trueWgSizeTotal + VSZ - 1) / VSZ;
+    int freeWfSlots = 0;
+    // check if the total number of VGPRs required by all WFs of the WG
+    // fit in the VRFs of all SIMD units
+    assert((numWfs * vregDemandPerWI) <= (numSIMDs * numVecRegsPerSimd));
+    int numMappedWfs = 0;
+    // NOTE(review): the element type was stripped in patch transit; int is
+    // established by the resize(numSIMDs, 0) below and integer increments.
+    std::vector<int> numWfsPerSimd;
+    numWfsPerSimd.resize(numSIMDs, 0);
+    // find how many free WF slots we have across all SIMDs
+    for (int j = 0; j < shader->n_wf; ++j) {
+        for (int i = 0; i < numSIMDs; ++i) {
+            if (wfList[i][j]->status == Wavefront::S_STOPPED) {
+                // count the number of free WF slots
+                ++freeWfSlots;
+                if (numMappedWfs < numWfs) {
+                    // count the WFs to be assigned per SIMD
+                    numWfsPerSimd[i]++;
+                }
+                numMappedWfs++;
+            }
+        }
+    }
+
+    // if there are enough free WF slots then find if there are enough
+    // free VGPRs per SIMD based on the WF->SIMD mapping
+    if (freeWfSlots >= numWfs) {
+        for (int j = 0; j < numSIMDs; ++j) {
+            // find if there are enough free VGPR regions in the SIMD's VRF
+            // to accommodate the WFs of the new WG that would be mapped to
+            // this SIMD unit
+            vregAvail = vrf[j]->manager->canAllocate(numWfsPerSimd[j],
+                                                     vregDemandPerWI);
+
+            // stop searching if there is at least one SIMD
+            // whose VRF does not have enough free VGPR pools.
+            // This is because a WG is scheduled only if ALL
+            // of its WFs can be scheduled
+            if (!vregAvail)
+                break;
+        }
+    }
+
+    DPRINTF(GPUDisp, "Free WF slots = %d, VGPR Availability = %d\n",
+            freeWfSlots, vregAvail);
+
+    if (!vregAvail) {
+        ++numTimesWgBlockedDueVgprAlloc;
+    }
+
+    // Return true if enough WF slots to submit workgroup and if there are
+    // enough VGPRs to schedule all WFs to their SIMD units
+    if (!lds.canReserve(ndr->q.ldsSize)) {
+        wgBlockedDueLdsAllocation++;
+    }
+
+    // Return true if (a) there are enough free WF slots to submit
+    // workgrounp and (b) if there are enough VGPRs to schedule all WFs to their
+    // SIMD units and (c) if there is enough space in LDS
+    return freeWfSlots >= numWfs && vregAvail && lds.canReserve(ndr->q.ldsSize);
+}
+
+int
+ComputeUnit::AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots)
+{
+    DPRINTF(GPUSync, "CU%d: Checking for All At Barrier\n", cu_id);
+    int ccnt = 0;
+
+    for (int i_simd = 0; i_simd < numSIMDs; ++i_simd) {
+        for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf) {
+            Wavefront *w = wfList[i_simd][i_wf];
+
+            if (w->status == Wavefront::S_RUNNING) {
+                DPRINTF(GPUSync, "Checking WF[%d][%d]\n", i_simd, i_wf);
+
+                DPRINTF(GPUSync, "wf->barrier_id = %d, _barrier_id = %d\n",
+                        w->barrier_id, _barrier_id);
+
+                DPRINTF(GPUSync, "wf->barrier_cnt %d, bcnt = %d\n",
+                        w->barrier_cnt, bcnt);
+            }
+
+            if (w->status == Wavefront::S_RUNNING &&
+                w->barrier_id == _barrier_id && w->barrier_cnt == bcnt &&
+                !w->outstanding_reqs) {
+                ++ccnt;
+
+                DPRINTF(GPUSync, "WF[%d][%d] at barrier, increment ccnt to "
+                        "%d\n", i_simd, i_wf, ccnt);
+            }
+        }
+    }
+
+    DPRINTF(GPUSync, "CU%d: returning allAtBarrier ccnt = %d, bslots = %d\n",
+            cu_id, ccnt, bslots);
+
+    return ccnt == bslots;
+}
+
+// Check if the current wavefront is blocked on additional resources.
+bool
+ComputeUnit::cedeSIMD(int simdId, int wfSlotId)
+{
+    bool cede = false;
+
+    // If --xact-cas-mode option is enabled in run.py, then xact_cas_ld
+    // magic instructions will impact the scheduling of wavefronts
+    if (xact_cas_mode) {
+        /*
+         * When a wavefront calls xact_cas_ld, it adds itself to a per address
+         * queue. All per address queues are managed by the xactCasLoadMap.
+         *
+         * A wavefront is not blocked if: it is not in ANY per address queue or
+         * if it is at the head of a per address queue.
+         */
+        for (auto itMap : xactCasLoadMap) {
+            // NOTE(review): the element type was stripped in patch transit;
+            // waveIdentifier is established by the per-element copies below.
+            std::list<waveIdentifier> curWaveIDQueue = itMap.second.waveIDQueue;
+
+            if (!curWaveIDQueue.empty()) {
+                for (auto it : curWaveIDQueue) {
+                    waveIdentifier cur_wave = it;
+
+                    if (cur_wave.simdId == simdId &&
+                        cur_wave.wfSlotId == wfSlotId) {
+                        // 2 possibilities
+                        // 1: this WF has a green light
+                        // 2: another WF has a green light
+                        waveIdentifier owner_wave = curWaveIDQueue.front();
+
+                        if (owner_wave.simdId != cur_wave.simdId ||
+                            owner_wave.wfSlotId != cur_wave.wfSlotId) {
+                            // possibility 2
+                            cede = true;
+                            break;
+                        } else {
+                            // possibility 1
+                            break;
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    return cede;
+}
+
+// Execute one clock worth of work on the ComputeUnit.
+void
+ComputeUnit::exec()
+{
+    updateEvents();
+    // Execute pipeline stages in reverse order to simulate
+    // the pipeline latency
+    globalMemoryPipe.exec();
+    localMemoryPipe.exec();
+    execStage.exec();
+    scheduleStage.exec();
+    scoreboardCheckStage.exec();
+    fetchStage.exec();
+
+    totalCycles++;
+}
+
+void
+ComputeUnit::init()
+{
+    // Initialize CU Bus models
+    glbMemToVrfBus.init(&shader->tick_cnt, 1);
+    locMemToVrfBus.init(&shader->tick_cnt, 1);
+    nextGlbMemBus = 0;
+    nextLocMemBus = 0;
+    fatal_if(numGlbMemUnits > 1,
+             "No support for multiple Global Memory Pipelines exists!!!");
+    vrfToGlobalMemPipeBus.resize(numGlbMemUnits);
+    for (int j = 0; j < numGlbMemUnits; ++j) {
+        vrfToGlobalMemPipeBus[j] = WaitClass();
+        vrfToGlobalMemPipeBus[j].init(&shader->tick_cnt, 1);
+    }
+
+    fatal_if(numLocMemUnits > 1,
+             "No support for multiple Local Memory Pipelines exists!!!");
+    vrfToLocalMemPipeBus.resize(numLocMemUnits);
+    for (int j = 0; j < numLocMemUnits; ++j) {
+        vrfToLocalMemPipeBus[j] = WaitClass();
+        vrfToLocalMemPipeBus[j].init(&shader->tick_cnt, 1);
+    }
+    vectorRegsReserved.resize(numSIMDs, 0);
+    aluPipe.resize(numSIMDs);
+    wfWait.resize(numSIMDs + numLocMemUnits + numGlbMemUnits);
+
+    for (int i = 0; i < numSIMDs + numLocMemUnits + numGlbMemUnits; ++i) {
+        wfWait[i] = WaitClass();
+        wfWait[i].init(&shader->tick_cnt, 1);
+    }
+
+    for (int i = 0; i < numSIMDs; ++i) {
+        aluPipe[i] = WaitClass();
+        aluPipe[i].init(&shader->tick_cnt, 1);
+    }
+
+    // Setup space for call args
+    for (int j = 0; j < numSIMDs; ++j) {
+        for (int i = 0; i < shader->n_wf; ++i) {
+            wfList[j][i]->initCallArgMem(shader->funcargs_size);
+        }
+    }
+
+    // Initializing pipeline resources
+    readyList.resize(numSIMDs + numGlbMemUnits + numLocMemUnits);
+    waveStatusList.resize(numSIMDs);
+
+    for (int j = 0; j < numSIMDs; ++j) {
+        for (int i = 0; i < shader->n_wf; ++i) {
+            waveStatusList[j].push_back(
+                    std::make_pair(wfList[j][i], BLOCKED));
+        }
+    }
+
+    for (int j = 0; j < (numSIMDs + numGlbMemUnits +
numLocMemUnits); ++j) { + dispatchList.push_back(std::make_pair((Wavefront*)nullptr, EMPTY)); + } + + fetchStage.init(this); + scoreboardCheckStage.init(this); + scheduleStage.init(this); + execStage.init(this); + globalMemoryPipe.init(this); + localMemoryPipe.init(this); + // initialize state for statistics calculation + vectorAluInstAvail.resize(numSIMDs, false); + shrMemInstAvail = 0; + glbMemInstAvail = 0; +} + +bool +ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt) +{ + // Ruby has completed the memory op. Schedule the mem_resp_event at the + // appropriate cycle to process the timing memory response + // This delay represents the pipeline delay + SenderState *sender_state = safe_cast(pkt->senderState); + int index = sender_state->port_index; + GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst; + + // Is the packet returned a Kernel End or Barrier + if (pkt->req->isKernel() && pkt->req->isRelease()) { + Wavefront *w = + computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId]; + + // Check if we are waiting on Kernel End Release + if (w->status == Wavefront::S_RETURNING) { + DPRINTF(GPUDisp, "CU%d: WF[%d][%d][wv=%d]: WG id completed %d\n", + computeUnit->cu_id, w->simdId, w->wfSlotId, + w->wfDynId, w->kern_id); + + computeUnit->shader->dispatcher->notifyWgCompl(w); + w->status = Wavefront::S_STOPPED; + } else { + w->outstanding_reqs--; + } + + DPRINTF(GPUSync, "CU%d: WF[%d][%d]: barrier_cnt = %d\n", + computeUnit->cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId, w->barrier_cnt); + + if (gpuDynInst->useContinuation) { + assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE); + gpuDynInst->execContinuation(gpuDynInst->staticInstruction(), + gpuDynInst); + } + + delete pkt->senderState; + delete pkt->req; + delete pkt; + return true; + } else if (pkt->req->isKernel() && pkt->req->isAcquire()) { + if (gpuDynInst->useContinuation) { + assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE); + gpuDynInst->execContinuation(gpuDynInst->staticInstruction(), 
+ gpuDynInst); + } + + delete pkt->senderState; + delete pkt->req; + delete pkt; + return true; + } + + ComputeUnit::DataPort::MemRespEvent *mem_resp_event = + new ComputeUnit::DataPort::MemRespEvent(computeUnit->memPort[index], + pkt); + + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x received!\n", + computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, + index, pkt->req->getPaddr()); + + computeUnit->schedule(mem_resp_event, + curTick() + computeUnit->resp_tick_latency); + return true; +} + +void +ComputeUnit::DataPort::recvReqRetry() +{ + int len = retries.size(); + + assert(len > 0); + + for (int i = 0; i < len; ++i) { + PacketPtr pkt = retries.front().first; + GPUDynInstPtr gpuDynInst M5_VAR_USED = retries.front().second; + DPRINTF(GPUMem, "CU%d: WF[%d][%d]: retry mem inst addr %#x\n", + computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, + pkt->req->getPaddr()); + + /** Currently Ruby can return false due to conflicts for the particular + * cache block or address. Thus other requests should be allowed to + * pass and the data port should expect multiple retries. 
*/ + if (!sendTimingReq(pkt)) { + DPRINTF(GPUMem, "failed again!\n"); + break; + } else { + DPRINTF(GPUMem, "successful!\n"); + retries.pop_front(); + } + } +} + +bool +ComputeUnit::SQCPort::recvTimingResp(PacketPtr pkt) +{ + computeUnit->fetchStage.processFetchReturn(pkt); + + return true; +} + +void +ComputeUnit::SQCPort::recvReqRetry() +{ + int len = retries.size(); + + assert(len > 0); + + for (int i = 0; i < len; ++i) { + PacketPtr pkt = retries.front().first; + Wavefront *wavefront M5_VAR_USED = retries.front().second; + DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: retrying FETCH addr %#x\n", + computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, + pkt->req->getPaddr()); + if (!sendTimingReq(pkt)) { + DPRINTF(GPUFetch, "failed again!\n"); + break; + } else { + DPRINTF(GPUFetch, "successful!\n"); + retries.pop_front(); + } + } +} + +void +ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt) +{ + // There must be a way around this check to do the globalMemStart... + Addr tmp_vaddr = pkt->req->getVaddr(); + + updatePageDivergenceDist(tmp_vaddr); + + pkt->req->setVirt(pkt->req->getAsid(), tmp_vaddr, pkt->req->getSize(), + pkt->req->getFlags(), pkt->req->masterId(), + pkt->req->getPC()); + + // figure out the type of the request to set read/write + BaseTLB::Mode TLB_mode; + assert(pkt->isRead() || pkt->isWrite()); + + // Check write before read for atomic operations + // since atomic operations should use BaseTLB::Write + if (pkt->isWrite()){ + TLB_mode = BaseTLB::Write; + } else if (pkt->isRead()) { + TLB_mode = BaseTLB::Read; + } else { + fatal("pkt is not a read nor a write\n"); + } + + tlbCycles -= curTick(); + ++tlbRequests; + + int tlbPort_index = perLaneTLB ? 
index : 0; + + if (shader->timingSim) { + if (debugSegFault) { + Process *p = shader->gpuTc->getProcessPtr(); + Addr vaddr = pkt->req->getVaddr(); + unsigned size = pkt->getSize(); + + if ((vaddr + size - 1) % 64 < vaddr % 64) { + panic("CU%d: WF[%d][%d]: Access to addr %#x is unaligned!\n", + cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, vaddr); + } + + Addr paddr; + + if (!p->pTable->translate(vaddr, paddr)) { + if (!p->fixupStackFault(vaddr)) { + panic("CU%d: WF[%d][%d]: Fault on addr %#x!\n", + cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, + vaddr); + } + } + } + + // This is the SenderState needed upon return + pkt->senderState = new DTLBPort::SenderState(gpuDynInst, index); + + // This is the senderState needed by the TLB hierarchy to function + TheISA::GpuTLB::TranslationState *translation_state = + new TheISA::GpuTLB::TranslationState(TLB_mode, shader->gpuTc, false, + pkt->senderState); + + pkt->senderState = translation_state; + + if (functionalTLB) { + tlbPort[tlbPort_index]->sendFunctional(pkt); + + // update the hitLevel distribution + int hit_level = translation_state->hitLevel; + assert(hit_level != -1); + hitsPerTLBLevel[hit_level]++; + + // New SenderState for the memory access + X86ISA::GpuTLB::TranslationState *sender_state = + safe_cast(pkt->senderState); + + delete sender_state->tlbEntry; + delete sender_state->saved; + delete sender_state; + + assert(pkt->req->hasPaddr()); + assert(pkt->req->hasSize()); + + uint8_t *tmpData = pkt->getPtr(); + + // this is necessary because the GPU TLB receives packets instead + // of requests. when the translation is complete, all relevent + // fields in the request will be populated, but not in the packet. + // here we create the new packet so we can set the size, addr, + // and proper flags. 
+ PacketPtr oldPkt = pkt; + pkt = new Packet(oldPkt->req, oldPkt->cmd); + delete oldPkt; + pkt->dataStatic(tmpData); + + + // New SenderState for the memory access + pkt->senderState = new ComputeUnit::DataPort::SenderState(gpuDynInst, + index, nullptr); + + gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index); + gpuDynInst->tlbHitLevel[index] = hit_level; + + + // translation is done. Schedule the mem_req_event at the + // appropriate cycle to send the timing memory request to ruby + ComputeUnit::DataPort::MemReqEvent *mem_req_event = + new ComputeUnit::DataPort::MemReqEvent(memPort[index], pkt); + + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data " + "scheduled\n", cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId, index, pkt->req->getPaddr()); + + schedule(mem_req_event, curTick() + req_tick_latency); + } else if (tlbPort[tlbPort_index]->isStalled()) { + assert(tlbPort[tlbPort_index]->retries.size() > 0); + + DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x " + "failed!\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, + tmp_vaddr); + + tlbPort[tlbPort_index]->retries.push_back(pkt); + } else if (!tlbPort[tlbPort_index]->sendTimingReq(pkt)) { + // Stall the data port; + // No more packet will be issued till + // ruby indicates resources are freed by + // a recvReqRetry() call back on this port. 
+ tlbPort[tlbPort_index]->stallPort(); + + DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x " + "failed!\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, + tmp_vaddr); + + tlbPort[tlbPort_index]->retries.push_back(pkt); + } else { + DPRINTF(GPUTLB, + "CU%d: WF[%d][%d]: Translation for addr %#x sent!\n", + cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, tmp_vaddr); + } + } else { + if (pkt->cmd == MemCmd::MemFenceReq) { + gpuDynInst->statusBitVector = VectorMask(0); + } else { + gpuDynInst->statusBitVector &= (~(1ll << index)); + } + + // New SenderState for the memory access + delete pkt->senderState; + + // Because it's atomic operation, only need TLB translation state + pkt->senderState = new TheISA::GpuTLB::TranslationState(TLB_mode, + shader->gpuTc); + + tlbPort[tlbPort_index]->sendFunctional(pkt); + + // the addr of the packet is not modified, so we need to create a new + // packet, or otherwise the memory access will have the old virtual + // address sent in the translation packet, instead of the physical + // address returned by the translation. + PacketPtr new_pkt = new Packet(pkt->req, pkt->cmd); + new_pkt->dataStatic(pkt->getPtr()); + + // Translation is done. It is safe to send the packet to memory. 
+ memPort[0]->sendFunctional(new_pkt); + + DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index %d: addr %#x\n", cu_id, + gpuDynInst->simdId, gpuDynInst->wfSlotId, index, + new_pkt->req->getPaddr()); + + // safe_cast the senderState + TheISA::GpuTLB::TranslationState *sender_state = + safe_cast(pkt->senderState); + + delete sender_state->tlbEntry; + delete new_pkt; + delete pkt->senderState; + delete pkt->req; + delete pkt; + } +} + +void +ComputeUnit::sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt) +{ + ComputeUnit::DataPort::MemReqEvent *mem_req_event = + new ComputeUnit::DataPort::MemReqEvent(memPort[index], pkt); + + + // New SenderState for the memory access + pkt->senderState = new ComputeUnit::DataPort::SenderState(gpuDynInst, index, + nullptr); + + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n", + cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, index, + pkt->req->getPaddr()); + + schedule(mem_req_event, curTick() + req_tick_latency); +} + +void +ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst, bool kernelLaunch, + Request* req) +{ + if (!req) { + req = new Request(0, 0, 0, 0, masterId(), 0, gpuDynInst->wfDynId, -1); + } + req->setPaddr(0); + if (kernelLaunch) { + req->setFlags(Request::KERNEL); + } + + gpuDynInst->s_type = SEG_GLOBAL; + + // for non-kernel MemFence operations, memorder flags are set depending + // on which type of request is currently being sent, so this + // should be set by the caller (e.g. 
if an inst has acq-rel + // semantics, it will send one acquire req an one release req) + gpuDynInst->setRequestFlags(req, kernelLaunch); + + // a mem fence must correspond to an acquire/release request + assert(req->isAcquire() || req->isRelease()); + + // create packet + PacketPtr pkt = new Packet(req, MemCmd::MemFenceReq); + + // set packet's sender state + pkt->senderState = + new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr); + + // send the packet + sendSyncRequest(gpuDynInst, 0, pkt); +} + +const char* +ComputeUnit::DataPort::MemRespEvent::description() const +{ + return "ComputeUnit memory response event"; +} + +void +ComputeUnit::DataPort::MemRespEvent::process() +{ + DataPort::SenderState *sender_state = + safe_cast(pkt->senderState); + + GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst; + ComputeUnit *compute_unit = dataPort->computeUnit; + + assert(gpuDynInst); + + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Response for addr %#x, index %d\n", + compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, + pkt->req->getPaddr(), dataPort->index); + + Addr paddr = pkt->req->getPaddr(); + + if (pkt->cmd != MemCmd::MemFenceResp) { + int index = gpuDynInst->memStatusVector[paddr].back(); + + DPRINTF(GPUMem, "Response for addr %#x, index %d\n", + pkt->req->getPaddr(), index); + + gpuDynInst->memStatusVector[paddr].pop_back(); + gpuDynInst->pAddr = pkt->req->getPaddr(); + + if (pkt->isRead() || pkt->isWrite()) { + + if (gpuDynInst->n_reg <= MAX_REGS_FOR_NON_VEC_MEM_INST) { + gpuDynInst->statusBitVector &= (~(1ULL << index)); + } else { + assert(gpuDynInst->statusVector[index] > 0); + gpuDynInst->statusVector[index]--; + + if (!gpuDynInst->statusVector[index]) + gpuDynInst->statusBitVector &= (~(1ULL << index)); + } + + DPRINTF(GPUMem, "bitvector is now %#x\n", + gpuDynInst->statusBitVector); + + if (gpuDynInst->statusBitVector == VectorMask(0)) { + auto iter = gpuDynInst->memStatusVector.begin(); + auto end = gpuDynInst->memStatusVector.end(); + + 
while (iter != end) { + assert(iter->second.empty()); + ++iter; + } + + gpuDynInst->memStatusVector.clear(); + + if (gpuDynInst->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST) + gpuDynInst->statusVector.clear(); + + if (gpuDynInst->m_op == Enums::MO_LD || MO_A(gpuDynInst->m_op) + || MO_ANR(gpuDynInst->m_op)) { + assert(compute_unit->globalMemoryPipe.isGMLdRespFIFOWrRdy()); + + compute_unit->globalMemoryPipe.getGMLdRespFIFO() + .push(gpuDynInst); + } else { + assert(compute_unit->globalMemoryPipe.isGMStRespFIFOWrRdy()); + + compute_unit->globalMemoryPipe.getGMStRespFIFO() + .push(gpuDynInst); + } + + DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n", + compute_unit->cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId); + + // after clearing the status vectors, + // see if there is a continuation to perform + // the continuation may generate more work for + // this memory request + if (gpuDynInst->useContinuation) { + assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE); + gpuDynInst->execContinuation(gpuDynInst->staticInstruction(), + gpuDynInst); + } + } + } + } else { + gpuDynInst->statusBitVector = VectorMask(0); + + if (gpuDynInst->useContinuation) { + assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE); + gpuDynInst->execContinuation(gpuDynInst->staticInstruction(), + gpuDynInst); + } + } + + delete pkt->senderState; + delete pkt->req; + delete pkt; +} + +ComputeUnit* +ComputeUnitParams::create() +{ + return new ComputeUnit(this); +} + +bool +ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt) +{ + Addr line = pkt->req->getPaddr(); + + DPRINTF(GPUTLB, "CU%d: DTLBPort received %#x->%#x\n", computeUnit->cu_id, + pkt->req->getVaddr(), line); + + assert(pkt->senderState); + computeUnit->tlbCycles += curTick(); + + // pop off the TLB translation state + TheISA::GpuTLB::TranslationState *translation_state = + safe_cast(pkt->senderState); + + // no PageFaults are permitted for data accesses + if (!translation_state->tlbEntry->valid) { + DTLBPort::SenderState 
*sender_state = + safe_cast(translation_state->saved); + + Wavefront *w M5_VAR_USED = + computeUnit->wfList[sender_state->_gpuDynInst->simdId] + [sender_state->_gpuDynInst->wfSlotId]; + + DPRINTFN("Wave %d couldn't tranlate vaddr %#x\n", w->wfDynId, + pkt->req->getVaddr()); + } + + assert(translation_state->tlbEntry->valid); + + // update the hitLevel distribution + int hit_level = translation_state->hitLevel; + computeUnit->hitsPerTLBLevel[hit_level]++; + + delete translation_state->tlbEntry; + assert(!translation_state->ports.size()); + pkt->senderState = translation_state->saved; + + // for prefetch pkt + BaseTLB::Mode TLB_mode = translation_state->tlbMode; + + delete translation_state; + + // use the original sender state to know how to close this transaction + DTLBPort::SenderState *sender_state = + safe_cast(pkt->senderState); + + GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst; + int mp_index = sender_state->portIndex; + Addr vaddr = pkt->req->getVaddr(); + gpuDynInst->memStatusVector[line].push_back(mp_index); + gpuDynInst->tlbHitLevel[mp_index] = hit_level; + + MemCmd requestCmd; + + if (pkt->cmd == MemCmd::ReadResp) { + requestCmd = MemCmd::ReadReq; + } else if (pkt->cmd == MemCmd::WriteResp) { + requestCmd = MemCmd::WriteReq; + } else if (pkt->cmd == MemCmd::SwapResp) { + requestCmd = MemCmd::SwapReq; + } else { + panic("unsupported response to request conversion %s\n", + pkt->cmd.toString()); + } + + if (computeUnit->prefetchDepth) { + int simdId = gpuDynInst->simdId; + int wfSlotId = gpuDynInst->wfSlotId; + Addr last = 0; + + switch(computeUnit->prefetchType) { + case Enums::PF_CU: + last = computeUnit->lastVaddrCU[mp_index]; + break; + case Enums::PF_PHASE: + last = computeUnit->lastVaddrPhase[simdId][mp_index]; + break; + case Enums::PF_WF: + last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index]; + default: + break; + } + + DPRINTF(GPUPrefetch, "CU[%d][%d][%d][%d]: %#x was last\n", + computeUnit->cu_id, simdId, wfSlotId, mp_index, last); + 
+ int stride = last ? (roundDown(vaddr, TheISA::PageBytes) - + roundDown(last, TheISA::PageBytes)) >> TheISA::PageShift + : 0; + + DPRINTF(GPUPrefetch, "Stride is %d\n", stride); + + computeUnit->lastVaddrCU[mp_index] = vaddr; + computeUnit->lastVaddrPhase[simdId][mp_index] = vaddr; + computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr; + + stride = (computeUnit->prefetchType == Enums::PF_STRIDE) ? + computeUnit->prefetchStride: stride; + + DPRINTF(GPUPrefetch, "%#x to: CU[%d][%d][%d][%d]\n", vaddr, + computeUnit->cu_id, simdId, wfSlotId, mp_index); + + DPRINTF(GPUPrefetch, "Prefetching from %#x:", vaddr); + + // Prefetch Next few pages atomically + for (int pf = 1; pf <= computeUnit->prefetchDepth; ++pf) { + DPRINTF(GPUPrefetch, "%d * %d: %#x\n", pf, stride, + vaddr+stride*pf*TheISA::PageBytes); + + if (!stride) + break; + + Request *prefetch_req = new Request(0, vaddr + stride * pf * + TheISA::PageBytes, + sizeof(uint8_t), 0, + computeUnit->masterId(), + 0, 0, 0); + + PacketPtr prefetch_pkt = new Packet(prefetch_req, requestCmd); + uint8_t foo = 0; + prefetch_pkt->dataStatic(&foo); + + // Because it's atomic operation, only need TLB translation state + prefetch_pkt->senderState = + new TheISA::GpuTLB::TranslationState(TLB_mode, + computeUnit->shader->gpuTc, + true); + + // Currently prefetches are zero-latency, hence the sendFunctional + sendFunctional(prefetch_pkt); + + /* safe_cast the senderState */ + TheISA::GpuTLB::TranslationState *tlb_state = + safe_cast( + prefetch_pkt->senderState); + + + delete tlb_state->tlbEntry; + delete tlb_state; + delete prefetch_pkt->req; + delete prefetch_pkt; + } + } + + // First we must convert the response cmd back to a request cmd so that + // the request can be sent through the cu's master port + PacketPtr new_pkt = new Packet(pkt->req, requestCmd); + new_pkt->dataStatic(pkt->getPtr()); + delete pkt->senderState; + delete pkt; + + // New SenderState for the memory access + new_pkt->senderState = + new 
ComputeUnit::DataPort::SenderState(gpuDynInst, mp_index, + nullptr); + + // translation is done. Schedule the mem_req_event at the appropriate + // cycle to send the timing memory request to ruby + ComputeUnit::DataPort::MemReqEvent *mem_req_event = + new ComputeUnit::DataPort::MemReqEvent(computeUnit->memPort[mp_index], + new_pkt); + + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data scheduled\n", + computeUnit->cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId, mp_index, new_pkt->req->getPaddr()); + + computeUnit->schedule(mem_req_event, curTick() + + computeUnit->req_tick_latency); + + return true; +} + +const char* +ComputeUnit::DataPort::MemReqEvent::description() const +{ + return "ComputeUnit memory request event"; +} + +void +ComputeUnit::DataPort::MemReqEvent::process() +{ + SenderState *sender_state = safe_cast(pkt->senderState); + GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst; + ComputeUnit *compute_unit M5_VAR_USED = dataPort->computeUnit; + + if (!(dataPort->sendTimingReq(pkt))) { + dataPort->retries.push_back(std::make_pair(pkt, gpuDynInst)); + + DPRINTF(GPUPort, + "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n", + compute_unit->cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId, dataPort->index, + pkt->req->getPaddr()); + } else { + DPRINTF(GPUPort, + "CU%d: WF[%d][%d]: index %d, addr %#x data req sent!\n", + compute_unit->cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId, dataPort->index, + pkt->req->getPaddr()); + } +} + +/* + * The initial translation request could have been rejected, + * if queue is not Retry sending the translation + * request. sendRetry() is called from the peer port whenever + * a translation completes. 
+ */ +void +ComputeUnit::DTLBPort::recvReqRetry() +{ + int len = retries.size(); + + DPRINTF(GPUTLB, "CU%d: DTLB recvReqRetry - %d pending requests\n", + computeUnit->cu_id, len); + + assert(len > 0); + assert(isStalled()); + // recvReqRetry is an indication that the resource on which this + // port was stalling on is freed. So, remove the stall first + unstallPort(); + + for (int i = 0; i < len; ++i) { + PacketPtr pkt = retries.front(); + Addr vaddr M5_VAR_USED = pkt->req->getVaddr(); + DPRINTF(GPUTLB, "CU%d: retrying D-translaton for address%#x", vaddr); + + if (!sendTimingReq(pkt)) { + // Stall port + stallPort(); + DPRINTF(GPUTLB, ": failed again\n"); + break; + } else { + DPRINTF(GPUTLB, ": successful\n"); + retries.pop_front(); + } + } +} + +bool +ComputeUnit::ITLBPort::recvTimingResp(PacketPtr pkt) +{ + Addr line M5_VAR_USED = pkt->req->getPaddr(); + DPRINTF(GPUTLB, "CU%d: ITLBPort received %#x->%#x\n", + computeUnit->cu_id, pkt->req->getVaddr(), line); + + assert(pkt->senderState); + + // pop off the TLB translation state + TheISA::GpuTLB::TranslationState *translation_state = + safe_cast(pkt->senderState); + + bool success = translation_state->tlbEntry->valid; + delete translation_state->tlbEntry; + assert(!translation_state->ports.size()); + pkt->senderState = translation_state->saved; + delete translation_state; + + // use the original sender state to know how to close this transaction + ITLBPort::SenderState *sender_state = + safe_cast(pkt->senderState); + + // get the wavefront associated with this translation request + Wavefront *wavefront = sender_state->wavefront; + delete pkt->senderState; + + if (success) { + // pkt is reused in fetch(), don't delete it here. 
However, we must + // reset the command to be a request so that it can be sent through + // the cu's master port + assert(pkt->cmd == MemCmd::ReadResp); + pkt->cmd = MemCmd::ReadReq; + + computeUnit->fetchStage.fetch(pkt, wavefront); + } else { + if (wavefront->dropFetch) { + assert(wavefront->instructionBuffer.empty()); + wavefront->dropFetch = false; + } + + wavefront->pendingFetch = 0; + } + + return true; +} + +/* + * The initial translation request could have been rejected, if + * queue is not empty. Retry sending the translation + * request. sendRetry() is called from the peer port whenever + * a translation completes. + */ +void +ComputeUnit::ITLBPort::recvReqRetry() +{ + + int len = retries.size(); + DPRINTF(GPUTLB, "CU%d: ITLB recvReqRetry - %d pending requests\n", len); + + assert(len > 0); + assert(isStalled()); + + // recvReqRetry is an indication that the resource on which this + // port was stalling on is freed. So, remove the stall first + unstallPort(); + + for (int i = 0; i < len; ++i) { + PacketPtr pkt = retries.front(); + Addr vaddr M5_VAR_USED = pkt->req->getVaddr(); + DPRINTF(GPUTLB, "CU%d: retrying I-translaton for address%#x", vaddr); + + if (!sendTimingReq(pkt)) { + stallPort(); // Stall port + DPRINTF(GPUTLB, ": failed again\n"); + break; + } else { + DPRINTF(GPUTLB, ": successful\n"); + retries.pop_front(); + } + } +} + +void +ComputeUnit::regStats() +{ + tlbCycles + .name(name() + ".tlb_cycles") + .desc("total number of cycles for all uncoalesced requests") + ; + + tlbRequests + .name(name() + ".tlb_requests") + .desc("number of uncoalesced requests") + ; + + tlbLatency + .name(name() + ".avg_translation_latency") + .desc("Avg. 
translation latency for data translations") + ; + + tlbLatency = tlbCycles / tlbRequests; + + hitsPerTLBLevel + .init(4) + .name(name() + ".TLB_hits_distribution") + .desc("TLB hits distribution (0 for page table, x for Lx-TLB") + ; + + // fixed number of TLB levels + for (int i = 0; i < 4; ++i) { + if (!i) + hitsPerTLBLevel.subname(i,"page_table"); + else + hitsPerTLBLevel.subname(i, csprintf("L%d_TLB",i)); + } + + execRateDist + .init(0, 10, 2) + .name(name() + ".inst_exec_rate") + .desc("Instruction Execution Rate: Number of executed vector " + "instructions per cycle") + ; + + ldsBankConflictDist + .init(0, VSZ, 2) + .name(name() + ".lds_bank_conflicts") + .desc("Number of bank conflicts per LDS memory packet") + ; + + ldsBankAccesses + .name(name() + ".lds_bank_access_cnt") + .desc("Total number of LDS bank accesses") + ; + + pageDivergenceDist + // A wavefront can touch 1 to VSZ pages per memory instruction. + // The number of pages per bin can be configured (here it's 4). + .init(1, VSZ, 4) + .name(name() + ".page_divergence_dist") + .desc("pages touched per wf (over all mem. instr.)") + ; + + controlFlowDivergenceDist + .init(1, VSZ, 4) + .name(name() + ".warp_execution_dist") + .desc("number of lanes active per instruction (oval all instructions)") + ; + + activeLanesPerGMemInstrDist + .init(1, VSZ, 4) + .name(name() + ".gmem_lanes_execution_dist") + .desc("number of active lanes per global memory instruction") + ; + + activeLanesPerLMemInstrDist + .init(1, VSZ, 4) + .name(name() + ".lmem_lanes_execution_dist") + .desc("number of active lanes per local memory instruction") + ; + + numInstrExecuted + .name(name() + ".num_instr_executed") + .desc("number of instructions executed") + ; + + numVecOpsExecuted + .name(name() + ".num_vec_ops_executed") + .desc("number of vec ops executed (e.g. 
VSZ/inst)") + ; + + totalCycles + .name(name() + ".num_total_cycles") + .desc("number of cycles the CU ran for") + ; + + ipc + .name(name() + ".ipc") + .desc("Instructions per cycle (this CU only)") + ; + + vpc + .name(name() + ".vpc") + .desc("Vector Operations per cycle (this CU only)") + ; + + numALUInstsExecuted + .name(name() + ".num_alu_insts_executed") + .desc("Number of dynamic non-GM memory insts executed") + ; + + wgBlockedDueLdsAllocation + .name(name() + ".wg_blocked_due_lds_alloc") + .desc("Workgroup blocked due to LDS capacity") + ; + + ipc = numInstrExecuted / totalCycles; + vpc = numVecOpsExecuted / totalCycles; + + numTimesWgBlockedDueVgprAlloc + .name(name() + ".times_wg_blocked_due_vgpr_alloc") + .desc("Number of times WGs are blocked due to VGPR allocation per SIMD") + ; + + dynamicGMemInstrCnt + .name(name() + ".global_mem_instr_cnt") + .desc("dynamic global memory instructions count") + ; + + dynamicLMemInstrCnt + .name(name() + ".local_mem_instr_cnt") + .desc("dynamic local memory intruction count") + ; + + numALUInstsExecuted = numInstrExecuted - dynamicGMemInstrCnt - + dynamicLMemInstrCnt; + + completedWfs + .name(name() + ".num_completed_wfs") + .desc("number of completed wavefronts") + ; + + numCASOps + .name(name() + ".num_CAS_ops") + .desc("number of compare and swap operations") + ; + + numFailedCASOps + .name(name() + ".num_failed_CAS_ops") + .desc("number of compare and swap operations that failed") + ; + + // register stats of pipeline stages + fetchStage.regStats(); + scoreboardCheckStage.regStats(); + scheduleStage.regStats(); + execStage.regStats(); + + // register stats of memory pipeline + globalMemoryPipe.regStats(); + localMemoryPipe.regStats(); +} + +void +ComputeUnit::updatePageDivergenceDist(Addr addr) +{ + Addr virt_page_addr = roundDown(addr, TheISA::PageBytes); + + if (!pagesTouched.count(virt_page_addr)) + pagesTouched[virt_page_addr] = 1; + else + pagesTouched[virt_page_addr]++; +} + +void 
+ComputeUnit::CUExitCallback::process() +{ + if (computeUnit->countPages) { + std::ostream *page_stat_file = + simout.create(computeUnit->name().c_str()); + + *page_stat_file << "page, wavefront accesses, workitem accesses" << + std::endl; + + for (auto iter : computeUnit->pageAccesses) { + *page_stat_file << std::hex << iter.first << ","; + *page_stat_file << std::dec << iter.second.first << ","; + *page_stat_file << std::dec << iter.second.second << std::endl; + } + } + } + +bool +ComputeUnit::isDone() const +{ + for (int i = 0; i < numSIMDs; ++i) { + if (!isSimdDone(i)) { + return false; + } + } + + bool glbMemBusRdy = true; + for (int j = 0; j < numGlbMemUnits; ++j) { + glbMemBusRdy &= vrfToGlobalMemPipeBus[j].rdy(); + } + bool locMemBusRdy = true; + for (int j = 0; j < numLocMemUnits; ++j) { + locMemBusRdy &= vrfToLocalMemPipeBus[j].rdy(); + } + + if (!globalMemoryPipe.isGMLdRespFIFOWrRdy() || + !globalMemoryPipe.isGMStRespFIFOWrRdy() || + !globalMemoryPipe.isGMReqFIFOWrRdy() || !localMemoryPipe.isLMReqFIFOWrRdy() + || !localMemoryPipe.isLMRespFIFOWrRdy() || !locMemToVrfBus.rdy() || + !glbMemToVrfBus.rdy() || !locMemBusRdy || !glbMemBusRdy) { + return false; + } + + return true; +} + +int32_t +ComputeUnit::getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const +{ + return lds.getRefCounter(dispatchId, wgId); +} + +bool +ComputeUnit::isSimdDone(uint32_t simdId) const +{ + assert(simdId < numSIMDs); + + for (int i=0; i < numGlbMemUnits; ++i) { + if (!vrfToGlobalMemPipeBus[i].rdy()) + return false; + } + for (int i=0; i < numLocMemUnits; ++i) { + if (!vrfToLocalMemPipeBus[i].rdy()) + return false; + } + if (!aluPipe[simdId].rdy()) { + return false; + } + + for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf){ + if (wfList[simdId][i_wf]->status != Wavefront::S_STOPPED) { + return false; + } + } + + return true; +} + +/** + * send a general request to the LDS + * make sure to look at the return value here as your request might be + * NACK'd and returning 
false means that you have to have some backup plan + */ +bool +ComputeUnit::sendToLds(GPUDynInstPtr gpuDynInst) +{ + // this is just a request to carry the GPUDynInstPtr + // back and forth + Request *newRequest = new Request(); + newRequest->setPaddr(0x0); + + // ReadReq is not evaluted by the LDS but the Packet ctor requires this + PacketPtr newPacket = new Packet(newRequest, MemCmd::ReadReq); + + // This is the SenderState needed upon return + newPacket->senderState = new LDSPort::SenderState(gpuDynInst); + + return ldsPort->sendTimingReq(newPacket); +} + +/** + * get the result of packets sent to the LDS when they return + */ +bool +ComputeUnit::LDSPort::recvTimingResp(PacketPtr packet) +{ + const ComputeUnit::LDSPort::SenderState *senderState = + dynamic_cast(packet->senderState); + + fatal_if(!senderState, "did not get the right sort of sender state"); + + GPUDynInstPtr gpuDynInst = senderState->getMemInst(); + + delete packet->senderState; + delete packet->req; + delete packet; + + computeUnit->localMemoryPipe.getLMRespFIFO().push(gpuDynInst); + return true; +} + +/** + * attempt to send this packet, either the port is already stalled, the request + * is nack'd and must stall or the request goes through + * when a request cannot be sent, add it to the retries queue + */ +bool +ComputeUnit::LDSPort::sendTimingReq(PacketPtr pkt) +{ + ComputeUnit::LDSPort::SenderState *sender_state = + dynamic_cast(pkt->senderState); + fatal_if(!sender_state, "packet without a valid sender state"); + + GPUDynInstPtr gpuDynInst M5_VAR_USED = sender_state->getMemInst(); + + if (isStalled()) { + fatal_if(retries.empty(), "must have retries waiting to be stalled"); + + retries.push(pkt); + + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: LDS send failed!\n", + computeUnit->cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId); + return false; + } else if (!MasterPort::sendTimingReq(pkt)) { + // need to stall the LDS port until a recvReqRetry() is received + // this indicates that there is more 
space + stallPort(); + retries.push(pkt); + + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req failed!\n", + computeUnit->cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId, pkt->req->getPaddr()); + return false; + } else { + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req sent!\n", + computeUnit->cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId, pkt->req->getPaddr()); + return true; + } +} + +/** + * the bus is telling the port that there is now space so retrying stalled + * requests should work now + * this allows the port to have a request be nack'd and then have the receiver + * say when there is space, rather than simply retrying the send every cycle + */ +void +ComputeUnit::LDSPort::recvReqRetry() +{ + auto queueSize = retries.size(); + + DPRINTF(GPUPort, "CU%d: LDSPort recvReqRetry - %d pending requests\n", + computeUnit->cu_id, queueSize); + + fatal_if(queueSize < 1, + "why was there a recvReqRetry() with no pending reqs?"); + fatal_if(!isStalled(), + "recvReqRetry() happened when the port was not stalled"); + + unstallPort(); + + while (!retries.empty()) { + PacketPtr packet = retries.front(); + + DPRINTF(GPUPort, "CU%d: retrying LDS send\n", computeUnit->cu_id); + + if (!MasterPort::sendTimingReq(packet)) { + // Stall port + stallPort(); + DPRINTF(GPUPort, ": LDS send failed again\n"); + break; + } else { + DPRINTF(GPUTLB, ": LDS send successful\n"); + retries.pop(); + } + } +} diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh new file mode 100644 index 000000000..f47c27a0a --- /dev/null +++ b/src/gpu-compute/compute_unit.hh @@ -0,0 +1,767 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: John Kalamatianos, Anthony Gutierrez + */ + +#ifndef __COMPUTE_UNIT_HH__ +#define __COMPUTE_UNIT_HH__ + +#include +#include +#include +#include + +#include "base/callback.hh" +#include "base/statistics.hh" +#include "base/types.hh" +#include "enums/PrefetchType.hh" +#include "gpu-compute/exec_stage.hh" +#include "gpu-compute/fetch_stage.hh" +#include "gpu-compute/global_memory_pipeline.hh" +#include "gpu-compute/local_memory_pipeline.hh" +#include "gpu-compute/qstruct.hh" +#include "gpu-compute/schedule_stage.hh" +#include "gpu-compute/scoreboard_check_stage.hh" +#include "mem/mem_object.hh" +#include "mem/port.hh" + +static const int MAX_REGS_FOR_NON_VEC_MEM_INST = 1; +static const int MAX_WIDTH_FOR_MEM_INST = 32; + +class NDRange; +class Shader; +class VectorRegisterFile; + +struct ComputeUnitParams; + +enum EXEC_POLICY +{ + OLDEST = 0, + RR +}; + +// List of execution units +enum EXEC_UNIT +{ + SIMD0 = 0, + SIMD1, + SIMD2, + SIMD3, + GLBMEM_PIPE, + LDSMEM_PIPE, + NUM_UNITS +}; + +enum TLB_CACHE +{ + TLB_MISS_CACHE_MISS = 0, + TLB_MISS_CACHE_HIT, + TLB_HIT_CACHE_MISS, + TLB_HIT_CACHE_HIT +}; + +class ComputeUnit : public MemObject +{ + public: + FetchStage fetchStage; + ScoreboardCheckStage scoreboardCheckStage; + ScheduleStage scheduleStage; + ExecStage execStage; + GlobalMemPipeline globalMemoryPipe; + LocalMemPipeline localMemoryPipe; + + // Buffers used to communicate between various pipeline stages + + // List of waves which are ready to be scheduled. + // Each execution resource has a ready list. readyList is + // used to communicate between scoreboardCheck stage and + // schedule stage + // TODO: make enum to index readyList + std::vector> readyList; + + // Stores the status of waves. A READY implies the + // wave is ready to be scheduled this cycle and + // is already present in the readyList. 
waveStatusList is + // used to communicate between scoreboardCheck stage and + // schedule stage + // TODO: convert std::pair to a class to increase readability + std::vector>> waveStatusList; + + // List of waves which will be dispatched to + // each execution resource. A FILLED implies + // dispatch list is non-empty and + // execution unit has something to execute + // this cycle. Currently, the dispatch list of + // an execution resource can hold only one wave because + // an execution resource can execute only one wave in a cycle. + // dispatchList is used to communicate between schedule + // and exec stage + // TODO: convert std::pair to a class to increase readability + std::vector> dispatchList; + + int rrNextMemID; // used by RR WF exec policy to cycle through WF's + int rrNextALUWp; + typedef ComputeUnitParams Params; + std::vector> wfList; + int cu_id; + + // array of vector register files, one per SIMD + std::vector vrf; + // Number of vector ALU units (SIMDs) in CU + int numSIMDs; + // number of pipe stages for bypassing data to next dependent single + // precision vector instruction inside the vector ALU pipeline + int spBypassPipeLength; + // number of pipe stages for bypassing data to next dependent double + // precision vector instruction inside the vector ALU pipeline + int dpBypassPipeLength; + // number of cycles per issue period + int issuePeriod; + + // Number of global and local memory execution resources in CU + int numGlbMemUnits; + int numLocMemUnits; + // tracks the last cycle a vector instruction was executed on a SIMD + std::vector lastExecCycle; + + // true if we allow a separate TLB per lane + bool perLaneTLB; + // if 0, TLB prefetching is off. + int prefetchDepth; + // if fixed-stride prefetching, this is the stride. 
+ int prefetchStride; + + class LastVaddrWave + { + public: + Addr vaddrs[VSZ]; + Addr& operator[](int idx) { + return vaddrs[idx]; + } + + LastVaddrWave() { + for (int i = 0; i < VSZ; ++i) + vaddrs[i] = 0; + } + }; + + LastVaddrWave lastVaddrCU; + std::vector lastVaddrPhase; + std::vector>> lastVaddrWF; + Enums::PrefetchType prefetchType; + EXEC_POLICY exec_policy; + + bool xact_cas_mode; + bool debugSegFault; + bool functionalTLB; + bool localMemBarrier; + + /* + * for Counting page accesses + * + * cuExitCallback inherits from Callback. When you register a callback + * function as an exit callback, it will get added to an exit callback + * queue, such that on simulation exit, all callbacks in the callback + * queue will have their process() function called. + */ + bool countPages; + + Shader *shader; + uint32_t barrier_id; + // vector of Vector ALU (MACC) pipelines + std::vector aluPipe; + // minimum issue period per SIMD unit (in cycles) + std::vector wfWait; + + // Resource control for Vector Register File->Global Memory pipe buses + std::vector vrfToGlobalMemPipeBus; + // Resource control for Vector Register File->Local Memory pipe buses + std::vector vrfToLocalMemPipeBus; + int nextGlbMemBus; + int nextLocMemBus; + // Resource control for global memory to VRF data/address bus + WaitClass glbMemToVrfBus; + // Resource control for local memory to VRF data/address bus + WaitClass locMemToVrfBus; + + uint32_t vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes + uint32_t coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes + uint32_t numCyclesPerStoreTransfer; // number of cycles per vector store + uint32_t numCyclesPerLoadTransfer; // number of cycles per vector load + + Tick req_tick_latency; + Tick resp_tick_latency; + + // number of vector registers being reserved for each SIMD unit + std::vector vectorRegsReserved; + // number of vector registers per SIMD unit + uint32_t numVecRegsPerSimd; + // Support for scheduling VGPR status 
update events + std::vector > regIdxVec; + std::vector timestampVec; + std::vector statusVec; + + void + registerEvent(uint32_t simdId, + uint32_t regIdx, + uint32_t operandSize, + uint64_t when, + uint8_t newStatus) { + regIdxVec.push_back(std::make_pair(simdId, regIdx)); + timestampVec.push_back(when); + statusVec.push_back(newStatus); + if (operandSize > 4) { + regIdxVec.push_back(std::make_pair(simdId, + ((regIdx + 1) % + numVecRegsPerSimd))); + timestampVec.push_back(when); + statusVec.push_back(newStatus); + } + } + + void updateEvents(); + + // this hash map will keep track of page divergence + // per memory instruction per wavefront. The hash map + // is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc. + std::map pagesTouched; + + ComputeUnit(const Params *p); + ~ComputeUnit(); + int spBypassLength() { return spBypassPipeLength; }; + int dpBypassLength() { return dpBypassPipeLength; }; + int storeBusLength() { return numCyclesPerStoreTransfer; }; + int loadBusLength() { return numCyclesPerLoadTransfer; }; + int wfSize() const { return wavefrontSize; }; + + void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs); + void exec(); + void initiateFetch(Wavefront *wavefront); + void fetch(PacketPtr pkt, Wavefront *wavefront); + void FillKernelState(Wavefront *w, NDRange *ndr); + + void StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[], + int trueWgSizeTotal); + + void InitializeWFContext(WFContext *wfCtx, NDRange *ndr, int cnt, + int trueWgSize[], int trueWgSizeTotal, + LdsChunk *ldsChunk, uint64_t origSpillMemStart); + + void StartWorkgroup(NDRange *ndr); + int ReadyWorkgroup(NDRange *ndr); + + bool isVecAlu(int unitId) { return unitId >= SIMD0 && unitId <= SIMD3; } + bool isGlbMem(int unitId) { return unitId == GLBMEM_PIPE; } + bool isShrMem(int unitId) { return unitId == LDSMEM_PIPE; } + int GlbMemUnitId() { return GLBMEM_PIPE; } + int ShrMemUnitId() { return LDSMEM_PIPE; } + int nextGlbRdBus() { return (++nextGlbMemBus) % 
numGlbMemUnits; } + int nextLocRdBus() { return (++nextLocMemBus) % numLocMemUnits; } + /* This function cycles through all the wavefronts in all the phases to see + * if all of the wavefronts which should be associated with one barrier + * (denoted with _barrier_id), are all at the same barrier in the program + * (denoted by bcnt). When the number at the barrier matches bslots, then + * return true. + */ + int AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots); + bool cedeSIMD(int simdId, int wfSlotId); + + template void doSmReturn(GPUDynInstPtr gpuDynInst); + virtual void init(); + void sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt); + void sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt); + void injectGlobalMemFence(GPUDynInstPtr gpuDynInst, + bool kernelLaunch=true, + RequestPtr req=nullptr); + void handleMemPacket(PacketPtr pkt, int memport_index); + bool processTimingPacket(PacketPtr pkt); + void processFetchReturn(PacketPtr pkt); + void updatePageDivergenceDist(Addr addr); + + MasterID masterId() { return _masterId; } + + bool isDone() const; + bool isSimdDone(uint32_t) const; + + protected: + MasterID _masterId; + + LdsState &lds; + + public: + // the following stats compute the avg. TLB accesslatency per + // uncoalesced request (only for data) + Stats::Scalar tlbRequests; + Stats::Scalar tlbCycles; + Stats::Formula tlbLatency; + // hitsPerTLBLevel[x] are the hits in Level x TLB. x = 0 is the page table. + Stats::Vector hitsPerTLBLevel; + + Stats::Scalar ldsBankAccesses; + Stats::Distribution ldsBankConflictDist; + + // over all memory instructions executed over all wavefronts + // how many touched 0-4 pages, 4-8, ..., 60-64 pages + Stats::Distribution pageDivergenceDist; + Stats::Scalar dynamicGMemInstrCnt; + Stats::Scalar dynamicLMemInstrCnt; + + Stats::Scalar wgBlockedDueLdsAllocation; + // Number of instructions executed, i.e. 
if 64 (or 32 or 7) lanes are active + // when the instruction is committed, this number is still incremented by 1 + Stats::Scalar numInstrExecuted; + // Number of cycles among successive instruction executions across all + // wavefronts of the same CU + Stats::Distribution execRateDist; + // number of individual vector operations executed + Stats::Scalar numVecOpsExecuted; + // Total cycles that something is running on the GPU + Stats::Scalar totalCycles; + Stats::Formula vpc; // vector ops per cycle + Stats::Formula ipc; // vector instructions per cycle + Stats::Distribution controlFlowDivergenceDist; + Stats::Distribution activeLanesPerGMemInstrDist; + Stats::Distribution activeLanesPerLMemInstrDist; + // number of vector ALU instructions received + Stats::Formula numALUInstsExecuted; + // number of times a WG can not start due to lack of free VGPRs in SIMDs + Stats::Scalar numTimesWgBlockedDueVgprAlloc; + Stats::Scalar numCASOps; + Stats::Scalar numFailedCASOps; + Stats::Scalar completedWfs; + // flag per vector SIMD unit that is set when there is at least one + // WV that has a vector ALU instruction as the oldest in its + // Instruction Buffer: Defined in the Scoreboard stage, consumed + // by the Execute stage. 
+ std::vector vectorAluInstAvail; + // number of available (oldest) LDS instructions that could have + // been issued to the LDS at a specific issue slot + int shrMemInstAvail; + // number of available Global memory instructions that could have + // been issued to TCP at a specific issue slot + int glbMemInstAvail; + + void + regStats(); + + LdsState & + getLds() const + { + return lds; + } + + int32_t + getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const; + + bool + sendToLds(GPUDynInstPtr gpuDynInst) __attribute__((warn_unused_result)); + + typedef std::unordered_map> pageDataStruct; + pageDataStruct pageAccesses; + + class CUExitCallback : public Callback + { + private: + ComputeUnit *computeUnit; + + public: + virtual ~CUExitCallback() { } + + CUExitCallback(ComputeUnit *_cu) + { + computeUnit = _cu; + } + + virtual void + process(); + }; + + CUExitCallback *cuExitCallback; + + /** Data access Port **/ + class DataPort : public MasterPort + { + public: + DataPort(const std::string &_name, ComputeUnit *_cu, PortID _index) + : MasterPort(_name, _cu), computeUnit(_cu), + index(_index) { } + + bool snoopRangeSent; + + struct SenderState : public Packet::SenderState + { + GPUDynInstPtr _gpuDynInst; + int port_index; + Packet::SenderState *saved; + + SenderState(GPUDynInstPtr gpuDynInst, PortID _port_index, + Packet::SenderState *sender_state=nullptr) + : _gpuDynInst(gpuDynInst), + port_index(_port_index), + saved(sender_state) { } + }; + + class MemReqEvent : public Event + { + private: + DataPort *dataPort; + PacketPtr pkt; + + public: + MemReqEvent(DataPort *_data_port, PacketPtr _pkt) + : Event(), dataPort(_data_port), pkt(_pkt) + { + setFlags(Event::AutoDelete); + } + + void process(); + const char *description() const; + }; + + class MemRespEvent : public Event + { + private: + DataPort *dataPort; + PacketPtr pkt; + + public: + MemRespEvent(DataPort *_data_port, PacketPtr _pkt) + : Event(), dataPort(_data_port), pkt(_pkt) + { + 
setFlags(Event::AutoDelete); + } + + void process(); + const char *description() const; + }; + + std::deque> retries; + + protected: + ComputeUnit *computeUnit; + int index; + + virtual bool recvTimingResp(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt) { return 0; } + virtual void recvFunctional(PacketPtr pkt) { } + virtual void recvRangeChange() { } + virtual void recvReqRetry(); + + virtual void + getDeviceAddressRanges(AddrRangeList &resp, bool &snoop) + { + resp.clear(); + snoop = true; + } + + }; + + // Instruction cache access port + class SQCPort : public MasterPort + { + public: + SQCPort(const std::string &_name, ComputeUnit *_cu, PortID _index) + : MasterPort(_name, _cu), computeUnit(_cu), + index(_index) { } + + bool snoopRangeSent; + + struct SenderState : public Packet::SenderState + { + Wavefront *wavefront; + Packet::SenderState *saved; + + SenderState(Wavefront *_wavefront, Packet::SenderState + *sender_state=nullptr) + : wavefront(_wavefront), saved(sender_state) { } + }; + + std::deque> retries; + + protected: + ComputeUnit *computeUnit; + int index; + + virtual bool recvTimingResp(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt) { return 0; } + virtual void recvFunctional(PacketPtr pkt) { } + virtual void recvRangeChange() { } + virtual void recvReqRetry(); + + virtual void + getDeviceAddressRanges(AddrRangeList &resp, bool &snoop) + { + resp.clear(); + snoop = true; + } + }; + + /** Data TLB port **/ + class DTLBPort : public MasterPort + { + public: + DTLBPort(const std::string &_name, ComputeUnit *_cu, PortID _index) + : MasterPort(_name, _cu), computeUnit(_cu), + index(_index), stalled(false) + { } + + bool isStalled() { return stalled; } + void stallPort() { stalled = true; } + void unstallPort() { stalled = false; } + + /** + * here we queue all the translation requests that were + * not successfully sent. 
+ */ + std::deque retries; + + /** SenderState is information carried along with the packet + * throughout the TLB hierarchy + */ + struct SenderState: public Packet::SenderState + { + // the memInst that this is associated with + GPUDynInstPtr _gpuDynInst; + + // the lane in the memInst this is associated with, so we send + // the memory request down the right port + int portIndex; + + // constructor used for packets involved in timing accesses + SenderState(GPUDynInstPtr gpuDynInst, PortID port_index) + : _gpuDynInst(gpuDynInst), portIndex(port_index) { } + + }; + + protected: + ComputeUnit *computeUnit; + int index; + bool stalled; + + virtual bool recvTimingResp(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt) { return 0; } + virtual void recvFunctional(PacketPtr pkt) { } + virtual void recvRangeChange() { } + virtual void recvReqRetry(); + }; + + class ITLBPort : public MasterPort + { + public: + ITLBPort(const std::string &_name, ComputeUnit *_cu) + : MasterPort(_name, _cu), computeUnit(_cu), stalled(false) { } + + + bool isStalled() { return stalled; } + void stallPort() { stalled = true; } + void unstallPort() { stalled = false; } + + /** + * here we queue all the translation requests that were + * not successfully sent. 
+ */ + std::deque retries; + + /** SenderState is information carried along with the packet + * throughout the TLB hierarchy + */ + struct SenderState: public Packet::SenderState + { + // The wavefront associated with this request + Wavefront *wavefront; + + SenderState(Wavefront *_wavefront) : wavefront(_wavefront) { } + }; + + protected: + ComputeUnit *computeUnit; + bool stalled; + + virtual bool recvTimingResp(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt) { return 0; } + virtual void recvFunctional(PacketPtr pkt) { } + virtual void recvRangeChange() { } + virtual void recvReqRetry(); + }; + + /** + * the port intended to communicate between the CU and its LDS + */ + class LDSPort : public MasterPort + { + public: + LDSPort(const std::string &_name, ComputeUnit *_cu, PortID _id) + : MasterPort(_name, _cu, _id), computeUnit(_cu) + { + } + + bool isStalled() const { return stalled; } + void stallPort() { stalled = true; } + void unstallPort() { stalled = false; } + + /** + * here we queue all the requests that were + * not successfully sent. + */ + std::queue retries; + + /** + * SenderState is information carried along with the packet, esp. 
the + * GPUDynInstPtr + */ + class SenderState: public Packet::SenderState + { + protected: + // The actual read/write/atomic request that goes with this command + GPUDynInstPtr _gpuDynInst = nullptr; + + public: + SenderState(GPUDynInstPtr gpuDynInst): + _gpuDynInst(gpuDynInst) + { + } + + GPUDynInstPtr + getMemInst() const + { + return _gpuDynInst; + } + }; + + virtual bool + sendTimingReq(PacketPtr pkt); + + protected: + + bool stalled = false; ///< whether or not it is stalled + + ComputeUnit *computeUnit; + + virtual bool + recvTimingResp(PacketPtr pkt); + + virtual Tick + recvAtomic(PacketPtr pkt) { return 0; } + + virtual void + recvFunctional(PacketPtr pkt) + { + } + + virtual void + recvRangeChange() + { + } + + virtual void + recvReqRetry(); + }; + + /** The port to access the Local Data Store + * Can be connected to a LDS object + */ + LDSPort *ldsPort = nullptr; + + LDSPort * + getLdsPort() const + { + return ldsPort; + } + + /** The memory port for SIMD data accesses. + * Can be connected to PhysMem for Ruby for timing simulations + */ + std::vector memPort; + // port to the TLB hierarchy (i.e., the L1 TLB) + std::vector tlbPort; + // port to the SQC (i.e. 
the I-cache) + SQCPort *sqcPort; + // port to the SQC TLB (there's a separate TLB for each I-cache) + ITLBPort *sqcTLBPort; + + virtual BaseMasterPort& + getMasterPort(const std::string &if_name, PortID idx) + { + if (if_name == "memory_port") { + memPort[idx] = new DataPort(csprintf("%s-port%d", name(), idx), + this, idx); + return *memPort[idx]; + } else if (if_name == "translation_port") { + tlbPort[idx] = new DTLBPort(csprintf("%s-port%d", name(), idx), + this, idx); + return *tlbPort[idx]; + } else if (if_name == "sqc_port") { + sqcPort = new SQCPort(csprintf("%s-port%d", name(), idx), + this, idx); + return *sqcPort; + } else if (if_name == "sqc_tlb_port") { + sqcTLBPort = new ITLBPort(csprintf("%s-port", name()), this); + return *sqcTLBPort; + } else if (if_name == "ldsPort") { + if (ldsPort) { + fatal("an LDS port was already allocated"); + } + ldsPort = new LDSPort(csprintf("%s-port", name()), this, idx); + return *ldsPort; + } else { + panic("incorrect port name"); + } + } + + // xact_cas_load() + class waveIdentifier + { + public: + waveIdentifier() { } + waveIdentifier(int _simdId, int _wfSlotId) + : simdId(_simdId), wfSlotId(_wfSlotId) { } + + int simdId; + int wfSlotId; + }; + + class waveQueue + { + public: + std::list waveIDQueue; + }; + std::map xactCasLoadMap; + + uint64_t getAndIncSeqNum() { return globalSeqNum++; } + + private: + uint64_t globalSeqNum; + int wavefrontSize; +}; + +#endif // __COMPUTE_UNIT_HH__ diff --git a/src/gpu-compute/condition_register_state.cc b/src/gpu-compute/condition_register_state.cc new file mode 100644 index 000000000..f3f2d2927 --- /dev/null +++ b/src/gpu-compute/condition_register_state.cc @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: John Kalamatianos + */ + +#include "gpu-compute/condition_register_state.hh" + +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_static_inst.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/wavefront.hh" + +ConditionRegisterState::ConditionRegisterState() +{ + computeUnit = nullptr; + c_reg.clear(); + busy.clear(); +} + +void +ConditionRegisterState::setParent(ComputeUnit *_computeUnit) +{ + computeUnit = _computeUnit; + _name = computeUnit->name() + ".CondRegState"; +} + +void +ConditionRegisterState::init(uint32_t _size) +{ + c_reg.resize(_size); + busy.resize(_size, 0); +} + +void +ConditionRegisterState::exec(GPUStaticInst *ii, Wavefront *w) +{ + // iterate over all operands + for (auto i = 0; i < ii->getNumOperands(); ++i) { + // is this a condition register destination operand? + if (ii->isCondRegister(i) && ii->isDstOperand(i)) { + // mark the register as busy + markReg(ii->getRegisterIndex(i), 1); + uint32_t pipeLen = w->computeUnit->spBypassLength(); + + // schedule an event for marking the register as ready + w->computeUnit-> + registerEvent(w->simdId, ii->getRegisterIndex(i), + ii->getOperandSize(i), + w->computeUnit->shader->tick_cnt + + w->computeUnit->shader->ticks(pipeLen), 0); + } + } +} diff --git a/src/gpu-compute/condition_register_state.hh b/src/gpu-compute/condition_register_state.hh new file mode 100644 index 000000000..139874a66 --- /dev/null +++ b/src/gpu-compute/condition_register_state.hh @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: John Kalamatianos + */ + +#ifndef __CONDITION_REGISTER_STATE_HH__ +#define __CONDITION_REGISTER_STATE_HH__ + +#include +#include + +#include "gpu-compute/misc.hh" + +class ComputeUnit; +class GPUStaticInst; +class Shader; +class Wavefront; + +// Condition Register State (used only when executing HSAIL) +class ConditionRegisterState +{ + public: + ConditionRegisterState(); + void init(uint32_t _size); + const std::string name() const { return _name; } + void setParent(ComputeUnit *_computeUnit); + void regStats() { } + + template + T + read(int regIdx, int threadId) + { + bool tmp = c_reg[regIdx][threadId]; + T *p0 = (T*)(&tmp); + + return *p0; + } + + template + void + write(int regIdx, int threadId, T value) + { + c_reg[regIdx][threadId] = (bool)(value & 0x01); + } + + void + markReg(int regIdx, uint8_t value) + { + busy.at(regIdx) = value; + } + + uint8_t + regBusy(int idx) + { + uint8_t status = busy.at(idx); + return status; + } + + int numRegs() { return c_reg.size(); } + void exec(GPUStaticInst *ii, Wavefront *w); + + private: + ComputeUnit* computeUnit; + std::string _name; + // Condition Register state + std::vector c_reg; + // flag indicating if a register is busy + std::vector busy; +}; + +#endif diff --git a/src/gpu-compute/dispatcher.cc b/src/gpu-compute/dispatcher.cc new file mode 100644 index 000000000..55e4be72a --- /dev/null +++ b/src/gpu-compute/dispatcher.cc @@ -0,0 +1,394 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Brad Beckmann, Marc Orr + */ + + +#include "gpu-compute/dispatcher.hh" + +#include "cpu/base.hh" +#include "debug/GPUDisp.hh" +#include "gpu-compute/cl_driver.hh" +#include "gpu-compute/cl_event.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/wavefront.hh" +#include "mem/packet_access.hh" + +GpuDispatcher *GpuDispatcher::instance = nullptr; + +GpuDispatcher::GpuDispatcher(const Params *p) + : DmaDevice(p), _masterId(p->system->getMasterId(name() + ".disp")), + pioAddr(p->pio_addr), pioSize(4096), pioDelay(p->pio_latency), + dispatchCount(0), dispatchActive(false), cpu(p->cpu), + shader(p->shader_pointer), driver(p->cl_driver), tickEvent(this) +{ + shader->handshake(this); + driver->handshake(this); + + ndRange.wg_disp_rem = false; + ndRange.globalWgId = 0; + + schedule(&tickEvent, 0); + + // translation port for the dispatcher + tlbPort = new TLBPort(csprintf("%s-port%d", name()), this); + + num_kernelLaunched + .name(name() + ".num_kernel_launched") + .desc("number of kernel launched") + ; +} + +GpuDispatcher *GpuDispatcherParams::create() +{ + GpuDispatcher *dispatcher = new GpuDispatcher(this); + GpuDispatcher::setInstance(dispatcher); + + return GpuDispatcher::getInstance(); +} + +void +GpuDispatcher::serialize(CheckpointOut &cp) const +{ + Tick event_tick = 0; + + if (ndRange.wg_disp_rem) + fatal("Checkpointing not supported during active workgroup execution"); + + if (tickEvent.scheduled()) + event_tick = tickEvent.when(); + + SERIALIZE_SCALAR(event_tick); + +} + +void +GpuDispatcher::unserialize(CheckpointIn &cp) +{ + Tick event_tick; + + if (tickEvent.scheduled()) + deschedule(&tickEvent); + + UNSERIALIZE_SCALAR(event_tick); + + if (event_tick) + schedule(&tickEvent, event_tick); +} + +AddrRangeList +GpuDispatcher::getAddrRanges() const +{ + AddrRangeList ranges; + + DPRINTF(GPUDisp, "dispatcher registering addr range at %#x size %#x\n", + pioAddr, pioSize); + + ranges.push_back(RangeSize(pioAddr, pioSize)); + + return ranges; 
+} + +Tick +GpuDispatcher::read(PacketPtr pkt) +{ + assert(pkt->getAddr() >= pioAddr); + assert(pkt->getAddr() < pioAddr + pioSize); + + int offset = pkt->getAddr() - pioAddr; + pkt->allocate(); + + DPRINTF(GPUDisp, " read register %#x size=%d\n", offset, pkt->getSize()); + + if (offset < 8) { + assert(!offset); + assert(pkt->getSize() == 8); + + uint64_t retval = dispatchActive; + pkt->set(retval); + } else { + offset -= 8; + assert(offset + pkt->getSize() < sizeof(HsaQueueEntry)); + char *curTaskPtr = (char*)&curTask; + + memcpy(pkt->getPtr(), curTaskPtr + offset, pkt->getSize()); + } + + pkt->makeAtomicResponse(); + + return pioDelay; +} + +Tick +GpuDispatcher::write(PacketPtr pkt) +{ + assert(pkt->getAddr() >= pioAddr); + assert(pkt->getAddr() < pioAddr + pioSize); + + int offset = pkt->getAddr() - pioAddr; + +#if TRACING_ON + uint64_t data_val = 0; + + switch (pkt->getSize()) { + case 1: + data_val = pkt->get(); + break; + case 2: + data_val = pkt->get(); + break; + case 4: + data_val = pkt->get(); + break; + case 8: + data_val = pkt->get(); + break; + default: + DPRINTF(GPUDisp, "bad size %d\n", pkt->getSize()); + } + + DPRINTF(GPUDisp, "write register %#x value %#x size=%d\n", offset, data_val, + pkt->getSize()); +#endif + if (!offset) { + static int nextId = 0; + + // The depends field of the qstruct, which was previously unused, is + // used to communicate with simulated application. 
+ if (curTask.depends) { + HostState hs; + shader->ReadMem((uint64_t)(curTask.depends), &hs, + sizeof(HostState), 0); + + // update event start time (in nano-seconds) + uint64_t start = curTick() / 1000; + + shader->WriteMem((uint64_t)(&((_cl_event*)hs.event)->start), + &start, sizeof(uint64_t), 0); + } + + // launch kernel + ++num_kernelLaunched; + + NDRange *ndr = &(ndRangeMap[nextId]); + // copy dispatch info + ndr->q = curTask; + + // update the numDispTask polled by the runtime + accessUserVar(cpu, (uint64_t)(curTask.numDispLeft), 0, 1); + + ndr->numWgTotal = 1; + + for (int i = 0; i < 3; ++i) { + ndr->wgId[i] = 0; + ndr->numWg[i] = divCeil(curTask.gdSize[i], curTask.wgSize[i]); + ndr->numWgTotal *= ndr->numWg[i]; + } + + ndr->numWgCompleted = 0; + ndr->globalWgId = 0; + ndr->wg_disp_rem = true; + ndr->execDone = false; + ndr->addrToNotify = (volatile bool*)curTask.addrToNotify; + ndr->numDispLeft = (volatile uint32_t*)curTask.numDispLeft; + ndr->dispatchId = nextId; + ndr->curTid = pkt->req->threadId(); + DPRINTF(GPUDisp, "launching kernel %d\n",nextId); + execIds.push(nextId); + ++nextId; + + dispatchActive = true; + + if (!tickEvent.scheduled()) { + schedule(&tickEvent, curTick() + shader->ticks(1)); + } + } else { + // populate current task struct + // first 64 bits are launch reg + offset -= 8; + assert(offset < sizeof(HsaQueueEntry)); + char *curTaskPtr = (char*)&curTask; + memcpy(curTaskPtr + offset, pkt->getPtr(), pkt->getSize()); + } + + pkt->makeAtomicResponse(); + + return pioDelay; +} + + +BaseMasterPort& +GpuDispatcher::getMasterPort(const std::string &if_name, PortID idx) +{ + if (if_name == "translation_port") { + return *tlbPort; + } + + return DmaDevice::getMasterPort(if_name, idx); +} + +void +GpuDispatcher::exec() +{ + int fail_count = 0; + + // There are potentially multiple outstanding kernel launches. 
+ // It is possible that the workgroups in a different kernel + // can fit on the GPU even if another kernel's workgroups cannot + DPRINTF(GPUDisp, "Launching %d Kernels\n", execIds.size()); + + while (execIds.size() > fail_count) { + int execId = execIds.front(); + + while (ndRangeMap[execId].wg_disp_rem) { + //update the thread context + shader->updateThreadContext(ndRangeMap[execId].curTid); + + // attempt to dispatch_workgroup + if (!shader->dispatch_workgroups(&ndRangeMap[execId])) { + // if we failed try the next kernel, + // it may have smaller workgroups. + // put it on the queue to rety latter + DPRINTF(GPUDisp, "kernel %d failed to launch\n", execId); + execIds.push(execId); + ++fail_count; + break; + } + } + // let's try the next kernel_id + execIds.pop(); + } + + DPRINTF(GPUDisp, "Returning %d Kernels\n", doneIds.size()); + + if (doneIds.size() && cpu) { + shader->hostWakeUp(cpu); + } + + while (doneIds.size()) { + // wakeup the CPU if any Kernels completed this cycle + DPRINTF(GPUDisp, "WorkGroup %d completed\n", doneIds.front()); + doneIds.pop(); + } +} + +void +GpuDispatcher::notifyWgCompl(Wavefront *w) +{ + int kern_id = w->kern_id; + DPRINTF(GPUDisp, "notify WgCompl %d\n",kern_id); + assert(ndRangeMap[kern_id].dispatchId == kern_id); + ndRangeMap[kern_id].numWgCompleted++; + + if (ndRangeMap[kern_id].numWgCompleted == ndRangeMap[kern_id].numWgTotal) { + ndRangeMap[kern_id].execDone = true; + doneIds.push(kern_id); + + if (ndRangeMap[kern_id].addrToNotify) { + accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].addrToNotify), 1, + 0); + } + + accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].numDispLeft), 0, -1); + + // update event end time (in nano-seconds) + if (ndRangeMap[kern_id].q.depends) { + HostState *host_state = (HostState*)ndRangeMap[kern_id].q.depends; + uint64_t event; + shader->ReadMem((uint64_t)(&host_state->event), &event, + sizeof(uint64_t), 0); + + uint64_t end = curTick() / 1000; + + 
shader->WriteMem((uint64_t)(&((_cl_event*)event)->end), &end, + sizeof(uint64_t), 0); + } + } + + if (!tickEvent.scheduled()) { + schedule(&tickEvent, curTick() + shader->ticks(1)); + } +} + +void +GpuDispatcher::scheduleDispatch() +{ + if (!tickEvent.scheduled()) + schedule(&tickEvent, curTick() + shader->ticks(1)); +} + +void +GpuDispatcher::accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off) +{ + if (cpu) { + if (off) { + shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::ReadReq, + true); + val += off; + } + + shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::WriteReq, true); + } else { + panic("Cannot find host"); + } +} + +GpuDispatcher::TickEvent::TickEvent(GpuDispatcher *_dispatcher) + : Event(CPU_Tick_Pri), dispatcher(_dispatcher) +{ +} + +void +GpuDispatcher::TickEvent::process() +{ + dispatcher->exec(); +} + +const char* +GpuDispatcher::TickEvent::description() const +{ + return "GPU Dispatcher tick"; +} + +// helper functions for driver to retrieve GPU attributes +int +GpuDispatcher::getNumCUs() +{ + return shader->cuList.size(); +} + +void +GpuDispatcher::setFuncargsSize(int funcargs_size) +{ + shader->funcargs_size = funcargs_size; +} diff --git a/src/gpu-compute/dispatcher.hh b/src/gpu-compute/dispatcher.hh new file mode 100644 index 000000000..76f932655 --- /dev/null +++ b/src/gpu-compute/dispatcher.hh @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Brad Beckmann, Marc Orr + */ + +#ifndef __GPU_DISPATCHER_HH__ +#define __GPU_DISPATCHER_HH__ + +#include +#include + +#include "base/statistics.hh" +#include "dev/dma_device.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/ndrange.hh" +#include "gpu-compute/qstruct.hh" +#include "mem/port.hh" +#include "params/GpuDispatcher.hh" + +class BaseCPU; +class Shader; + +class GpuDispatcher : public DmaDevice +{ + public: + typedef GpuDispatcherParams Params; + + class TickEvent : public Event + { + private: + GpuDispatcher *dispatcher; + + public: + TickEvent(GpuDispatcher *); + void process(); + const char *description() const; + }; + + MasterID masterId() { return _masterId; } + + protected: + MasterID _masterId; + + // Base and length of PIO register space + Addr pioAddr; + Addr pioSize; + Tick pioDelay; + + HsaQueueEntry curTask; + + std::unordered_map ndRangeMap; + NDRange ndRange; + + // list of kernel_ids to launch + std::queue execIds; + // list of kernel_ids that have finished + std::queue doneIds; + + uint64_t dispatchCount; + // is there a kernel in execution? + bool dispatchActive; + + BaseCPU *cpu; + Shader *shader; + ClDriver *driver; + TickEvent tickEvent; + + static GpuDispatcher *instance; + + // sycall emulation mode can have only 1 application running(?) + // else we have to do some pid based tagging + // unused + typedef std::unordered_map TranslationBuffer; + TranslationBuffer tlb; + + public: + /*statistics*/ + Stats::Scalar num_kernelLaunched; + GpuDispatcher(const Params *p); + + ~GpuDispatcher() { } + + void exec(); + virtual void serialize(CheckpointOut &cp) const; + virtual void unserialize(CheckpointIn &cp); + void notifyWgCompl(Wavefront *w); + void scheduleDispatch(); + void accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off); + + // using singleton so that glue code can pass pointer locations + // to the dispatcher. 
when there are multiple dispatchers, we can + // call something like getInstance(index) + static void + setInstance(GpuDispatcher *_instance) + { + instance = _instance; + } + + static GpuDispatcher* getInstance() { return instance; } + + class TLBPort : public MasterPort + { + public: + + TLBPort(const std::string &_name, GpuDispatcher *_dispatcher) + : MasterPort(_name, _dispatcher), dispatcher(_dispatcher) { } + + protected: + GpuDispatcher *dispatcher; + + virtual bool recvTimingResp(PacketPtr pkt) { return true; } + virtual Tick recvAtomic(PacketPtr pkt) { return 0; } + virtual void recvFunctional(PacketPtr pkt) { } + virtual void recvRangeChange() { } + virtual void recvReqRetry() { } + + }; + + TLBPort *tlbPort; + + virtual BaseMasterPort& getMasterPort(const std::string &if_name, + PortID idx); + + AddrRangeList getAddrRanges() const; + Tick read(PacketPtr pkt); + Tick write(PacketPtr pkt); + + // helper functions to retrieve/set GPU attributes + int getNumCUs(); + void setFuncargsSize(int funcargs_size); +}; + +#endif // __GPU_DISPATCHER_HH__ diff --git a/src/gpu-compute/exec_stage.cc b/src/gpu-compute/exec_stage.cc new file mode 100644 index 000000000..c2b95f85e --- /dev/null +++ b/src/gpu-compute/exec_stage.cc @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos, Sooraj Puthoor + */ + +#include "gpu-compute/exec_stage.hh" + +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/wavefront.hh" + +ExecStage::ExecStage(const ComputeUnitParams *p) : numSIMDs(p->num_SIMDs), + numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes), + vectorAluInstAvail(nullptr), glbMemInstAvail(nullptr), + shrMemInstAvail(nullptr), lastTimeInstExecuted(false), + thisTimeInstExecuted(false), instrExecuted (false), + executionResourcesUsed(0) +{ + numTransActiveIdle = 0; + idle_dur = 0; +} + +void +ExecStage::init(ComputeUnit *cu) +{ + computeUnit = cu; + _name = computeUnit->name() + ".ExecStage"; + dispatchList = &computeUnit->dispatchList; + vectorAluInstAvail = &(computeUnit->vectorAluInstAvail); + glbMemInstAvail= &(computeUnit->glbMemInstAvail); + shrMemInstAvail= &(computeUnit->shrMemInstAvail); + idle_dur = 0; +} + +void +ExecStage::collectStatistics(enum STAT_STATUS stage, int unitId) { + if (stage == 
IdleExec) { + // count cycles of no vector ALU instruction executed + // even if one was the oldest in a WV of that vector SIMD unit + if (computeUnit->isVecAlu(unitId) && vectorAluInstAvail->at(unitId)) { + numCyclesWithNoInstrTypeIssued[unitId]++; + } + + // count cycles of no global memory (vector) instruction executed + // even if one was the oldest in a WV of that vector SIMD unit + if (computeUnit->isGlbMem(unitId) && *glbMemInstAvail > 0) { + numCyclesWithNoInstrTypeIssued[unitId]++; + (*glbMemInstAvail)--; + } + + // count cycles of no shared memory (vector) instruction executed + // even if one was the oldest in a WV of that vector SIMD unit + if (computeUnit->isShrMem(unitId) && *shrMemInstAvail > 0) { + numCyclesWithNoInstrTypeIssued[unitId]++; + (*shrMemInstAvail)--; + } + } else if (stage == BusyExec) { + // count the number of cycles an instruction to a specific unit + // was issued + numCyclesWithInstrTypeIssued[unitId]++; + thisTimeInstExecuted = true; + instrExecuted = true; + ++executionResourcesUsed; + } else if (stage == PostExec) { + // count the number of transitions from active to idle + if (lastTimeInstExecuted && !thisTimeInstExecuted) { + ++numTransActiveIdle; + } + + if (!lastTimeInstExecuted && thisTimeInstExecuted) { + idleDur.sample(idle_dur); + idle_dur = 0; + } else if (!thisTimeInstExecuted) { + idle_dur++; + } + + lastTimeInstExecuted = thisTimeInstExecuted; + // track the number of cycles we either issued one vector instruction + // or issued no instructions at all + if (instrExecuted) { + numCyclesWithInstrIssued++; + } else { + numCyclesWithNoIssue++; + } + + spc.sample(executionResourcesUsed); + } +} + +void +ExecStage::initStatistics() +{ + instrExecuted = false; + executionResourcesUsed = 0; + thisTimeInstExecuted = false; +} + +void +ExecStage::exec() +{ + initStatistics(); + + for (int unitId = 0; unitId < (numSIMDs + numMemUnits); ++unitId) { + // if dispatch list for this execution resource is empty, + // skip this 
execution resource this cycle + if (dispatchList->at(unitId).second == EMPTY) { + collectStatistics(IdleExec, unitId); + continue; + } + + collectStatistics(BusyExec, unitId); + // execute an instruction for the WF + dispatchList->at(unitId).first->exec(); + // clear the dispatch list entry + dispatchList->at(unitId).second = EMPTY; + dispatchList->at(unitId).first = (Wavefront*)nullptr; + } + + collectStatistics(PostExec, 0); +} + +void +ExecStage::regStats() +{ + numTransActiveIdle + .name(name() + ".num_transitions_active_to_idle") + .desc("number of CU transitions from active to idle") + ; + + numCyclesWithNoIssue + .name(name() + ".num_cycles_with_no_issue") + .desc("number of cycles the CU issues nothing") + ; + + numCyclesWithInstrIssued + .name(name() + ".num_cycles_with_instr_issued") + .desc("number of cycles the CU issued at least one instruction") + ; + + spc + .init(0, numSIMDs + numMemUnits, 1) + .name(name() + ".spc") + .desc("Execution units active per cycle (Exec unit=SIMD,MemPipe)") + ; + + idleDur + .init(0,75,5) + .name(name() + ".idle_duration_in_cycles") + .desc("duration of idle periods in cycles") + ; + + numCyclesWithInstrTypeIssued + .init(numSIMDs + numMemUnits) + .name(name() + ".num_cycles_with_instrtype_issue") + .desc("Number of cycles at least one instruction of specific type " + "issued") + ; + + numCyclesWithNoInstrTypeIssued + .init(numSIMDs + numMemUnits) + .name(name() + ".num_cycles_with_instr_type_no_issue") + .desc("Number of cycles no instruction of specific type issued") + ; + + for (int i = 0; i < numSIMDs; ++i) { + numCyclesWithInstrTypeIssued.subname(i, csprintf("ALU%d",i)); + numCyclesWithNoInstrTypeIssued.subname(i, csprintf("ALU%d",i)); + } + + numCyclesWithInstrTypeIssued.subname(numSIMDs, csprintf("GM")); + numCyclesWithNoInstrTypeIssued.subname(numSIMDs, csprintf("GM")); + numCyclesWithInstrTypeIssued.subname(numSIMDs + 1, csprintf("LM")); + numCyclesWithNoInstrTypeIssued.subname(numSIMDs + 1, csprintf("LM")); +} 
diff --git a/src/gpu-compute/exec_stage.hh b/src/gpu-compute/exec_stage.hh new file mode 100644 index 000000000..2de74366b --- /dev/null +++ b/src/gpu-compute/exec_stage.hh @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: John Kalamatianos, Sooraj Puthoor + */ + +#ifndef __EXEC_STAGE_HH__ +#define __EXEC_STAGE_HH__ + +#include +#include +#include + +#include "sim/stats.hh" + +class ComputeUnit; +class Wavefront; +struct ComputeUnitParams; + +enum STAT_STATUS +{ + IdleExec, + BusyExec, + PostExec +}; + +enum DISPATCH_STATUS +{ + EMPTY = 0, + FILLED +}; + +// Execution stage. +// Each execution resource executes the +// wave which is in its dispatch list. +// The schedule stage is responsible for +// adding a wave into each execution resource's +// dispatch list. + +class ExecStage +{ + public: + ExecStage(const ComputeUnitParams* params); + ~ExecStage() { } + void init(ComputeUnit *cu); + void exec(); + + std::string name() { return _name; } + void regStats(); + // number of idle cycles + Stats::Scalar numCyclesWithNoIssue; + // number of busy cycles + Stats::Scalar numCyclesWithInstrIssued; + // number of cycles (per execution unit) during which at least one + // instruction was issued to that unit + Stats::Vector numCyclesWithInstrTypeIssued; + // number of idle cycles (per execution unit) during which the unit issued + // no instruction targeting that unit, even though there is at least one + // Wavefront with such an instruction as the oldest + Stats::Vector numCyclesWithNoInstrTypeIssued; + // SIMDs active per cycle + Stats::Distribution spc; + + private: + void collectStatistics(enum STAT_STATUS stage, int unitId); + void initStatistics(); + ComputeUnit *computeUnit; + uint32_t numSIMDs; + + // Number of memory execution resources; + // both global and local memory execution resources in CU + uint32_t numMemUnits; + + // List of waves which will be dispatched to + // each execution resource. A FILLED implies + // dispatch list is non-empty and + // execution unit has something to execute + // this cycle. Currently, the dispatch list of + // an execution resource can hold only one wave because + // an execution resource can execute only one wave in a cycle. 
+ // dispatchList is used to communicate between schedule + // and exec stage + std::vector> *dispatchList; + // flag per vector SIMD unit that is set when there is at least one + // WV that has a vector ALU instruction as the oldest in its + // Instruction Buffer + std::vector *vectorAluInstAvail; + int *glbMemInstAvail; + int *shrMemInstAvail; + bool lastTimeInstExecuted; + bool thisTimeInstExecuted; + bool instrExecuted; + Stats::Scalar numTransActiveIdle; + Stats::Distribution idleDur; + uint32_t executionResourcesUsed; + uint64_t idle_dur; + std::string _name; +}; + +#endif // __EXEC_STAGE_HH__ diff --git a/src/gpu-compute/fetch_stage.cc b/src/gpu-compute/fetch_stage.cc new file mode 100644 index 000000000..1f5e6ded3 --- /dev/null +++ b/src/gpu-compute/fetch_stage.cc @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez, Sooraj Puthoor + */ + +#include "gpu-compute/fetch_stage.hh" + +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/wavefront.hh" + +FetchStage::FetchStage(const ComputeUnitParams* p) : numSIMDs(p->num_SIMDs), + computeUnit(nullptr) +{ + for (int j = 0; j < numSIMDs; ++j) { + FetchUnit newFetchUnit(p); + fetchUnit.push_back(newFetchUnit); + } +} + +FetchStage::~FetchStage() +{ + fetchUnit.clear(); +} + +void +FetchStage::init(ComputeUnit *cu) +{ + computeUnit = cu; + _name = computeUnit->name() + ".FetchStage"; + + for (int j = 0; j < numSIMDs; ++j) { + fetchUnit[j].bindWaveList(&computeUnit->wfList[j]); + fetchUnit[j].init(computeUnit); + } +} + +void +FetchStage::exec() +{ + for (int j = 0; j < numSIMDs; ++j) { + fetchUnit[j].exec(); + } +} + +void +FetchStage::processFetchReturn(PacketPtr pkt) +{ + ComputeUnit::SQCPort::SenderState *sender_state = + safe_cast(pkt->senderState); + + Wavefront *wavefront = sender_state->wavefront; + + const unsigned num_instructions = pkt->req->getSize() / + sizeof(TheGpuISA::RawMachInst); + + instFetchInstReturned.sample(num_instructions); + uint32_t simdId = wavefront->simdId; + fetchUnit[simdId].processFetchReturn(pkt); +} + +void +FetchStage::fetch(PacketPtr pkt, Wavefront *wavefront) +{ + fetchUnit[wavefront->simdId].fetch(pkt, wavefront); +} + +void +FetchStage::regStats() +{ + instFetchInstReturned + .init(1, 32, 1) + .name(name() + 
".inst_fetch_instr_returned") + .desc("For each instruction fetch request received record how many " + "instructions you got from it") + ; +} diff --git a/src/gpu-compute/fetch_stage.hh b/src/gpu-compute/fetch_stage.hh new file mode 100644 index 000000000..ce7faa8ac --- /dev/null +++ b/src/gpu-compute/fetch_stage.hh @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez, Sooraj Puthoor + */ + +#ifndef __FETCH_STAGE_HH__ +#define __FETCH_STAGE_HH__ + +#include +#include + +#include "gpu-compute/fetch_unit.hh" + +// Instruction fetch stage. +// All dispatched wavefronts for all SIMDS are analyzed for the +// need to fetch instructions. From the fetch eligible waves, +// one wave is selected from each SIMD and fetch is initiated +// for the selected waves. + +class ComputeUnit; +class Wavefront; + +class FetchStage +{ + public: + FetchStage(const ComputeUnitParams* params); + ~FetchStage(); + void init(ComputeUnit *cu); + void exec(); + void processFetchReturn(PacketPtr pkt); + void fetch(PacketPtr pkt, Wavefront *wave); + + // Stats related variables and methods + std::string name() { return _name; } + void regStats(); + Stats::Distribution instFetchInstReturned; + + private: + uint32_t numSIMDs; + ComputeUnit *computeUnit; + + // List of fetch units. A fetch unit is + // instantiated per SIMD + std::vector fetchUnit; + std::string _name; +}; + +#endif // __FETCH_STAGE_HH__ diff --git a/src/gpu-compute/fetch_unit.cc b/src/gpu-compute/fetch_unit.cc new file mode 100644 index 000000000..1f0a7d78e --- /dev/null +++ b/src/gpu-compute/fetch_unit.cc @@ -0,0 +1,293 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Brad Beckmann, Sooraj Puthoor + */ + +#include "gpu-compute/fetch_unit.hh" + +#include "debug/GPUFetch.hh" +#include "debug/GPUPort.hh" +#include "debug/GPUTLB.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/gpu_static_inst.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/wavefront.hh" +#include "mem/ruby/system/RubySystem.hh" + +uint32_t FetchUnit::globalFetchUnitID; + +FetchUnit::FetchUnit(const ComputeUnitParams* params) : + timingSim(true), + computeUnit(nullptr), + fetchScheduler(params), + waveList(nullptr) +{ +} + +FetchUnit::~FetchUnit() +{ + fetchQueue.clear(); + fetchStatusQueue.clear(); +} + +void +FetchUnit::init(ComputeUnit *cu) +{ + computeUnit = cu; + timingSim = computeUnit->shader->timingSim; + fetchQueue.clear(); + fetchStatusQueue.resize(computeUnit->shader->n_wf); + + for (int j = 0; j < computeUnit->shader->n_wf; ++j) { + fetchStatusQueue[j] = std::make_pair(waveList->at(j), false); + } + + fetchScheduler.bindList(&fetchQueue); +} + +void +FetchUnit::exec() +{ + // re-evaluate waves which are marked as not ready for fetch + for (int j = 0; j < computeUnit->shader->n_wf; ++j) { + // Following code assumes 64-bit operation and all insts are + // represented by 64-bit pointers to inst objects. 
+ Wavefront *curWave = fetchStatusQueue[j].first; + assert (curWave); + + // The wavefront has to be active, the IB occupancy has to be + // 4 or less instructions and it can not have any branches to + // prevent speculative instruction fetches + if (!fetchStatusQueue[j].second) { + if (curWave->status == Wavefront::S_RUNNING && + curWave->instructionBuffer.size() <= 4 && + !curWave->instructionBufferHasBranch() && + !curWave->pendingFetch) { + fetchQueue.push_back(curWave); + fetchStatusQueue[j].second = true; + } + } + } + + // Fetch only if there is some wave ready to be fetched + // An empty fetchQueue will cause the scheduler to panic + if (fetchQueue.size()) { + Wavefront *waveToBeFetched = fetchScheduler.chooseWave(); + waveToBeFetched->pendingFetch = true; + fetchStatusQueue[waveToBeFetched->wfSlotId].second = false; + initiateFetch(waveToBeFetched); + } +} + +void +FetchUnit::initiateFetch(Wavefront *wavefront) +{ + // calculate the virtual address to fetch from the SQC + Addr vaddr = wavefront->pc() + wavefront->instructionBuffer.size(); + vaddr = wavefront->base_ptr + vaddr * sizeof(GPUStaticInst*); + + DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Initiating fetch translation: %#x\n", + computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, vaddr); + + // Since this is an instruction prefetch, if you're split then just finish + // out the current line. 
+ unsigned block_size = RubySystem::getBlockSizeBytes(); + // check for split accesses + Addr split_addr = roundDown(vaddr + block_size - 1, block_size); + unsigned size = block_size; + + if (split_addr > vaddr) { + // misaligned access, just grab the rest of the line + size = split_addr - vaddr; + } + + // set up virtual request + Request *req = new Request(0, vaddr, size, Request::INST_FETCH, + computeUnit->masterId(), 0, 0, 0); + + PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + // This fetchBlock is kind of faux right now - because the translations so + // far don't actually return Data + uint64_t fetchBlock; + pkt->dataStatic(&fetchBlock); + + if (timingSim) { + // SenderState needed on Return + pkt->senderState = new ComputeUnit::ITLBPort::SenderState(wavefront); + + // Sender State needed by TLB hierarchy + pkt->senderState = + new TheISA::GpuTLB::TranslationState(BaseTLB::Execute, + computeUnit->shader->gpuTc, + false, pkt->senderState); + + if (computeUnit->sqcTLBPort->isStalled()) { + assert(computeUnit->sqcTLBPort->retries.size() > 0); + + DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n", + vaddr); + + computeUnit->sqcTLBPort->retries.push_back(pkt); + } else if (!computeUnit->sqcTLBPort->sendTimingReq(pkt)) { + // Stall the data port; + // No more packet is issued till + // ruby indicates resources are freed by + // a recvReqRetry() call back on this port. 
+ computeUnit->sqcTLBPort->stallPort(); + + DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n", + vaddr); + + computeUnit->sqcTLBPort->retries.push_back(pkt); + } else { + DPRINTF(GPUTLB, "sent FETCH translation request for %#x\n", vaddr); + } + } else { + pkt->senderState = + new TheISA::GpuTLB::TranslationState(BaseTLB::Execute, + computeUnit->shader->gpuTc); + + computeUnit->sqcTLBPort->sendFunctional(pkt); + + TheISA::GpuTLB::TranslationState *sender_state = + safe_cast(pkt->senderState); + + delete sender_state->tlbEntry; + delete sender_state; + // fetch the instructions from the SQC when we operate in + // functional mode only + fetch(pkt, wavefront); + } +} + +void +FetchUnit::fetch(PacketPtr pkt, Wavefront *wavefront) +{ + assert(pkt->req->hasPaddr()); + assert(pkt->req->hasSize()); + + DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch Access: %#x\n", + computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, + pkt->req->getPaddr()); + + // this is necessary because the GPU TLB receives packets instead of + // requests. when the translation is complete, all relevant fields in the + // request will be populated, but not in the packet. here we create the + // new packet so we can set the size, addr, and proper flags. + PacketPtr oldPkt = pkt; + pkt = new Packet(oldPkt->req, oldPkt->cmd); + delete oldPkt; + + TheGpuISA::RawMachInst *data = + new TheGpuISA::RawMachInst[pkt->req->getSize() / + sizeof(TheGpuISA::RawMachInst)]; + + pkt->dataDynamic(data); + + // New SenderState for the memory access + pkt->senderState = new ComputeUnit::SQCPort::SenderState(wavefront); + + if (timingSim) { + // translation is done. Send the appropriate timing memory request. 
+ + if (!computeUnit->sqcPort->sendTimingReq(pkt)) { + computeUnit->sqcPort->retries.push_back(std::make_pair(pkt, + wavefront)); + + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Fetch addr %#x failed!\n", + computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, + pkt->req->getPaddr()); + } else { + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Fetch addr %#x sent!\n", + computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, + pkt->req->getPaddr()); + } + } else { + computeUnit->sqcPort->sendFunctional(pkt); + processFetchReturn(pkt); + } +} + +void +FetchUnit::processFetchReturn(PacketPtr pkt) +{ + ComputeUnit::SQCPort::SenderState *sender_state = + safe_cast(pkt->senderState); + + Wavefront *wavefront = sender_state->wavefront; + + DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch addr %#x returned " + "%d bytes, %d instructions!\n", computeUnit->cu_id, + wavefront->simdId, wavefront->wfSlotId, pkt->req->getPaddr(), + pkt->req->getSize(), pkt->req->getSize() / + sizeof(TheGpuISA::RawMachInst)); + + if (wavefront->dropFetch) { + assert(wavefront->instructionBuffer.empty()); + wavefront->dropFetch = false; + } else { + TheGpuISA::RawMachInst *inst_index_ptr = + (TheGpuISA::RawMachInst*)pkt->getPtr(); + + assert(wavefront->instructionBuffer.size() <= 4); + + for (int i = 0; i < pkt->req->getSize() / + sizeof(TheGpuISA::RawMachInst); ++i) { + GPUStaticInst *inst_ptr = decoder.decode(inst_index_ptr[i]); + + assert(inst_ptr); + DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: added %s\n", + computeUnit->cu_id, wavefront->simdId, + wavefront->wfSlotId, inst_ptr->disassemble()); + + GPUDynInstPtr gpuDynInst = + std::make_shared(computeUnit, wavefront, inst_ptr, + computeUnit->getAndIncSeqNum()); + + wavefront->instructionBuffer.push_back(gpuDynInst); + } + } + + wavefront->pendingFetch = false; + + delete pkt->senderState; + delete pkt->req; + delete pkt; +} + +void +FetchUnit::bindWaveList(std::vector *wave_list) +{ + waveList = wave_list; +} diff --git a/src/gpu-compute/fetch_unit.hh 
b/src/gpu-compute/fetch_unit.hh new file mode 100644 index 000000000..c7c6afb3c --- /dev/null +++ b/src/gpu-compute/fetch_unit.hh @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Brad Beckmann, Sooraj Puthoor + */ + +#ifndef __FETCH_UNIT_HH__ +#define __FETCH_UNIT_HH__ + +#include +#include +#include + +#include "arch/gpu_decoder.hh" +#include "base/statistics.hh" +#include "config/the_gpu_isa.hh" +#include "gpu-compute/scheduler.hh" +#include "mem/packet.hh" + +class ComputeUnit; +class Wavefront; + +class FetchUnit +{ + public: + FetchUnit(const ComputeUnitParams* params); + ~FetchUnit(); + void init(ComputeUnit *cu); + void exec(); + void bindWaveList(std::vector *list); + void initiateFetch(Wavefront *wavefront); + void fetch(PacketPtr pkt, Wavefront *wavefront); + void processFetchReturn(PacketPtr pkt); + static uint32_t globalFetchUnitID; + + private: + bool timingSim; + ComputeUnit *computeUnit; + TheGpuISA::Decoder decoder; + + // Fetch scheduler; Selects one wave from + // the fetch queue for instruction fetching. + // The selection is made according to + // a scheduling policy + Scheduler fetchScheduler; + + // Stores the list of waves that are + // ready to be fetched this cycle + std::vector fetchQueue; + + // Stores the fetch status of all waves dispatched to this SIMD. + // TRUE implies the wave is ready to fetch and is already + // moved to fetchQueue + std::vector> fetchStatusQueue; + + // Pointer to list of waves dispatched on to this SIMD unit + std::vector *waveList; +}; + +#endif // __FETCH_UNIT_HH__ diff --git a/src/gpu-compute/global_memory_pipeline.cc b/src/gpu-compute/global_memory_pipeline.cc new file mode 100644 index 000000000..913327412 --- /dev/null +++ b/src/gpu-compute/global_memory_pipeline.cc @@ -0,0 +1,242 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: John Kalamatianos, Sooraj Puthoor + */ + +#include "gpu-compute/global_memory_pipeline.hh" + +#include "debug/GPUMem.hh" +#include "debug/GPUReg.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/vector_register_file.hh" +#include "gpu-compute/wavefront.hh" + +GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams* p) : + computeUnit(nullptr), gmQueueSize(p->global_mem_queue_size), + inflightStores(0), inflightLoads(0) +{ +} + +void +GlobalMemPipeline::init(ComputeUnit *cu) +{ + computeUnit = cu; + globalMemSize = computeUnit->shader->globalMemSize; + _name = computeUnit->name() + ".GlobalMemPipeline"; +} + +void +GlobalMemPipeline::exec() +{ + // apply any returned global memory operations + GPUDynInstPtr m = !gmReturnedLoads.empty() ? gmReturnedLoads.front() : + !gmReturnedStores.empty() ? gmReturnedStores.front() : nullptr; + + bool accessVrf = true; + // check the VRF to see if the operands of a load (or load component + // of an atomic) are accessible + if ((m) && (m->m_op==Enums::MO_LD || MO_A(m->m_op))) { + Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId]; + + accessVrf = + w->computeUnit->vrf[m->simdId]-> + vrfOperandAccessReady(m->seqNum(), w, m, + VrfAccessType::WRITE); + } + + if ((!gmReturnedStores.empty() || !gmReturnedLoads.empty()) && + m->latency.rdy() && computeUnit->glbMemToVrfBus.rdy() && + accessVrf && m->statusBitVector == VectorMask(0) && + (computeUnit->shader->coissue_return || + computeUnit->wfWait.at(m->pipeId).rdy())) { + + if (m->v_type == VT_32 && m->m_type == Enums::M_U8) + doGmReturn(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_U16) + doGmReturn(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_U32) + doGmReturn(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_S8) + doGmReturn(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_S16) + doGmReturn(m); + else if (m->v_type == 
VT_32 && m->m_type == Enums::M_S32) + doGmReturn(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_F16) + doGmReturn(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_F32) + doGmReturn(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_U8) + doGmReturn(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_U16) + doGmReturn(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_U32) + doGmReturn(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_U64) + doGmReturn(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_S8) + doGmReturn(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_S16) + doGmReturn(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_S32) + doGmReturn(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_S64) + doGmReturn(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_F16) + doGmReturn(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_F32) + doGmReturn(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_F64) + doGmReturn(m); + } + + // If pipeline has executed a global memory instruction + // execute global memory packets and issue global + // memory packets to DTLB + if (!gmIssuedRequests.empty()) { + GPUDynInstPtr mp = gmIssuedRequests.front(); + if (mp->m_op == Enums::MO_LD || + (mp->m_op >= Enums::MO_AAND && mp->m_op <= Enums::MO_AMIN) || + (mp->m_op >= Enums::MO_ANRAND && mp->m_op <= Enums::MO_ANRMIN)) { + + if (inflightLoads >= gmQueueSize) { + return; + } else { + ++inflightLoads; + } + } else { + if (inflightStores >= gmQueueSize) { + return; + } else { + ++inflightStores; + } + } + + mp->initiateAcc(mp); + gmIssuedRequests.pop(); + + DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = %s\n", + computeUnit->cu_id, mp->simdId, mp->wfSlotId, + Enums::MemOpTypeStrings[mp->m_op]); + } +} + +template +void +GlobalMemPipeline::doGmReturn(GPUDynInstPtr m) +{ + Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId]; + + // Return data to registers + 
if (m->m_op == Enums::MO_LD || MO_A(m->m_op) || MO_ANR(m->m_op)) { + gmReturnedLoads.pop(); + assert(inflightLoads > 0); + --inflightLoads; + + if (m->m_op == Enums::MO_LD || MO_A(m->m_op)) { + std::vector regVec; + // iterate over number of destination register operands since + // this is a load or atomic operation + for (int k = 0; k < m->n_reg; ++k) { + assert((sizeof(c1) * m->n_reg) <= MAX_WIDTH_FOR_MEM_INST); + int dst = m->dst_reg + k; + + if (m->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST) + dst = m->dst_reg_vec[k]; + // virtual->physical VGPR mapping + int physVgpr = w->remap(dst, sizeof(c0), 1); + // save the physical VGPR index + regVec.push_back(physVgpr); + c1 *p1 = &((c1*)m->d_data)[k * VSZ]; + + for (int i = 0; i < VSZ; ++i) { + if (m->exec_mask[i]) { + DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: " + "$%s%d <- %d global ld done (src = wavefront " + "ld inst)\n", w->computeUnit->cu_id, w->simdId, + w->wfSlotId, i, sizeof(c0) == 4 ? "s" : "d", + dst, *p1); + // write the value into the physical VGPR. This is a + // purely functional operation. No timing is modeled. + w->computeUnit->vrf[w->simdId]->write(physVgpr, + *p1, i); + } + ++p1; + } + } + + // Schedule the write operation of the load data on the VRF. + // This simply models the timing aspect of the VRF write operation. + // It does not modify the physical VGPR. 
+ loadVrfBankConflictCycles += + w->computeUnit->vrf[w->simdId]->exec(m->seqNum(), + w, regVec, sizeof(c0), + m->time); + } + } else { + gmReturnedStores.pop(); + assert(inflightStores > 0); + --inflightStores; + } + + // Decrement outstanding register count + computeUnit->shader->ScheduleAdd(&w->outstanding_reqs, m->time, -1); + + if (m->m_op == Enums::MO_ST || MO_A(m->m_op) || MO_ANR(m->m_op) || + MO_H(m->m_op)) { + computeUnit->shader->ScheduleAdd(&w->outstanding_reqs_wr_gm, m->time, + -1); + } + + if (m->m_op == Enums::MO_LD || MO_A(m->m_op) || MO_ANR(m->m_op)) { + computeUnit->shader->ScheduleAdd(&w->outstanding_reqs_rd_gm, m->time, + -1); + } + + // Mark write bus busy for appropriate amount of time + computeUnit->glbMemToVrfBus.set(m->time); + if (!computeUnit->shader->coissue_return) + w->computeUnit->wfWait.at(m->pipeId).set(m->time); +} + +void +GlobalMemPipeline::regStats() +{ + loadVrfBankConflictCycles + .name(name() + ".load_vrf_bank_conflict_cycles") + .desc("total number of cycles GM data are delayed before updating " + "the VRF") + ; +} diff --git a/src/gpu-compute/global_memory_pipeline.hh b/src/gpu-compute/global_memory_pipeline.hh new file mode 100644 index 000000000..ed49f6f6b --- /dev/null +++ b/src/gpu-compute/global_memory_pipeline.hh @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos, Sooraj Puthoor + */ + +#ifndef __GLOBAL_MEMORY_PIPELINE_HH__ +#define __GLOBAL_MEMORY_PIPELINE_HH__ + +#include +#include + +#include "gpu-compute/misc.hh" +#include "params/ComputeUnit.hh" +#include "sim/stats.hh" + +/* + * @file global_memory_pipeline.hh + * + * The global memory pipeline issues newly created global memory packets + * from the pipeline to DTLB. The exec() method of the memory packet issues + * the packet to the DTLB if there is space available in the return fifo. + * This stage also retires previously issued loads and stores that have + * returned from the memory sub-system. 
+ */ + +class ComputeUnit; + +class GlobalMemPipeline +{ + public: + GlobalMemPipeline(const ComputeUnitParams *params); + void init(ComputeUnit *cu); + void exec(); + + template void doGmReturn(GPUDynInstPtr m); + + std::queue &getGMReqFIFO() { return gmIssuedRequests; } + std::queue &getGMStRespFIFO() { return gmReturnedStores; } + std::queue &getGMLdRespFIFO() { return gmReturnedLoads; } + + bool + isGMLdRespFIFOWrRdy() const + { + return gmReturnedLoads.size() < gmQueueSize; + } + + bool + isGMStRespFIFOWrRdy() const + { + return gmReturnedStores.size() < gmQueueSize; + } + + bool + isGMReqFIFOWrRdy(uint32_t pendReqs=0) const + { + return (gmIssuedRequests.size() + pendReqs) < gmQueueSize; + } + + const std::string &name() const { return _name; } + void regStats(); + + private: + ComputeUnit *computeUnit; + std::string _name; + int gmQueueSize; + + // number of cycles of delaying the update of a VGPR that is the + // target of a load instruction (or the load component of an atomic) + // The delay is due to VRF bank conflicts + Stats::Scalar loadVrfBankConflictCycles; + // Counters to track the inflight loads and stores + // so that we can provide the proper backpressure + // on the number of inflight memory operations. + int inflightStores; + int inflightLoads; + + // The size of global memory. 
+ int globalMemSize; + + // Global Memory Request FIFO: all global memory requests + // are issued to this FIFO from the memory pipelines + std::queue gmIssuedRequests; + + // Global Store Response FIFO: all responses of global memory + // stores are sent to this FIFO from TCP + std::queue gmReturnedStores; + + // Global Load Response FIFO: all responses of global memory + // loads are sent to this FIFO from TCP + std::queue gmReturnedLoads; +}; + +#endif // __GLOBAL_MEMORY_PIPELINE_HH__ diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc new file mode 100644 index 000000000..83e348dbe --- /dev/null +++ b/src/gpu-compute/gpu_dyn_inst.cc @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#include "gpu-compute/gpu_dyn_inst.hh" + +#include "debug/GPUMem.hh" +#include "gpu-compute/gpu_static_inst.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/wavefront.hh" + +GPUDynInst::GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, + GPUStaticInst *_staticInst, uint64_t instSeqNum) + : GPUExecContext(_cu, _wf), m_op(Enums::MO_UNDEF), + memoryOrder(Enums::MEMORY_ORDER_NONE), useContinuation(false), + statusBitVector(0), staticInst(_staticInst), _seqNum(instSeqNum) +{ + tlbHitLevel.assign(VSZ, -1); +} + +void +GPUDynInst::execute() +{ + GPUDynInstPtr gpuDynInst = std::make_shared(cu, wf, staticInst, + _seqNum); + staticInst->execute(gpuDynInst); +} + +int +GPUDynInst::numSrcRegOperands() +{ + return staticInst->numSrcRegOperands(); +} + +int +GPUDynInst::numDstRegOperands() +{ + return staticInst->numDstRegOperands(); +} + +int +GPUDynInst::getNumOperands() +{ + return staticInst->getNumOperands(); +} + +bool +GPUDynInst::isVectorRegister(int operandIdx) +{ + return staticInst->isVectorRegister(operandIdx); +} + +bool +GPUDynInst::isScalarRegister(int operandIdx) +{ + return staticInst->isVectorRegister(operandIdx); +} + +int +GPUDynInst::getRegisterIndex(int operandIdx) +{ + return staticInst->getRegisterIndex(operandIdx); +} + +int +GPUDynInst::getOperandSize(int operandIdx) +{ + return staticInst->getOperandSize(operandIdx); +} + +bool +GPUDynInst::isDstOperand(int operandIdx) +{ 
+ return staticInst->isDstOperand(operandIdx); +} + +bool +GPUDynInst::isSrcOperand(int operandIdx) +{ + return staticInst->isSrcOperand(operandIdx); +} + +bool +GPUDynInst::isArgLoad() +{ + return staticInst->isArgLoad(); +} + +const std::string& +GPUDynInst::disassemble() const +{ + return staticInst->disassemble(); +} + +uint64_t +GPUDynInst::seqNum() const +{ + return _seqNum; +} + +Enums::OpType +GPUDynInst::opType() +{ + return staticInst->o_type; +} + +Enums::StorageClassType +GPUDynInst::executedAs() +{ + return staticInst->executed_as; +} + +// Process a memory instruction and (if necessary) submit timing request +void +GPUDynInst::initiateAcc(GPUDynInstPtr gpuDynInst) +{ + DPRINTF(GPUMem, "CU%d: WF[%d][%d]: mempacket status bitvector=%#x\n", + cu->cu_id, simdId, wfSlotId, exec_mask); + + staticInst->initiateAcc(gpuDynInst); + time = 0; +} + +bool +GPUDynInst::scalarOp() const +{ + return staticInst->scalarOp(); +} + +void +GPUDynInst::updateStats() +{ + if (staticInst->isLocalMem()) { + // access to LDS (shared) memory + cu->dynamicLMemInstrCnt++; + } else { + // access to global memory + + // update PageDivergence histogram + int number_pages_touched = cu->pagesTouched.size(); + assert(number_pages_touched); + cu->pageDivergenceDist.sample(number_pages_touched); + + std::pair ret; + + for (auto it : cu->pagesTouched) { + // see if this page has been touched before. if not, this also + // inserts the page into the table. + ret = cu->pageAccesses + .insert(ComputeUnit::pageDataStruct::value_type(it.first, + std::make_pair(1, it.second))); + + // if yes, then update the stats + if (!ret.second) { + ret.first->second.first++; + ret.first->second.second += it.second; + } + } + + cu->pagesTouched.clear(); + + // total number of memory instructions (dynamic) + // Atomics are counted as a single memory instruction. 
+ // this is # memory instructions per wavefronts, not per workitem + cu->dynamicGMemInstrCnt++; + } +} diff --git a/src/gpu-compute/gpu_dyn_inst.hh b/src/gpu-compute/gpu_dyn_inst.hh new file mode 100644 index 000000000..e44d8f80d --- /dev/null +++ b/src/gpu-compute/gpu_dyn_inst.hh @@ -0,0 +1,464 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Anthony Gutierrez + */ + +#ifndef __GPU_DYN_INST_HH__ +#define __GPU_DYN_INST_HH__ + +#include +#include + +#include "enums/GenericMemoryOrder.hh" +#include "enums/GenericMemoryScope.hh" +#include "enums/MemOpType.hh" +#include "enums/MemType.hh" +#include "enums/OpType.hh" +#include "enums/StorageClassType.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_exec_context.hh" + +class GPUStaticInst; + +template +class AtomicOpAnd : public TypedAtomicOpFunctor +{ + public: + T a; + + AtomicOpAnd(T _a) : a(_a) { } + void execute(T *b) { *b &= a; } +}; + +template +class AtomicOpOr : public TypedAtomicOpFunctor +{ + public: + T a; + AtomicOpOr(T _a) : a(_a) { } + void execute(T *b) { *b |= a; } +}; + +template +class AtomicOpXor : public TypedAtomicOpFunctor +{ + public: + T a; + AtomicOpXor(T _a) : a(_a) {} + void execute(T *b) { *b ^= a; } +}; + +template +class AtomicOpCAS : public TypedAtomicOpFunctor +{ + public: + T c; + T s; + + ComputeUnit *computeUnit; + + AtomicOpCAS(T _c, T _s, ComputeUnit *compute_unit) + : c(_c), s(_s), computeUnit(compute_unit) { } + + void + execute(T *b) + { + computeUnit->numCASOps++; + + if (*b == c) { + *b = s; + } else { + computeUnit->numFailedCASOps++; + } + + if (computeUnit->xact_cas_mode) { + computeUnit->xactCasLoadMap.clear(); + } + } +}; + +template +class AtomicOpExch : public TypedAtomicOpFunctor +{ + public: + T a; + AtomicOpExch(T _a) : a(_a) { } + void execute(T *b) { *b = a; } +}; + +template +class AtomicOpAdd : public TypedAtomicOpFunctor +{ + public: + T a; + AtomicOpAdd(T _a) : a(_a) { } + void execute(T *b) { *b += a; } +}; + +template +class AtomicOpSub : public TypedAtomicOpFunctor +{ + public: + T a; + AtomicOpSub(T _a) : a(_a) { } + void execute(T *b) { *b -= a; } +}; + +template +class AtomicOpInc : public TypedAtomicOpFunctor +{ + public: + AtomicOpInc() { } + void execute(T *b) { *b += 1; } +}; + +template +class AtomicOpDec : public TypedAtomicOpFunctor +{ + public: + 
AtomicOpDec() {} + void execute(T *b) { *b -= 1; } +}; + +template +class AtomicOpMax : public TypedAtomicOpFunctor +{ + public: + T a; + AtomicOpMax(T _a) : a(_a) { } + + void + execute(T *b) + { + if (a > *b) + *b = a; + } +}; + +template +class AtomicOpMin : public TypedAtomicOpFunctor +{ + public: + T a; + AtomicOpMin(T _a) : a(_a) {} + + void + execute(T *b) + { + if (a < *b) + *b = a; + } +}; + +#define MO_A(a) ((a)>=Enums::MO_AAND && (a)<=Enums::MO_AMIN) +#define MO_ANR(a) ((a)>=Enums::MO_ANRAND && (a)<=Enums::MO_ANRMIN) +#define MO_H(a) ((a)>=Enums::MO_HAND && (a)<=Enums::MO_HMIN) + +typedef enum +{ + VT_32, + VT_64, +} vgpr_type; + +typedef enum +{ + SEG_PRIVATE, + SEG_SPILL, + SEG_GLOBAL, + SEG_SHARED, + SEG_READONLY, + SEG_FLAT +} seg_type; + +class GPUDynInst : public GPUExecContext +{ + public: + GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *_staticInst, + uint64_t instSeqNum); + + void execute(); + int numSrcRegOperands(); + int numDstRegOperands(); + int getNumOperands(); + bool isVectorRegister(int operandIdx); + bool isScalarRegister(int operandIdx); + int getRegisterIndex(int operandIdx); + int getOperandSize(int operandIdx); + bool isDstOperand(int operandIdx); + bool isSrcOperand(int operandIdx); + bool isArgLoad(); + + const std::string &disassemble() const; + + uint64_t seqNum() const; + + Enums::OpType opType(); + Enums::StorageClassType executedAs(); + + // The address of the memory operation + Addr addr[VSZ]; + Addr pAddr; + + // The data to get written + uint8_t d_data[VSZ * 16]; + // Additional data (for atomics) + uint8_t a_data[VSZ * 8]; + // Additional data (for atomics) + uint8_t x_data[VSZ * 8]; + // The execution mask + VectorMask exec_mask; + + // The memory type (M_U32, M_S32, ...) + Enums::MemType m_type; + // The memory operation (MO_LD, MO_ST, ...) 
+ Enums::MemOpType m_op; + Enums::GenericMemoryOrder memoryOrder; + + // Scope of the request + Enums::GenericMemoryScope scope; + // The memory segment (SEG_SHARED, SEG_GLOBAL, ...) + seg_type s_type; + // The equivalency class + int equiv; + // The return VGPR type (VT_32 or VT_64) + vgpr_type v_type; + // Number of VGPR's accessed (1, 2, or 4) + int n_reg; + // The return VGPR index + int dst_reg; + // There can be max 4 dest regs> + int dst_reg_vec[4]; + // SIMD where the WF of the memory instruction has been mapped to + int simdId; + // unique id of the WF where the memory instruction belongs to + int wfDynId; + // The kernel id of the requesting wf + int kern_id; + // The CU id of the requesting wf + int cu_id; + // HW slot id where the WF is mapped to inside a SIMD unit + int wfSlotId; + // execution pipeline id where the memory instruction has been scheduled + int pipeId; + // The execution time of this operation + Tick time; + // The latency of this operation + WaitClass latency; + // A list of bank conflicts for the 4 cycles. + uint32_t bc[4]; + + // A pointer to ROM + uint8_t *rom; + // The size of the READONLY segment + int sz_rom; + + // Initiate the specified memory operation, by creating a + // memory request and sending it off to the memory system. + void initiateAcc(GPUDynInstPtr gpuDynInst); + + void updateStats(); + + GPUStaticInst* staticInstruction() { return staticInst; } + + // Is the instruction a scalar or vector op? + bool scalarOp() const; + + /* + * Loads/stores/atomics may have acquire/release semantics associated + * withthem. Some protocols want to see the acquire/release as separate + * requests from the load/store/atomic. We implement that separation + * using continuations (i.e., a function pointer with an object associated + * with it). 
When, for example, the front-end generates a store with + * release semantics, we will first issue a normal store and set the + * continuation in the GPUDynInst to a function that generate a + * release request. That continuation will be called when the normal + * store completes (in ComputeUnit::DataPort::recvTimingResponse). The + * continuation will be called in the context of the same GPUDynInst + * that generated the initial store. + */ + std::function execContinuation; + + // when true, call execContinuation when response arrives + bool useContinuation; + + template AtomicOpFunctor* + makeAtomicOpFunctor(c0 *reg0, c0 *reg1, Enums::MemOpType op) + { + using namespace Enums; + + switch(op) { + case MO_AAND: + case MO_ANRAND: + return new AtomicOpAnd(*reg0); + case MO_AOR: + case MO_ANROR: + return new AtomicOpOr(*reg0); + case MO_AXOR: + case MO_ANRXOR: + return new AtomicOpXor(*reg0); + case MO_ACAS: + case MO_ANRCAS: + return new AtomicOpCAS(*reg0, *reg1, cu); + case MO_AEXCH: + case MO_ANREXCH: + return new AtomicOpExch(*reg0); + case MO_AADD: + case MO_ANRADD: + return new AtomicOpAdd(*reg0); + case MO_ASUB: + case MO_ANRSUB: + return new AtomicOpSub(*reg0); + case MO_AINC: + case MO_ANRINC: + return new AtomicOpInc(); + case MO_ADEC: + case MO_ANRDEC: + return new AtomicOpDec(); + case MO_AMAX: + case MO_ANRMAX: + return new AtomicOpMax(*reg0); + case MO_AMIN: + case MO_ANRMIN: + return new AtomicOpMin(*reg0); + default: + panic("Unrecognized atomic operation"); + } + } + + void + setRequestFlags(Request *req, bool setMemOrder=true) + { + // currently these are the easy scopes to deduce + switch (s_type) { + case SEG_PRIVATE: + req->setMemSpaceConfigFlags(Request::PRIVATE_SEGMENT); + break; + case SEG_SPILL: + req->setMemSpaceConfigFlags(Request::SPILL_SEGMENT); + break; + case SEG_GLOBAL: + req->setMemSpaceConfigFlags(Request::GLOBAL_SEGMENT); + break; + case SEG_READONLY: + req->setMemSpaceConfigFlags(Request::READONLY_SEGMENT); + break; + case 
SEG_SHARED: + req->setMemSpaceConfigFlags(Request::GROUP_SEGMENT); + break; + case SEG_FLAT: + // TODO: translate to correct scope + assert(false); + default: + panic("Bad segment type"); + break; + } + + switch (scope) { + case Enums::MEMORY_SCOPE_NONE: + case Enums::MEMORY_SCOPE_WORKITEM: + break; + case Enums::MEMORY_SCOPE_WAVEFRONT: + req->setMemSpaceConfigFlags(Request::SCOPE_VALID | + Request::WAVEFRONT_SCOPE); + break; + case Enums::MEMORY_SCOPE_WORKGROUP: + req->setMemSpaceConfigFlags(Request::SCOPE_VALID | + Request::WORKGROUP_SCOPE); + break; + case Enums::MEMORY_SCOPE_DEVICE: + req->setMemSpaceConfigFlags(Request::SCOPE_VALID | + Request::DEVICE_SCOPE); + break; + case Enums::MEMORY_SCOPE_SYSTEM: + req->setMemSpaceConfigFlags(Request::SCOPE_VALID | + Request::SYSTEM_SCOPE); + break; + default: + panic("Bad scope type"); + break; + } + + if (setMemOrder) { + // set acquire and release flags + switch (memoryOrder){ + case Enums::MEMORY_ORDER_SC_ACQUIRE: + req->setFlags(Request::ACQUIRE); + break; + case Enums::MEMORY_ORDER_SC_RELEASE: + req->setFlags(Request::RELEASE); + break; + case Enums::MEMORY_ORDER_SC_ACQUIRE_RELEASE: + req->setFlags(Request::ACQUIRE | Request::RELEASE); + break; + default: + break; + } + } + + // set atomic type + // currently, the instruction genenerator only produces atomic return + // but a magic instruction can produce atomic no return + if (m_op == Enums::MO_AADD || m_op == Enums::MO_ASUB || + m_op == Enums::MO_AAND || m_op == Enums::MO_AOR || + m_op == Enums::MO_AXOR || m_op == Enums::MO_AMAX || + m_op == Enums::MO_AMIN || m_op == Enums::MO_AINC || + m_op == Enums::MO_ADEC || m_op == Enums::MO_AEXCH || + m_op == Enums::MO_ACAS) { + req->setFlags(Request::ATOMIC_RETURN_OP); + } else if (m_op == Enums::MO_ANRADD || m_op == Enums::MO_ANRSUB || + m_op == Enums::MO_ANRAND || m_op == Enums::MO_ANROR || + m_op == Enums::MO_ANRXOR || m_op == Enums::MO_ANRMAX || + m_op == Enums::MO_ANRMIN || m_op == Enums::MO_ANRINC || + m_op == 
Enums::MO_ANRDEC || m_op == Enums::MO_ANREXCH || + m_op == Enums::MO_ANRCAS) { + req->setFlags(Request::ATOMIC_NO_RETURN_OP); + } + } + + // Map returned packets and the addresses they satisfy with which lane they + // were requested from + typedef std::unordered_map> StatusVector; + StatusVector memStatusVector; + + // Track the status of memory requests per lane, a bit per lane + VectorMask statusBitVector; + // for ld_v# or st_v# + std::vector statusVector; + std::vector tlbHitLevel; + + private: + GPUStaticInst *staticInst; + uint64_t _seqNum; +}; + +#endif // __GPU_DYN_INST_HH__ diff --git a/src/gpu-compute/gpu_exec_context.cc b/src/gpu-compute/gpu_exec_context.cc new file mode 100644 index 000000000..4af69c41e --- /dev/null +++ b/src/gpu-compute/gpu_exec_context.cc @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#include "gpu-compute/gpu_exec_context.hh" + +GPUExecContext::GPUExecContext(ComputeUnit *_cu, Wavefront *_wf) + : cu(_cu), wf(_wf) +{ +} + +ComputeUnit* +GPUExecContext::computeUnit() +{ + return cu; +} + +Wavefront* +GPUExecContext::wavefront() +{ + return wf; +} diff --git a/src/gpu-compute/gpu_exec_context.hh b/src/gpu-compute/gpu_exec_context.hh new file mode 100644 index 000000000..a3deb9b8f --- /dev/null +++ b/src/gpu-compute/gpu_exec_context.hh @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#ifndef __GPU_EXEC_CONTEXT_HH__ +#define __GPU_EXEC_CONTEXT_HH__ + +class ComputeUnit; +class Wavefront; + +class GPUExecContext +{ + public: + GPUExecContext(ComputeUnit *_cu, Wavefront *_wf); + Wavefront* wavefront(); + ComputeUnit* computeUnit(); + + protected: + ComputeUnit *cu; + Wavefront *wf; +}; + +#endif // __GPU_EXEC_CONTEXT_HH__ diff --git a/src/gpu-compute/gpu_static_inst.cc b/src/gpu-compute/gpu_static_inst.cc new file mode 100644 index 000000000..bcb8a5f3d --- /dev/null +++ b/src/gpu-compute/gpu_static_inst.cc @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#include "gpu-compute/gpu_static_inst.hh" + +GPUStaticInst::GPUStaticInst(const std::string &opcode) + : o_type(Enums::OT_ALU), executed_as(Enums::SC_NONE), opcode(opcode), + _instNum(0), _scalarOp(false) +{ +} diff --git a/src/gpu-compute/gpu_static_inst.hh b/src/gpu-compute/gpu_static_inst.hh new file mode 100644 index 000000000..c1de28427 --- /dev/null +++ b/src/gpu-compute/gpu_static_inst.hh @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#ifndef __GPU_STATIC_INST_HH__ +#define __GPU_STATIC_INST_HH__ + +/* + * @file gpu_static_inst.hh + * + * Defines the base class representing static instructions for the GPU. The + * instructions are "static" because they contain no dynamic instruction + * information. GPUStaticInst corresponds to the StaticInst class for the CPU + * models. 
+ */ + +#include +#include + +#include "enums/OpType.hh" +#include "enums/StorageClassType.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/misc.hh" + +class BaseOperand; +class BaseRegOperand; +class Wavefront; + +class GPUStaticInst +{ + public: + GPUStaticInst(const std::string &opcode); + + void instNum(int num) { _instNum = num; } + + int instNum() { return _instNum; } + + void ipdInstNum(int num) { _ipdInstNum = num; } + + int ipdInstNum() const { return _ipdInstNum; } + + virtual void execute(GPUDynInstPtr gpuDynInst) = 0; + virtual void generateDisassembly() = 0; + virtual const std::string &disassemble() = 0; + virtual int getNumOperands() = 0; + virtual bool isCondRegister(int operandIndex) = 0; + virtual bool isScalarRegister(int operandIndex) = 0; + virtual bool isVectorRegister(int operandIndex) = 0; + virtual bool isSrcOperand(int operandIndex) = 0; + virtual bool isDstOperand(int operandIndex) = 0; + virtual int getOperandSize(int operandIndex) = 0; + virtual int getRegisterIndex(int operandIndex) = 0; + virtual int numDstRegOperands() = 0; + virtual int numSrcRegOperands() = 0; + + /* + * Most instructions (including all HSAIL instructions) + * are vector ops, so _scalarOp will be false by default. + * Derived instruction objects that are scalar ops must + * set _scalarOp to true in their constructors. + */ + bool scalarOp() const { return _scalarOp; } + + virtual bool isLocalMem() const + { + fatal("calling isLocalMem() on non-memory instruction.\n"); + + return false; + } + + bool isArgLoad() { return false; } + virtual uint32_t instSize() = 0; + + // only used for memory instructions + virtual void + initiateAcc(GPUDynInstPtr gpuDynInst) + { + fatal("calling initiateAcc() on a non-memory instruction.\n"); + } + + virtual uint32_t getTargetPc() { return 0; } + + /** + * Query whether the instruction is an unconditional jump i.e., the jump + * is always executed because there is no condition to be evaluated. 
+ * + * If the instruction is not of branch type, the result is always false. + * + * @return True if the instruction is an unconditional jump. + */ + virtual bool unconditionalJumpInstruction() { return false; } + + static uint64_t dynamic_id_count; + + Enums::OpType o_type; + // For flat memory accesses + Enums::StorageClassType executed_as; + + protected: + virtual void + execLdAcq(GPUDynInstPtr gpuDynInst) + { + fatal("calling execLdAcq() on a non-load instruction.\n"); + } + + virtual void + execSt(GPUDynInstPtr gpuDynInst) + { + fatal("calling execLdAcq() on a non-load instruction.\n"); + } + + virtual void + execAtomic(GPUDynInstPtr gpuDynInst) + { + fatal("calling execAtomic() on a non-atomic instruction.\n"); + } + + virtual void + execAtomicAcq(GPUDynInstPtr gpuDynInst) + { + fatal("calling execAtomicAcq() on a non-atomic instruction.\n"); + } + + const std::string opcode; + std::string disassembly; + int _instNum; + /** + * Identifier of the immediate post-dominator instruction. + */ + int _ipdInstNum; + + bool _scalarOp; +}; + +#endif // __GPU_STATIC_INST_HH__ diff --git a/src/gpu-compute/gpu_tlb.cc b/src/gpu-compute/gpu_tlb.cc new file mode 100644 index 000000000..de005fd04 --- /dev/null +++ b/src/gpu-compute/gpu_tlb.cc @@ -0,0 +1,1801 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Lisa Hsu + */ + +#include "gpu-compute/gpu_tlb.hh" + +#include +#include + +#include "arch/x86/faults.hh" +#include "arch/x86/insts/microldstop.hh" +#include "arch/x86/pagetable.hh" +#include "arch/x86/pagetable_walker.hh" +#include "arch/x86/regs/misc.hh" +#include "arch/x86/x86_traits.hh" +#include "base/bitfield.hh" +#include "base/output.hh" +#include "base/trace.hh" +#include "cpu/base.hh" +#include "cpu/thread_context.hh" +#include "debug/GPUPrefetch.hh" +#include "debug/GPUTLB.hh" +#include "mem/packet_access.hh" +#include "mem/page_table.hh" +#include "mem/request.hh" +#include "sim/process.hh" + +namespace X86ISA +{ + + GpuTLB::GpuTLB(const Params *p) + : MemObject(p), configAddress(0), size(p->size), + cleanupEvent(this, false, Event::Maximum_Pri), exitEvent(this) + { + assoc = p->assoc; + assert(assoc <= size); + numSets = size/assoc; + allocationPolicy = p->allocationPolicy; + hasMemSidePort = false; + accessDistance = p->accessDistance; + clock = 
p->clk_domain->clockPeriod(); + + tlb = new GpuTlbEntry[size]; + std::memset(tlb, 0, sizeof(GpuTlbEntry) * size); + + freeList.resize(numSets); + entryList.resize(numSets); + + for (int set = 0; set < numSets; ++set) { + for (int way = 0; way < assoc; ++way) { + int x = set*assoc + way; + freeList[set].push_back(&tlb[x]); + } + } + + FA = (size == assoc); + + /** + * @warning: the set-associative version assumes you have a + * fixed page size of 4KB. + * If the page size is greather than 4KB (as defined in the + * TheISA::PageBytes), then there are various issues w/ the current + * implementation (you'd have the same 8KB page being replicated in + * different sets etc) + */ + setMask = numSets - 1; + + #if 0 + // GpuTLB doesn't yet support full system + walker = p->walker; + walker->setTLB(this); + #endif + + maxCoalescedReqs = p->maxOutstandingReqs; + + // Do not allow maxCoalescedReqs to be more than the TLB associativity + if (maxCoalescedReqs > assoc) { + maxCoalescedReqs = assoc; + cprintf("Forcing maxCoalescedReqs to %d (TLB assoc.) \n", assoc); + } + + outstandingReqs = 0; + hitLatency = p->hitLatency; + missLatency1 = p->missLatency1; + missLatency2 = p->missLatency2; + + // create the slave ports based on the number of connected ports + for (size_t i = 0; i < p->port_slave_connection_count; ++i) { + cpuSidePort.push_back(new CpuSidePort(csprintf("%s-port%d", + name(), i), this, i)); + } + + // create the master ports based on the number of connected ports + for (size_t i = 0; i < p->port_master_connection_count; ++i) { + memSidePort.push_back(new MemSidePort(csprintf("%s-port%d", + name(), i), this, i)); + } + } + + // fixme: this is never called? 
+ GpuTLB::~GpuTLB() + { + // make sure all the hash-maps are empty + assert(translationReturnEvent.empty()); + + // delete the TLB + delete[] tlb; + } + + BaseSlavePort& + GpuTLB::getSlavePort(const std::string &if_name, PortID idx) + { + if (if_name == "slave") { + if (idx >= static_cast(cpuSidePort.size())) { + panic("TLBCoalescer::getSlavePort: unknown index %d\n", idx); + } + + return *cpuSidePort[idx]; + } else { + panic("TLBCoalescer::getSlavePort: unknown port %s\n", if_name); + } + } + + BaseMasterPort& + GpuTLB::getMasterPort(const std::string &if_name, PortID idx) + { + if (if_name == "master") { + if (idx >= static_cast(memSidePort.size())) { + panic("TLBCoalescer::getMasterPort: unknown index %d\n", idx); + } + + hasMemSidePort = true; + + return *memSidePort[idx]; + } else { + panic("TLBCoalescer::getMasterPort: unknown port %s\n", if_name); + } + } + + GpuTlbEntry* + GpuTLB::insert(Addr vpn, GpuTlbEntry &entry) + { + GpuTlbEntry *newEntry = nullptr; + + /** + * vpn holds the virtual page address + * The least significant bits are simply masked + */ + int set = (vpn >> TheISA::PageShift) & setMask; + + if (!freeList[set].empty()) { + newEntry = freeList[set].front(); + freeList[set].pop_front(); + } else { + newEntry = entryList[set].back(); + entryList[set].pop_back(); + } + + *newEntry = entry; + newEntry->vaddr = vpn; + entryList[set].push_front(newEntry); + + return newEntry; + } + + GpuTLB::EntryList::iterator + GpuTLB::lookupIt(Addr va, bool update_lru) + { + int set = (va >> TheISA::PageShift) & setMask; + + if (FA) { + assert(!set); + } + + auto entry = entryList[set].begin(); + for (; entry != entryList[set].end(); ++entry) { + int page_size = (*entry)->size(); + + if ((*entry)->vaddr <= va && (*entry)->vaddr + page_size > va) { + DPRINTF(GPUTLB, "Matched vaddr %#x to entry starting at %#x " + "with size %#x.\n", va, (*entry)->vaddr, page_size); + + if (update_lru) { + entryList[set].push_front(*entry); + entryList[set].erase(entry); + entry = 
entryList[set].begin(); + } + + break; + } + } + + return entry; + } + + GpuTlbEntry* + GpuTLB::lookup(Addr va, bool update_lru) + { + int set = (va >> TheISA::PageShift) & setMask; + + auto entry = lookupIt(va, update_lru); + + if (entry == entryList[set].end()) + return nullptr; + else + return *entry; + } + + void + GpuTLB::invalidateAll() + { + DPRINTF(GPUTLB, "Invalidating all entries.\n"); + + for (int i = 0; i < numSets; ++i) { + while (!entryList[i].empty()) { + GpuTlbEntry *entry = entryList[i].front(); + entryList[i].pop_front(); + freeList[i].push_back(entry); + } + } + } + + void + GpuTLB::setConfigAddress(uint32_t addr) + { + configAddress = addr; + } + + void + GpuTLB::invalidateNonGlobal() + { + DPRINTF(GPUTLB, "Invalidating all non global entries.\n"); + + for (int i = 0; i < numSets; ++i) { + for (auto entryIt = entryList[i].begin(); + entryIt != entryList[i].end();) { + if (!(*entryIt)->global) { + freeList[i].push_back(*entryIt); + entryList[i].erase(entryIt++); + } else { + ++entryIt; + } + } + } + } + + void + GpuTLB::demapPage(Addr va, uint64_t asn) + { + + int set = (va >> TheISA::PageShift) & setMask; + auto entry = lookupIt(va, false); + + if (entry != entryList[set].end()) { + freeList[set].push_back(*entry); + entryList[set].erase(entry); + } + } + + Fault + GpuTLB::translateInt(RequestPtr req, ThreadContext *tc) + { + DPRINTF(GPUTLB, "Addresses references internal memory.\n"); + Addr vaddr = req->getVaddr(); + Addr prefix = (vaddr >> 3) & IntAddrPrefixMask; + + if (prefix == IntAddrPrefixCPUID) { + panic("CPUID memory space not yet implemented!\n"); + } else if (prefix == IntAddrPrefixMSR) { + vaddr = vaddr >> 3; + req->setFlags(Request::MMAPPED_IPR); + Addr regNum = 0; + + switch (vaddr & ~IntAddrPrefixMask) { + case 0x10: + regNum = MISCREG_TSC; + break; + case 0x1B: + regNum = MISCREG_APIC_BASE; + break; + case 0xFE: + regNum = MISCREG_MTRRCAP; + break; + case 0x174: + regNum = MISCREG_SYSENTER_CS; + break; + case 0x175: + regNum = 
MISCREG_SYSENTER_ESP; + break; + case 0x176: + regNum = MISCREG_SYSENTER_EIP; + break; + case 0x179: + regNum = MISCREG_MCG_CAP; + break; + case 0x17A: + regNum = MISCREG_MCG_STATUS; + break; + case 0x17B: + regNum = MISCREG_MCG_CTL; + break; + case 0x1D9: + regNum = MISCREG_DEBUG_CTL_MSR; + break; + case 0x1DB: + regNum = MISCREG_LAST_BRANCH_FROM_IP; + break; + case 0x1DC: + regNum = MISCREG_LAST_BRANCH_TO_IP; + break; + case 0x1DD: + regNum = MISCREG_LAST_EXCEPTION_FROM_IP; + break; + case 0x1DE: + regNum = MISCREG_LAST_EXCEPTION_TO_IP; + break; + case 0x200: + regNum = MISCREG_MTRR_PHYS_BASE_0; + break; + case 0x201: + regNum = MISCREG_MTRR_PHYS_MASK_0; + break; + case 0x202: + regNum = MISCREG_MTRR_PHYS_BASE_1; + break; + case 0x203: + regNum = MISCREG_MTRR_PHYS_MASK_1; + break; + case 0x204: + regNum = MISCREG_MTRR_PHYS_BASE_2; + break; + case 0x205: + regNum = MISCREG_MTRR_PHYS_MASK_2; + break; + case 0x206: + regNum = MISCREG_MTRR_PHYS_BASE_3; + break; + case 0x207: + regNum = MISCREG_MTRR_PHYS_MASK_3; + break; + case 0x208: + regNum = MISCREG_MTRR_PHYS_BASE_4; + break; + case 0x209: + regNum = MISCREG_MTRR_PHYS_MASK_4; + break; + case 0x20A: + regNum = MISCREG_MTRR_PHYS_BASE_5; + break; + case 0x20B: + regNum = MISCREG_MTRR_PHYS_MASK_5; + break; + case 0x20C: + regNum = MISCREG_MTRR_PHYS_BASE_6; + break; + case 0x20D: + regNum = MISCREG_MTRR_PHYS_MASK_6; + break; + case 0x20E: + regNum = MISCREG_MTRR_PHYS_BASE_7; + break; + case 0x20F: + regNum = MISCREG_MTRR_PHYS_MASK_7; + break; + case 0x250: + regNum = MISCREG_MTRR_FIX_64K_00000; + break; + case 0x258: + regNum = MISCREG_MTRR_FIX_16K_80000; + break; + case 0x259: + regNum = MISCREG_MTRR_FIX_16K_A0000; + break; + case 0x268: + regNum = MISCREG_MTRR_FIX_4K_C0000; + break; + case 0x269: + regNum = MISCREG_MTRR_FIX_4K_C8000; + break; + case 0x26A: + regNum = MISCREG_MTRR_FIX_4K_D0000; + break; + case 0x26B: + regNum = MISCREG_MTRR_FIX_4K_D8000; + break; + case 0x26C: + regNum = MISCREG_MTRR_FIX_4K_E0000; + 
break; + case 0x26D: + regNum = MISCREG_MTRR_FIX_4K_E8000; + break; + case 0x26E: + regNum = MISCREG_MTRR_FIX_4K_F0000; + break; + case 0x26F: + regNum = MISCREG_MTRR_FIX_4K_F8000; + break; + case 0x277: + regNum = MISCREG_PAT; + break; + case 0x2FF: + regNum = MISCREG_DEF_TYPE; + break; + case 0x400: + regNum = MISCREG_MC0_CTL; + break; + case 0x404: + regNum = MISCREG_MC1_CTL; + break; + case 0x408: + regNum = MISCREG_MC2_CTL; + break; + case 0x40C: + regNum = MISCREG_MC3_CTL; + break; + case 0x410: + regNum = MISCREG_MC4_CTL; + break; + case 0x414: + regNum = MISCREG_MC5_CTL; + break; + case 0x418: + regNum = MISCREG_MC6_CTL; + break; + case 0x41C: + regNum = MISCREG_MC7_CTL; + break; + case 0x401: + regNum = MISCREG_MC0_STATUS; + break; + case 0x405: + regNum = MISCREG_MC1_STATUS; + break; + case 0x409: + regNum = MISCREG_MC2_STATUS; + break; + case 0x40D: + regNum = MISCREG_MC3_STATUS; + break; + case 0x411: + regNum = MISCREG_MC4_STATUS; + break; + case 0x415: + regNum = MISCREG_MC5_STATUS; + break; + case 0x419: + regNum = MISCREG_MC6_STATUS; + break; + case 0x41D: + regNum = MISCREG_MC7_STATUS; + break; + case 0x402: + regNum = MISCREG_MC0_ADDR; + break; + case 0x406: + regNum = MISCREG_MC1_ADDR; + break; + case 0x40A: + regNum = MISCREG_MC2_ADDR; + break; + case 0x40E: + regNum = MISCREG_MC3_ADDR; + break; + case 0x412: + regNum = MISCREG_MC4_ADDR; + break; + case 0x416: + regNum = MISCREG_MC5_ADDR; + break; + case 0x41A: + regNum = MISCREG_MC6_ADDR; + break; + case 0x41E: + regNum = MISCREG_MC7_ADDR; + break; + case 0x403: + regNum = MISCREG_MC0_MISC; + break; + case 0x407: + regNum = MISCREG_MC1_MISC; + break; + case 0x40B: + regNum = MISCREG_MC2_MISC; + break; + case 0x40F: + regNum = MISCREG_MC3_MISC; + break; + case 0x413: + regNum = MISCREG_MC4_MISC; + break; + case 0x417: + regNum = MISCREG_MC5_MISC; + break; + case 0x41B: + regNum = MISCREG_MC6_MISC; + break; + case 0x41F: + regNum = MISCREG_MC7_MISC; + break; + case 0xC0000080: + regNum = 
MISCREG_EFER; + break; + case 0xC0000081: + regNum = MISCREG_STAR; + break; + case 0xC0000082: + regNum = MISCREG_LSTAR; + break; + case 0xC0000083: + regNum = MISCREG_CSTAR; + break; + case 0xC0000084: + regNum = MISCREG_SF_MASK; + break; + case 0xC0000100: + regNum = MISCREG_FS_BASE; + break; + case 0xC0000101: + regNum = MISCREG_GS_BASE; + break; + case 0xC0000102: + regNum = MISCREG_KERNEL_GS_BASE; + break; + case 0xC0000103: + regNum = MISCREG_TSC_AUX; + break; + case 0xC0010000: + regNum = MISCREG_PERF_EVT_SEL0; + break; + case 0xC0010001: + regNum = MISCREG_PERF_EVT_SEL1; + break; + case 0xC0010002: + regNum = MISCREG_PERF_EVT_SEL2; + break; + case 0xC0010003: + regNum = MISCREG_PERF_EVT_SEL3; + break; + case 0xC0010004: + regNum = MISCREG_PERF_EVT_CTR0; + break; + case 0xC0010005: + regNum = MISCREG_PERF_EVT_CTR1; + break; + case 0xC0010006: + regNum = MISCREG_PERF_EVT_CTR2; + break; + case 0xC0010007: + regNum = MISCREG_PERF_EVT_CTR3; + break; + case 0xC0010010: + regNum = MISCREG_SYSCFG; + break; + case 0xC0010016: + regNum = MISCREG_IORR_BASE0; + break; + case 0xC0010017: + regNum = MISCREG_IORR_BASE1; + break; + case 0xC0010018: + regNum = MISCREG_IORR_MASK0; + break; + case 0xC0010019: + regNum = MISCREG_IORR_MASK1; + break; + case 0xC001001A: + regNum = MISCREG_TOP_MEM; + break; + case 0xC001001D: + regNum = MISCREG_TOP_MEM2; + break; + case 0xC0010114: + regNum = MISCREG_VM_CR; + break; + case 0xC0010115: + regNum = MISCREG_IGNNE; + break; + case 0xC0010116: + regNum = MISCREG_SMM_CTL; + break; + case 0xC0010117: + regNum = MISCREG_VM_HSAVE_PA; + break; + default: + return std::make_shared(0); + } + //The index is multiplied by the size of a MiscReg so that + //any memory dependence calculations will not see these as + //overlapping. + req->setPaddr(regNum * sizeof(MiscReg)); + return NoFault; + } else if (prefix == IntAddrPrefixIO) { + // TODO If CPL > IOPL or in virtual mode, check the I/O permission + // bitmap in the TSS. 
+ + Addr IOPort = vaddr & ~IntAddrPrefixMask; + // Make sure the address fits in the expected 16 bit IO address + // space. + assert(!(IOPort & ~0xFFFF)); + + if (IOPort == 0xCF8 && req->getSize() == 4) { + req->setFlags(Request::MMAPPED_IPR); + req->setPaddr(MISCREG_PCI_CONFIG_ADDRESS * sizeof(MiscReg)); + } else if ((IOPort & ~mask(2)) == 0xCFC) { + req->setFlags(Request::UNCACHEABLE); + + Addr configAddress = + tc->readMiscRegNoEffect(MISCREG_PCI_CONFIG_ADDRESS); + + if (bits(configAddress, 31, 31)) { + req->setPaddr(PhysAddrPrefixPciConfig | + mbits(configAddress, 30, 2) | + (IOPort & mask(2))); + } else { + req->setPaddr(PhysAddrPrefixIO | IOPort); + } + } else { + req->setFlags(Request::UNCACHEABLE); + req->setPaddr(PhysAddrPrefixIO | IOPort); + } + return NoFault; + } else { + panic("Access to unrecognized internal address space %#x.\n", + prefix); + } + } + + /** + * TLB_lookup will only perform a TLB lookup returning true on a TLB hit + * and false on a TLB miss. + * Many of the checks about different modes have been converted to + * assertions, since these parts of the code are not really used. + * On a hit it will update the LRU stack. + */ + bool + GpuTLB::tlbLookup(RequestPtr req, ThreadContext *tc, bool update_stats) + { + bool tlb_hit = false; + #ifndef NDEBUG + uint32_t flags = req->getFlags(); + int seg = flags & SegmentFlagMask; + #endif + + assert(seg != SEGMENT_REG_MS); + Addr vaddr = req->getVaddr(); + DPRINTF(GPUTLB, "TLB Lookup for vaddr %#x.\n", vaddr); + HandyM5Reg m5Reg = tc->readMiscRegNoEffect(MISCREG_M5_REG); + + if (m5Reg.prot) { + DPRINTF(GPUTLB, "In protected mode.\n"); + // make sure we are in 64-bit mode + assert(m5Reg.mode == LongMode); + + // If paging is enabled, do the translation. 
+ if (m5Reg.paging) { + DPRINTF(GPUTLB, "Paging enabled.\n"); + //update LRU stack on a hit + GpuTlbEntry *entry = lookup(vaddr, true); + + if (entry) + tlb_hit = true; + + if (!update_stats) { + // functional tlb access for memory initialization + // i.e., memory seeding or instr. seeding -> don't update + // TLB and stats + return tlb_hit; + } + + localNumTLBAccesses++; + + if (!entry) { + localNumTLBMisses++; + } else { + localNumTLBHits++; + } + } + } + + return tlb_hit; + } + + Fault + GpuTLB::translate(RequestPtr req, ThreadContext *tc, + Translation *translation, Mode mode, + bool &delayedResponse, bool timing, int &latency) + { + uint32_t flags = req->getFlags(); + int seg = flags & SegmentFlagMask; + bool storeCheck = flags & (StoreCheck << FlagShift); + + // If this is true, we're dealing with a request + // to a non-memory address space. + if (seg == SEGMENT_REG_MS) { + return translateInt(req, tc); + } + + delayedResponse = false; + Addr vaddr = req->getVaddr(); + DPRINTF(GPUTLB, "Translating vaddr %#x.\n", vaddr); + + HandyM5Reg m5Reg = tc->readMiscRegNoEffect(MISCREG_M5_REG); + + // If protected mode has been enabled... + if (m5Reg.prot) { + DPRINTF(GPUTLB, "In protected mode.\n"); + // If we're not in 64-bit mode, do protection/limit checks + if (m5Reg.mode != LongMode) { + DPRINTF(GPUTLB, "Not in long mode. Checking segment " + "protection.\n"); + + // Check for a null segment selector. 
+ if (!(seg == SEGMENT_REG_TSG || seg == SYS_SEGMENT_REG_IDTR || + seg == SEGMENT_REG_HS || seg == SEGMENT_REG_LS) + && !tc->readMiscRegNoEffect(MISCREG_SEG_SEL(seg))) { + return std::make_shared(0); + } + + bool expandDown = false; + SegAttr attr = tc->readMiscRegNoEffect(MISCREG_SEG_ATTR(seg)); + + if (seg >= SEGMENT_REG_ES && seg <= SEGMENT_REG_HS) { + if (!attr.writable && (mode == BaseTLB::Write || + storeCheck)) + return std::make_shared(0); + + if (!attr.readable && mode == BaseTLB::Read) + return std::make_shared(0); + + expandDown = attr.expandDown; + + } + + Addr base = tc->readMiscRegNoEffect(MISCREG_SEG_BASE(seg)); + Addr limit = tc->readMiscRegNoEffect(MISCREG_SEG_LIMIT(seg)); + // This assumes we're not in 64 bit mode. If we were, the + // default address size is 64 bits, overridable to 32. + int size = 32; + bool sizeOverride = (flags & (AddrSizeFlagBit << FlagShift)); + SegAttr csAttr = tc->readMiscRegNoEffect(MISCREG_CS_ATTR); + + if ((csAttr.defaultSize && sizeOverride) || + (!csAttr.defaultSize && !sizeOverride)) { + size = 16; + } + + Addr offset = bits(vaddr - base, size - 1, 0); + Addr endOffset = offset + req->getSize() - 1; + + if (expandDown) { + DPRINTF(GPUTLB, "Checking an expand down segment.\n"); + warn_once("Expand down segments are untested.\n"); + + if (offset <= limit || endOffset <= limit) + return std::make_shared(0); + } else { + if (offset > limit || endOffset > limit) + return std::make_shared(0); + } + } + + // If paging is enabled, do the translation. + if (m5Reg.paging) { + DPRINTF(GPUTLB, "Paging enabled.\n"); + // The vaddr already has the segment base applied. 
+ GpuTlbEntry *entry = lookup(vaddr); + localNumTLBAccesses++; + + if (!entry) { + localNumTLBMisses++; + if (timing) { + latency = missLatency1; + } + + if (FullSystem) { + fatal("GpuTLB doesn't support full-system mode\n"); + } else { + DPRINTF(GPUTLB, "Handling a TLB miss for address %#x " + "at pc %#x.\n", vaddr, tc->instAddr()); + + Process *p = tc->getProcessPtr(); + GpuTlbEntry newEntry; + bool success = p->pTable->lookup(vaddr, newEntry); + + if (!success && mode != BaseTLB::Execute) { + // penalize a "page fault" more + if (timing) { + latency += missLatency2; + } + + if (p->fixupStackFault(vaddr)) + success = p->pTable->lookup(vaddr, newEntry); + } + + if (!success) { + return std::make_shared(vaddr, true, + mode, true, + false); + } else { + newEntry.valid = success; + Addr alignedVaddr = p->pTable->pageAlign(vaddr); + + DPRINTF(GPUTLB, "Mapping %#x to %#x\n", + alignedVaddr, newEntry.pageStart()); + + entry = insert(alignedVaddr, newEntry); + } + + DPRINTF(GPUTLB, "Miss was serviced.\n"); + } + } else { + localNumTLBHits++; + + if (timing) { + latency = hitLatency; + } + } + + // Do paging protection checks. + bool inUser = (m5Reg.cpl == 3 && + !(flags & (CPL0FlagBit << FlagShift))); + + CR0 cr0 = tc->readMiscRegNoEffect(MISCREG_CR0); + bool badWrite = (!entry->writable && (inUser || cr0.wp)); + + if ((inUser && !entry->user) || (mode == BaseTLB::Write && + badWrite)) { + // The page must have been present to get into the TLB in + // the first place. We'll assume the reserved bits are + // fine even though we're not checking them. + return std::make_shared(vaddr, true, mode, + inUser, false); + } + + if (storeCheck && badWrite) { + // This would fault if this were a write, so return a page + // fault that reflects that happening. 
+ return std::make_shared(vaddr, true, + BaseTLB::Write, + inUser, false); + } + + + DPRINTF(GPUTLB, "Entry found with paddr %#x, doing protection " + "checks.\n", entry->paddr); + + int page_size = entry->size(); + Addr paddr = entry->paddr | (vaddr & (page_size - 1)); + DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, paddr); + req->setPaddr(paddr); + + if (entry->uncacheable) + req->setFlags(Request::UNCACHEABLE); + } else { + //Use the address which already has segmentation applied. + DPRINTF(GPUTLB, "Paging disabled.\n"); + DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, vaddr); + req->setPaddr(vaddr); + } + } else { + // Real mode + DPRINTF(GPUTLB, "In real mode.\n"); + DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, vaddr); + req->setPaddr(vaddr); + } + + // Check for an access to the local APIC + if (FullSystem) { + LocalApicBase localApicBase = + tc->readMiscRegNoEffect(MISCREG_APIC_BASE); + + Addr baseAddr = localApicBase.base * PageBytes; + Addr paddr = req->getPaddr(); + + if (baseAddr <= paddr && baseAddr + PageBytes > paddr) { + // Force the access to be uncacheable. 
+ req->setFlags(Request::UNCACHEABLE); + req->setPaddr(x86LocalAPICAddress(tc->contextId(), + paddr - baseAddr)); + } + } + + return NoFault; + }; + + Fault + GpuTLB::translateAtomic(RequestPtr req, ThreadContext *tc, Mode mode, + int &latency) + { + bool delayedResponse; + + return GpuTLB::translate(req, tc, nullptr, mode, delayedResponse, false, + latency); + } + + void + GpuTLB::translateTiming(RequestPtr req, ThreadContext *tc, + Translation *translation, Mode mode, int &latency) + { + bool delayedResponse; + assert(translation); + + Fault fault = GpuTLB::translate(req, tc, translation, mode, + delayedResponse, true, latency); + + if (!delayedResponse) + translation->finish(fault, req, tc, mode); + } + + Walker* + GpuTLB::getWalker() + { + return walker; + } + + + void + GpuTLB::serialize(CheckpointOut &cp) const + { + } + + void + GpuTLB::unserialize(CheckpointIn &cp) + { + } + + void + GpuTLB::regStats() + { + localNumTLBAccesses + .name(name() + ".local_TLB_accesses") + .desc("Number of TLB accesses") + ; + + localNumTLBHits + .name(name() + ".local_TLB_hits") + .desc("Number of TLB hits") + ; + + localNumTLBMisses + .name(name() + ".local_TLB_misses") + .desc("Number of TLB misses") + ; + + localTLBMissRate + .name(name() + ".local_TLB_miss_rate") + .desc("TLB miss rate") + ; + + accessCycles + .name(name() + ".access_cycles") + .desc("Cycles spent accessing this TLB level") + ; + + pageTableCycles + .name(name() + ".page_table_cycles") + .desc("Cycles spent accessing the page table") + ; + + localTLBMissRate = 100 * localNumTLBMisses / localNumTLBAccesses; + + numUniquePages + .name(name() + ".unique_pages") + .desc("Number of unique pages touched") + ; + + localCycles + .name(name() + ".local_cycles") + .desc("Number of cycles spent in queue for all incoming reqs") + ; + + localLatency + .name(name() + ".local_latency") + .desc("Avg. 
latency over incoming coalesced reqs") + ; + + localLatency = localCycles / localNumTLBAccesses; + + globalNumTLBAccesses + .name(name() + ".global_TLB_accesses") + .desc("Number of TLB accesses") + ; + + globalNumTLBHits + .name(name() + ".global_TLB_hits") + .desc("Number of TLB hits") + ; + + globalNumTLBMisses + .name(name() + ".global_TLB_misses") + .desc("Number of TLB misses") + ; + + globalTLBMissRate + .name(name() + ".global_TLB_miss_rate") + .desc("TLB miss rate") + ; + + globalTLBMissRate = 100 * globalNumTLBMisses / globalNumTLBAccesses; + + avgReuseDistance + .name(name() + ".avg_reuse_distance") + .desc("avg. reuse distance over all pages (in ticks)") + ; + + } + + /** + * Do the TLB lookup for this coalesced request and schedule + * another event cycles later. + */ + + void + GpuTLB::issueTLBLookup(PacketPtr pkt) + { + assert(pkt); + assert(pkt->senderState); + + Addr virt_page_addr = roundDown(pkt->req->getVaddr(), + TheISA::PageBytes); + + TranslationState *sender_state = + safe_cast(pkt->senderState); + + bool update_stats = !sender_state->prefetch; + ThreadContext * tmp_tc = sender_state->tc; + + DPRINTF(GPUTLB, "Translation req. for virt. page addr %#x\n", + virt_page_addr); + + int req_cnt = sender_state->reqCnt.back(); + + if (update_stats) { + accessCycles -= (curTick() * req_cnt); + localCycles -= curTick(); + updatePageFootprint(virt_page_addr); + globalNumTLBAccesses += req_cnt; + } + + tlbOutcome lookup_outcome = TLB_MISS; + RequestPtr tmp_req = pkt->req; + + // Access the TLB and figure out if it's a hit or a miss. 
+ bool success = tlbLookup(tmp_req, tmp_tc, update_stats); + + if (success) { + lookup_outcome = TLB_HIT; + // Put the entry in SenderState + GpuTlbEntry *entry = lookup(tmp_req->getVaddr(), false); + assert(entry); + + sender_state->tlbEntry = + new GpuTlbEntry(0, entry->vaddr, entry->paddr, entry->valid); + + if (update_stats) { + // the reqCnt has an entry per level, so its size tells us + // which level we are in + sender_state->hitLevel = sender_state->reqCnt.size(); + globalNumTLBHits += req_cnt; + } + } else { + if (update_stats) + globalNumTLBMisses += req_cnt; + } + + /* + * We now know the TLB lookup outcome (if it's a hit or a miss), as well + * as the TLB access latency. + * + * We create and schedule a new TLBEvent which will help us take the + * appropriate actions (e.g., update TLB on a hit, send request to lower + * level TLB on a miss, or start a page walk if this was the last-level + * TLB) + */ + TLBEvent *tlb_event = + new TLBEvent(this, virt_page_addr, lookup_outcome, pkt); + + if (translationReturnEvent.count(virt_page_addr)) { + panic("Virtual Page Address %#x already has a return event\n", + virt_page_addr); + } + + translationReturnEvent[virt_page_addr] = tlb_event; + assert(tlb_event); + + DPRINTF(GPUTLB, "schedule translationReturnEvent @ curTick %d\n", + curTick() + this->ticks(hitLatency)); + + schedule(tlb_event, curTick() + this->ticks(hitLatency)); + } + + GpuTLB::TLBEvent::TLBEvent(GpuTLB* _tlb, Addr _addr, tlbOutcome tlb_outcome, + PacketPtr _pkt) + : Event(CPU_Tick_Pri), tlb(_tlb), virtPageAddr(_addr), + outcome(tlb_outcome), pkt(_pkt) + { + } + + /** + * Do Paging protection checks. If we encounter a page fault, then + * an assertion is fired. 
+ */ + void + GpuTLB::pagingProtectionChecks(ThreadContext *tc, PacketPtr pkt, + GpuTlbEntry * tlb_entry, Mode mode) + { + HandyM5Reg m5Reg = tc->readMiscRegNoEffect(MISCREG_M5_REG); + uint32_t flags = pkt->req->getFlags(); + bool storeCheck = flags & (StoreCheck << FlagShift); + + // Do paging protection checks. + bool inUser = (m5Reg.cpl == 3 && !(flags & (CPL0FlagBit << FlagShift))); + CR0 cr0 = tc->readMiscRegNoEffect(MISCREG_CR0); + + bool badWrite = (!tlb_entry->writable && (inUser || cr0.wp)); + + if ((inUser && !tlb_entry->user) || + (mode == BaseTLB::Write && badWrite)) { + // The page must have been present to get into the TLB in + // the first place. We'll assume the reserved bits are + // fine even though we're not checking them. + assert(false); + } + + if (storeCheck && badWrite) { + // This would fault if this were a write, so return a page + // fault that reflects that happening. + assert(false); + } + } + + /** + * handleTranslationReturn is called on a TLB hit, + * when a TLB miss returns or when a page fault returns. + * The latter calls handelHit with TLB miss as tlbOutcome. + */ + void + GpuTLB::handleTranslationReturn(Addr virt_page_addr, tlbOutcome tlb_outcome, + PacketPtr pkt) + { + + assert(pkt); + Addr vaddr = pkt->req->getVaddr(); + + TranslationState *sender_state = + safe_cast(pkt->senderState); + + ThreadContext *tc = sender_state->tc; + Mode mode = sender_state->tlbMode; + + GpuTlbEntry *local_entry, *new_entry; + + if (tlb_outcome == TLB_HIT) { + DPRINTF(GPUTLB, "Translation Done - TLB Hit for addr %#x\n", vaddr); + local_entry = sender_state->tlbEntry; + } else { + DPRINTF(GPUTLB, "Translation Done - TLB Miss for addr %#x\n", + vaddr); + + // We are returning either from a page walk or from a hit at a lower + // TLB level. The senderState should be "carrying" a pointer to the + // correct TLBEntry. 
+ new_entry = sender_state->tlbEntry; + assert(new_entry); + local_entry = new_entry; + + if (allocationPolicy) { + DPRINTF(GPUTLB, "allocating entry w/ addr %#x\n", + virt_page_addr); + + local_entry = insert(virt_page_addr, *new_entry); + } + + assert(local_entry); + } + + /** + * At this point the packet carries an up-to-date tlbEntry pointer + * in its senderState. + * Next step is to do the paging protection checks. + */ + DPRINTF(GPUTLB, "Entry found with vaddr %#x, doing protection checks " + "while paddr was %#x.\n", local_entry->vaddr, + local_entry->paddr); + + pagingProtectionChecks(tc, pkt, local_entry, mode); + int page_size = local_entry->size(); + Addr paddr = local_entry->paddr | (vaddr & (page_size - 1)); + DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, paddr); + + // Since this packet will be sent through the cpu side slave port, + // it must be converted to a response pkt if it is not one already + if (pkt->isRequest()) { + pkt->makeTimingResponse(); + } + + pkt->req->setPaddr(paddr); + + if (local_entry->uncacheable) { + pkt->req->setFlags(Request::UNCACHEABLE); + } + + //send packet back to coalescer + cpuSidePort[0]->sendTimingResp(pkt); + //schedule cleanup event + cleanupQueue.push(virt_page_addr); + + // schedule this only once per cycle. + // The check is required because we might have multiple translations + // returning the same cycle + // this is a maximum priority event and must be on the same cycle + // as the cleanup event in TLBCoalescer to avoid a race with + // IssueProbeEvent caused by TLBCoalescer::MemSidePort::recvReqRetry + if (!cleanupEvent.scheduled()) + schedule(cleanupEvent, curTick()); + } + + /** + * Here we take the appropriate actions based on the result of the + * TLB lookup. 
+ */ + void + GpuTLB::translationReturn(Addr virtPageAddr, tlbOutcome outcome, + PacketPtr pkt) + { + DPRINTF(GPUTLB, "Triggered TLBEvent for addr %#x\n", virtPageAddr); + + assert(translationReturnEvent[virtPageAddr]); + assert(pkt); + + TranslationState *tmp_sender_state = + safe_cast(pkt->senderState); + + int req_cnt = tmp_sender_state->reqCnt.back(); + bool update_stats = !tmp_sender_state->prefetch; + + + if (outcome == TLB_HIT) { + handleTranslationReturn(virtPageAddr, TLB_HIT, pkt); + + if (update_stats) { + accessCycles += (req_cnt * curTick()); + localCycles += curTick(); + } + + } else if (outcome == TLB_MISS) { + + DPRINTF(GPUTLB, "This is a TLB miss\n"); + if (update_stats) { + accessCycles += (req_cnt*curTick()); + localCycles += curTick(); + } + + if (hasMemSidePort) { + // the one cyle added here represent the delay from when we get + // the reply back till when we propagate it to the coalescer + // above. + if (update_stats) { + accessCycles += (req_cnt * 1); + localCycles += 1; + } + + /** + * There is a TLB below. Send the coalesced request. + * We actually send the very first packet of all the + * pending packets for this virtual page address. + */ + if (!memSidePort[0]->sendTimingReq(pkt)) { + DPRINTF(GPUTLB, "Failed sending translation request to " + "lower level TLB for addr %#x\n", virtPageAddr); + + memSidePort[0]->retries.push_back(pkt); + } else { + DPRINTF(GPUTLB, "Sent translation request to lower level " + "TLB for addr %#x\n", virtPageAddr); + } + } else { + //this is the last level TLB. 
Start a page walk + DPRINTF(GPUTLB, "Last level TLB - start a page walk for " + "addr %#x\n", virtPageAddr); + + if (update_stats) + pageTableCycles -= (req_cnt*curTick()); + + TLBEvent *tlb_event = translationReturnEvent[virtPageAddr]; + assert(tlb_event); + tlb_event->updateOutcome(PAGE_WALK); + schedule(tlb_event, curTick() + ticks(missLatency2)); + } + } else if (outcome == PAGE_WALK) { + if (update_stats) + pageTableCycles += (req_cnt*curTick()); + + // Need to access the page table and update the TLB + DPRINTF(GPUTLB, "Doing a page walk for address %#x\n", + virtPageAddr); + + TranslationState *sender_state = + safe_cast(pkt->senderState); + + Process *p = sender_state->tc->getProcessPtr(); + TlbEntry newEntry; + Addr vaddr = pkt->req->getVaddr(); + #ifndef NDEBUG + Addr alignedVaddr = p->pTable->pageAlign(vaddr); + assert(alignedVaddr == virtPageAddr); + #endif + bool success; + success = p->pTable->lookup(vaddr, newEntry); + if (!success && sender_state->tlbMode != BaseTLB::Execute) { + if (p->fixupStackFault(vaddr)) { + success = p->pTable->lookup(vaddr, newEntry); + } + } + + DPRINTF(GPUTLB, "Mapping %#x to %#x\n", alignedVaddr, + newEntry.pageStart()); + + sender_state->tlbEntry = + new GpuTlbEntry(0, newEntry.vaddr, newEntry.paddr, success); + + handleTranslationReturn(virtPageAddr, TLB_MISS, pkt); + } else if (outcome == MISS_RETURN) { + /** we add an extra cycle in the return path of the translation + * requests in between the various TLB levels. 
+ */ + handleTranslationReturn(virtPageAddr, TLB_MISS, pkt); + } else { + assert(false); + } + } + + void + GpuTLB::TLBEvent::process() + { + tlb->translationReturn(virtPageAddr, outcome, pkt); + } + + const char* + GpuTLB::TLBEvent::description() const + { + return "trigger translationDoneEvent"; + } + + void + GpuTLB::TLBEvent::updateOutcome(tlbOutcome _outcome) + { + outcome = _outcome; + } + + Addr + GpuTLB::TLBEvent::getTLBEventVaddr() + { + return virtPageAddr; + } + + /* + * recvTiming receives a coalesced timing request from a TLBCoalescer + * and it calls issueTLBLookup() + * It only rejects the packet if we have exceeded the max + * outstanding number of requests for the TLB + */ + bool + GpuTLB::CpuSidePort::recvTimingReq(PacketPtr pkt) + { + if (tlb->outstandingReqs < tlb->maxCoalescedReqs) { + tlb->issueTLBLookup(pkt); + // update number of outstanding translation requests + tlb->outstandingReqs++; + return true; + } else { + DPRINTF(GPUTLB, "Reached maxCoalescedReqs number %d\n", + tlb->outstandingReqs); + return false; + } + } + + /** + * handleFuncTranslationReturn is called on a TLB hit, + * when a TLB miss returns or when a page fault returns. + * It updates LRU, inserts the TLB entry on a miss + * depending on the allocation policy and does the required + * protection checks. It does NOT create a new packet to + * update the packet's addr; this is done in hsail-gpu code. 
+ */ + void + GpuTLB::handleFuncTranslationReturn(PacketPtr pkt, tlbOutcome tlb_outcome) + { + TranslationState *sender_state = + safe_cast(pkt->senderState); + + ThreadContext *tc = sender_state->tc; + Mode mode = sender_state->tlbMode; + Addr vaddr = pkt->req->getVaddr(); + + GpuTlbEntry *local_entry, *new_entry; + + if (tlb_outcome == TLB_HIT) { + DPRINTF(GPUTLB, "Functional Translation Done - TLB hit for addr " + "%#x\n", vaddr); + + local_entry = sender_state->tlbEntry; + } else { + DPRINTF(GPUTLB, "Functional Translation Done - TLB miss for addr " + "%#x\n", vaddr); + + // We are returning either from a page walk or from a hit at a lower + // TLB level. The senderState should be "carrying" a pointer to the + // correct TLBEntry. + new_entry = sender_state->tlbEntry; + assert(new_entry); + local_entry = new_entry; + + if (allocationPolicy) { + Addr virt_page_addr = roundDown(vaddr, TheISA::PageBytes); + + DPRINTF(GPUTLB, "allocating entry w/ addr %#x\n", + virt_page_addr); + + local_entry = insert(virt_page_addr, *new_entry); + } + + assert(local_entry); + } + + DPRINTF(GPUTLB, "Entry found with vaddr %#x, doing protection checks " + "while paddr was %#x.\n", local_entry->vaddr, + local_entry->paddr); + + // Do paging checks if it's a normal functional access. If it's for a + // prefetch, then sometimes you can try to prefetch something that won't + // pass protection. We don't actually want to fault becuase there is no + // demand access to deem this a violation. Just put it in the TLB and + // it will fault if indeed a future demand access touches it in + // violation. 
+ if (!sender_state->prefetch && sender_state->tlbEntry->valid) + pagingProtectionChecks(tc, pkt, local_entry, mode); + + int page_size = local_entry->size(); + Addr paddr = local_entry->paddr | (vaddr & (page_size - 1)); + DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, paddr); + + pkt->req->setPaddr(paddr); + + if (local_entry->uncacheable) + pkt->req->setFlags(Request::UNCACHEABLE); + } + + // This is used for atomic translations. Need to + // make it all happen during the same cycle. + void + GpuTLB::CpuSidePort::recvFunctional(PacketPtr pkt) + { + TranslationState *sender_state = + safe_cast(pkt->senderState); + + ThreadContext *tc = sender_state->tc; + bool update_stats = !sender_state->prefetch; + + Addr virt_page_addr = roundDown(pkt->req->getVaddr(), + TheISA::PageBytes); + + if (update_stats) + tlb->updatePageFootprint(virt_page_addr); + + // do the TLB lookup without updating the stats + bool success = tlb->tlbLookup(pkt->req, tc, update_stats); + tlbOutcome tlb_outcome = success ? 
TLB_HIT : TLB_MISS; + + // functional mode means no coalescing + // global metrics are the same as the local metrics + if (update_stats) { + tlb->globalNumTLBAccesses++; + + if (success) { + sender_state->hitLevel = sender_state->reqCnt.size(); + tlb->globalNumTLBHits++; + } + } + + if (!success) { + if (update_stats) + tlb->globalNumTLBMisses++; + if (tlb->hasMemSidePort) { + // there is a TLB below -> propagate down the TLB hierarchy + tlb->memSidePort[0]->sendFunctional(pkt); + // If no valid translation from a prefetch, then just return + if (sender_state->prefetch && !pkt->req->hasPaddr()) + return; + } else { + // Need to access the page table and update the TLB + DPRINTF(GPUTLB, "Doing a page walk for address %#x\n", + virt_page_addr); + + Process *p = tc->getProcessPtr(); + TlbEntry newEntry; + + Addr vaddr = pkt->req->getVaddr(); + #ifndef NDEBUG + Addr alignedVaddr = p->pTable->pageAlign(vaddr); + assert(alignedVaddr == virt_page_addr); + #endif + + bool success = p->pTable->lookup(vaddr, newEntry); + if (!success && sender_state->tlbMode != BaseTLB::Execute) { + if (p->fixupStackFault(vaddr)) + success = p->pTable->lookup(vaddr, newEntry); + } + + if (!sender_state->prefetch) { + // no PageFaults are permitted after + // the second page table lookup + assert(success); + + DPRINTF(GPUTLB, "Mapping %#x to %#x\n", alignedVaddr, + newEntry.pageStart()); + + sender_state->tlbEntry = new GpuTlbEntry(0, newEntry.vaddr, + newEntry.paddr, + success); + } else { + // If this was a prefetch, then do the normal thing if it + // was a successful translation. Otherwise, send an empty + // TLB entry back so that it can be figured out as empty and + // handled accordingly. 
+ if (success) { + DPRINTF(GPUTLB, "Mapping %#x to %#x\n", alignedVaddr, + newEntry.pageStart()); + + sender_state->tlbEntry = new GpuTlbEntry(0, + newEntry.vaddr, + newEntry.paddr, + success); + } else { + DPRINTF(GPUPrefetch, "Prefetch failed %#x\n", + alignedVaddr); + + sender_state->tlbEntry = new GpuTlbEntry(); + + return; + } + } + } + } else { + DPRINTF(GPUPrefetch, "Functional Hit for vaddr %#x\n", + tlb->lookup(pkt->req->getVaddr())); + + GpuTlbEntry *entry = tlb->lookup(pkt->req->getVaddr(), + update_stats); + + assert(entry); + + sender_state->tlbEntry = + new GpuTlbEntry(0, entry->vaddr, entry->paddr, entry->valid); + } + // This is the function that would populate pkt->req with the paddr of + // the translation. But if no translation happens (i.e Prefetch fails) + // then the early returns in the above code wiill keep this function + // from executing. + tlb->handleFuncTranslationReturn(pkt, tlb_outcome); + } + + void + GpuTLB::CpuSidePort::recvReqRetry() + { + // The CPUSidePort never sends anything but replies. No retries + // expected. + assert(false); + } + + AddrRangeList + GpuTLB::CpuSidePort::getAddrRanges() const + { + // currently not checked by the master + AddrRangeList ranges; + + return ranges; + } + + /** + * MemSidePort receives the packet back. + * We need to call the handleTranslationReturn + * and propagate up the hierarchy. + */ + bool + GpuTLB::MemSidePort::recvTimingResp(PacketPtr pkt) + { + Addr virt_page_addr = roundDown(pkt->req->getVaddr(), + TheISA::PageBytes); + + DPRINTF(GPUTLB, "MemSidePort recvTiming for virt_page_addr %#x\n", + virt_page_addr); + + TLBEvent *tlb_event = tlb->translationReturnEvent[virt_page_addr]; + assert(tlb_event); + assert(virt_page_addr == tlb_event->getTLBEventVaddr()); + + tlb_event->updateOutcome(MISS_RETURN); + tlb->schedule(tlb_event, curTick()+tlb->ticks(1)); + + return true; + } + + void + GpuTLB::MemSidePort::recvReqRetry() + { + // No retries should reach the TLB. 
The retries + // should only reach the TLBCoalescer. + assert(false); + } + + void + GpuTLB::cleanup() + { + while (!cleanupQueue.empty()) { + Addr cleanup_addr = cleanupQueue.front(); + cleanupQueue.pop(); + + // delete TLBEvent + TLBEvent * old_tlb_event = translationReturnEvent[cleanup_addr]; + delete old_tlb_event; + translationReturnEvent.erase(cleanup_addr); + + // update number of outstanding requests + outstandingReqs--; + } + + /** the higher level coalescer should retry if it has + * any pending requests. + */ + for (int i = 0; i < cpuSidePort.size(); ++i) { + cpuSidePort[i]->sendRetryReq(); + } + } + + void + GpuTLB::updatePageFootprint(Addr virt_page_addr) + { + + std::pair ret; + + AccessInfo tmp_access_info; + tmp_access_info.lastTimeAccessed = 0; + tmp_access_info.accessesPerPage = 0; + tmp_access_info.totalReuseDistance = 0; + tmp_access_info.sumDistance = 0; + tmp_access_info.meanDistance = 0; + + ret = TLBFootprint.insert(AccessPatternTable::value_type(virt_page_addr, + tmp_access_info)); + + bool first_page_access = ret.second; + + if (first_page_access) { + numUniquePages++; + } else { + int accessed_before; + accessed_before = curTick() - ret.first->second.lastTimeAccessed; + ret.first->second.totalReuseDistance += accessed_before; + } + + ret.first->second.accessesPerPage++; + ret.first->second.lastTimeAccessed = curTick(); + + if (accessDistance) { + ret.first->second.localTLBAccesses + .push_back(localNumTLBAccesses.value()); + } + } + + void + GpuTLB::exitCallback() + { + std::ostream *page_stat_file = nullptr; + + if (accessDistance) { + + // print per page statistics to a separate file (.csv format) + // simout is the gem5 output directory (default is m5out or the one + // specified with -d + page_stat_file = simout.create(name().c_str()); + + // print header + *page_stat_file << "page,max_access_distance,mean_access_distance, " + << "stddev_distance" << std::endl; + } + + // update avg. 
reuse distance footprint + AccessPatternTable::iterator iter, iter_begin, iter_end; + unsigned int sum_avg_reuse_distance_per_page = 0; + + // iterate through all pages seen by this TLB + for (iter = TLBFootprint.begin(); iter != TLBFootprint.end(); iter++) { + sum_avg_reuse_distance_per_page += iter->second.totalReuseDistance / + iter->second.accessesPerPage; + + if (accessDistance) { + unsigned int tmp = iter->second.localTLBAccesses[0]; + unsigned int prev = tmp; + + for (int i = 0; i < iter->second.localTLBAccesses.size(); ++i) { + if (i) { + tmp = prev + 1; + } + + prev = iter->second.localTLBAccesses[i]; + // update the localTLBAccesses value + // with the actual differece + iter->second.localTLBAccesses[i] -= tmp; + // compute the sum of AccessDistance per page + // used later for mean + iter->second.sumDistance += + iter->second.localTLBAccesses[i]; + } + + iter->second.meanDistance = + iter->second.sumDistance / iter->second.accessesPerPage; + + // compute std_dev and max (we need a second round because we + // need to know the mean value + unsigned int max_distance = 0; + unsigned int stddev_distance = 0; + + for (int i = 0; i < iter->second.localTLBAccesses.size(); ++i) { + unsigned int tmp_access_distance = + iter->second.localTLBAccesses[i]; + + if (tmp_access_distance > max_distance) { + max_distance = tmp_access_distance; + } + + unsigned int diff = + tmp_access_distance - iter->second.meanDistance; + stddev_distance += pow(diff, 2); + + } + + stddev_distance = + sqrt(stddev_distance/iter->second.accessesPerPage); + + if (page_stat_file) { + *page_stat_file << std::hex << iter->first << ","; + *page_stat_file << std::dec << max_distance << ","; + *page_stat_file << std::dec << iter->second.meanDistance + << ","; + *page_stat_file << std::dec << stddev_distance; + *page_stat_file << std::endl; + } + + // erase the localTLBAccesses array + iter->second.localTLBAccesses.clear(); + } + } + + if (!TLBFootprint.empty()) { + avgReuseDistance = + 
sum_avg_reuse_distance_per_page / TLBFootprint.size(); + } + + //clear the TLBFootprint map + TLBFootprint.clear(); + } +} // namespace X86ISA + +X86ISA::GpuTLB* +X86GPUTLBParams::create() +{ + return new X86ISA::GpuTLB(this); +} + diff --git a/src/gpu-compute/gpu_tlb.hh b/src/gpu-compute/gpu_tlb.hh new file mode 100644 index 000000000..3549c598b --- /dev/null +++ b/src/gpu-compute/gpu_tlb.hh @@ -0,0 +1,465 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Lisa Hsu + */ + +#ifndef __GPU_TLB_HH__ +#define __GPU_TLB_HH__ + +#include +#include +#include +#include +#include + +#include "arch/generic/tlb.hh" +#include "arch/x86/pagetable.hh" +#include "arch/x86/pagetable_walker.hh" +#include "arch/x86/regs/segment.hh" +#include "base/callback.hh" +#include "base/misc.hh" +#include "base/statistics.hh" +#include "gpu-compute/compute_unit.hh" +#include "mem/mem_object.hh" +#include "mem/port.hh" +#include "mem/request.hh" +#include "params/X86GPUTLB.hh" +#include "sim/sim_object.hh" + +class BaseTLB; +class Packet; +class ThreadContext; + +namespace X86ISA +{ + class GpuTlbEntry : public TlbEntry + { + public: + GpuTlbEntry(Addr asn, Addr _vaddr, Addr _paddr, bool _valid) + : TlbEntry(asn, _vaddr, _paddr, false, false), valid(_valid) { } + + GpuTlbEntry() : TlbEntry() { } + + bool valid; + }; + + class GpuTLB : public MemObject + { + protected: + friend class Walker; + + typedef std::list EntryList; + + uint32_t configAddress; + + // TLB clock: will inherit clock from shader's clock period in terms + // of nuber of ticks of curTime (aka global simulation clock) + // The assignment of TLB clock from shader clock is done in the python + // config files. + int clock; + + public: + // clock related functions ; maps to-and-from Simulation ticks and + // object clocks. 
+ Tick frequency() const { return SimClock::Frequency / clock; } + + Tick + ticks(int numCycles) const + { + return (Tick)clock * numCycles; + } + + Tick curCycle() const { return curTick() / clock; } + Tick tickToCycles(Tick val) const { return val / clock;} + + typedef X86GPUTLBParams Params; + GpuTLB(const Params *p); + ~GpuTLB(); + + typedef enum BaseTLB::Mode Mode; + + class Translation + { + public: + virtual ~Translation() { } + + /** + * Signal that the translation has been delayed due to a hw page + * table walk. + */ + virtual void markDelayed() = 0; + + /** + * The memory for this object may be dynamically allocated, and it + * may be responsible for cleaning itslef up which will happen in + * this function. Once it's called the object is no longer valid. + */ + virtual void finish(Fault fault, RequestPtr req, ThreadContext *tc, + Mode mode) = 0; + }; + + void dumpAll(); + GpuTlbEntry *lookup(Addr va, bool update_lru=true); + void setConfigAddress(uint32_t addr); + + protected: + EntryList::iterator lookupIt(Addr va, bool update_lru=true); + Walker *walker; + + public: + Walker *getWalker(); + void invalidateAll(); + void invalidateNonGlobal(); + void demapPage(Addr va, uint64_t asn); + + protected: + int size; + int assoc; + int numSets; + + /** + * true if this is a fully-associative TLB + */ + bool FA; + Addr setMask; + + /** + * Allocation Policy: true if we always allocate on a hit, false + * otherwise. Default is true. + */ + bool allocationPolicy; + + /** + * if true, then this is not the last level TLB + */ + bool hasMemSidePort; + + /** + * Print out accessDistance stats. One stat file + * per TLB. + */ + bool accessDistance; + + GpuTlbEntry *tlb; + + /* + * It's a per-set list. As long as we have not reached + * the full capacity of the given set, grab an entry from + * the freeList. + */ + std::vector freeList; + + /** + * An entryList per set is the equivalent of an LRU stack; + * it's used to guide replacement decisions. 
The head of the list + * contains the MRU TLB entry of the given set. If the freeList + * for this set is empty, the last element of the list + * is evicted (i.e., dropped on the floor). + */ + std::vector entryList; + + Fault translateInt(RequestPtr req, ThreadContext *tc); + + Fault translate(RequestPtr req, ThreadContext *tc, + Translation *translation, Mode mode, bool &delayedResponse, + bool timing, int &latency); + + public: + // latencies for a TLB hit, miss and page fault + int hitLatency; + int missLatency1; + int missLatency2; + + // local_stats are as seen from the TLB + // without taking into account coalescing + Stats::Scalar localNumTLBAccesses; + Stats::Scalar localNumTLBHits; + Stats::Scalar localNumTLBMisses; + Stats::Formula localTLBMissRate; + + // global_stats are as seen from the + // CU's perspective taking into account + // all coalesced requests. + Stats::Scalar globalNumTLBAccesses; + Stats::Scalar globalNumTLBHits; + Stats::Scalar globalNumTLBMisses; + Stats::Formula globalTLBMissRate; + + // from the CU perspective (global) + Stats::Scalar accessCycles; + // from the CU perspective (global) + Stats::Scalar pageTableCycles; + Stats::Scalar numUniquePages; + // from the perspective of this TLB + Stats::Scalar localCycles; + // from the perspective of this TLB + Stats::Formula localLatency; + // I take the avg. per page and then + // the avg. over all pages. 
+ Stats::Scalar avgReuseDistance; + + void regStats(); + void updatePageFootprint(Addr virt_page_addr); + void printAccessPattern(); + + + Fault translateAtomic(RequestPtr req, ThreadContext *tc, Mode mode, + int &latency); + + void translateTiming(RequestPtr req, ThreadContext *tc, + Translation *translation, Mode mode, + int &latency); + + Tick doMmuRegRead(ThreadContext *tc, Packet *pkt); + Tick doMmuRegWrite(ThreadContext *tc, Packet *pkt); + + GpuTlbEntry *insert(Addr vpn, GpuTlbEntry &entry); + + // Checkpointing + virtual void serialize(CheckpointOut& cp) const; + virtual void unserialize(CheckpointIn& cp); + void issueTranslation(); + enum tlbOutcome {TLB_HIT, TLB_MISS, PAGE_WALK, MISS_RETURN}; + bool tlbLookup(RequestPtr req, ThreadContext *tc, bool update_stats); + + void handleTranslationReturn(Addr addr, tlbOutcome outcome, + PacketPtr pkt); + + void handleFuncTranslationReturn(PacketPtr pkt, tlbOutcome outcome); + + void pagingProtectionChecks(ThreadContext *tc, PacketPtr pkt, + GpuTlbEntry *tlb_entry, Mode mode); + + void updatePhysAddresses(Addr virt_page_addr, GpuTlbEntry *tlb_entry, + Addr phys_page_addr); + + void issueTLBLookup(PacketPtr pkt); + + // CpuSidePort is the TLB Port closer to the CPU/CU side + class CpuSidePort : public SlavePort + { + public: + CpuSidePort(const std::string &_name, GpuTLB * gpu_TLB, + PortID _index) + : SlavePort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { } + + protected: + GpuTLB *tlb; + int index; + + virtual bool recvTimingReq(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt) { return 0; } + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRangeChange() { } + virtual void recvReqRetry(); + virtual void recvRespRetry() { assert(false); } + virtual AddrRangeList getAddrRanges() const; + }; + + /** + * MemSidePort is the TLB Port closer to the memory side + * If this is a last level TLB then this port will not be connected. 
+ * + * Future action item: if we ever do real page walks, then this port + * should be connected to a RubyPort. + */ + class MemSidePort : public MasterPort + { + public: + MemSidePort(const std::string &_name, GpuTLB * gpu_TLB, + PortID _index) + : MasterPort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { } + + std::deque retries; + + protected: + GpuTLB *tlb; + int index; + + virtual bool recvTimingResp(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt) { return 0; } + virtual void recvFunctional(PacketPtr pkt) { } + virtual void recvRangeChange() { } + virtual void recvReqRetry(); + }; + + // TLB ports on the cpu Side + std::vector cpuSidePort; + // TLB ports on the memory side + std::vector memSidePort; + + BaseMasterPort &getMasterPort(const std::string &if_name, + PortID idx=InvalidPortID); + + BaseSlavePort &getSlavePort(const std::string &if_name, + PortID idx=InvalidPortID); + + /** + * TLB TranslationState: this currently is a somewhat bastardization of + * the usage of SenderState, whereby the receiver of a packet is not + * usually supposed to need to look at the contents of the senderState, + * you're really only supposed to look at what you pushed on, pop it + * off, and send it back. + * + * However, since there is state that we want to pass to the TLBs using + * the send/recv Timing/Functional/etc. APIs, which don't allow for new + * arguments, we need a common TLB senderState to pass between TLBs, + * both "forwards" and "backwards." + * + * So, basically, the rule is that any packet received by a TLB port + * (cpuside OR memside) must be safely castable to a TranslationState. + */ + + struct TranslationState : public Packet::SenderState + { + // TLB mode, read or write + Mode tlbMode; + // Thread context associated with this req + ThreadContext *tc; + + /* + * TLB entry to be populated and passed back and filled in + * previous TLBs. Equivalent to the data cache concept of + * "data return." 
+ */ + GpuTlbEntry *tlbEntry; + // Is this a TLB prefetch request? + bool prefetch; + // When was the req for this translation issued + uint64_t issueTime; + // Remember where this came from + std::vectorports; + + // keep track of #uncoalesced reqs per packet per TLB level; + // reqCnt per level >= reqCnt higher level + std::vector reqCnt; + // TLB level this packet hit in; 0 if it hit in the page table + int hitLevel; + Packet::SenderState *saved; + + TranslationState(Mode tlb_mode, ThreadContext *_tc, + bool _prefetch=false, + Packet::SenderState *_saved=nullptr) + : tlbMode(tlb_mode), tc(_tc), tlbEntry(nullptr), + prefetch(_prefetch), issueTime(0), + hitLevel(0),saved(_saved) { } + }; + + // maximum number of permitted coalesced requests per cycle + int maxCoalescedReqs; + + // Current number of outstandings coalesced requests. + // Should be <= maxCoalescedReqs + int outstandingReqs; + + /** + * A TLBEvent is scheduled after the TLB lookup and helps us take the + * appropriate actions: + * (e.g., update TLB on a hit, + * send request to lower level TLB on a miss, + * or start a page walk if this was the last-level TLB). + */ + void translationReturn(Addr virtPageAddr, tlbOutcome outcome, + PacketPtr pkt); + + class TLBEvent : public Event + { + private: + GpuTLB *tlb; + Addr virtPageAddr; + /** + * outcome can be TLB_HIT, TLB_MISS, or PAGE_WALK + */ + tlbOutcome outcome; + PacketPtr pkt; + + public: + TLBEvent(GpuTLB *_tlb, Addr _addr, tlbOutcome outcome, + PacketPtr _pkt); + + void process(); + const char *description() const; + + // updateOutcome updates the tlbOutcome of a TLBEvent + void updateOutcome(tlbOutcome _outcome); + Addr getTLBEventVaddr(); + }; + + std::unordered_map translationReturnEvent; + + // this FIFO queue keeps track of the virt. 
page addresses + // that are pending cleanup + std::queue cleanupQueue; + + // the cleanupEvent is scheduled after a TLBEvent triggers in order to + // free memory and do the required clean-up + void cleanup(); + + EventWrapper cleanupEvent; + + /** + * This hash map will use the virtual page address as a key + * and will keep track of total number of accesses per page + */ + + struct AccessInfo + { + unsigned int lastTimeAccessed; // last access to this page + unsigned int accessesPerPage; + // need to divide it by accessesPerPage at the end + unsigned int totalReuseDistance; + + /** + * The field below will help us compute the access distance, + * that is the number of (coalesced) TLB accesses that + * happened in between each access to this page + * + * localTLBAccesses[x] is the value of localTLBNumAccesses + * when the page was accessed for the th time + */ + std::vector localTLBAccesses; + unsigned int sumDistance; + unsigned int meanDistance; + }; + + typedef std::unordered_map AccessPatternTable; + AccessPatternTable TLBFootprint; + + // Called at the end of simulation to dump page access stats. + void exitCallback(); + + EventWrapper exitEvent; + }; +} + +#endif // __GPU_TLB_HH__ diff --git a/src/gpu-compute/hsa_code.hh b/src/gpu-compute/hsa_code.hh new file mode 100644 index 000000000..9f358e23c --- /dev/null +++ b/src/gpu-compute/hsa_code.hh @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#ifndef __HSA_CODE_HH__ +#define __HSA_CODE_HH__ + +#include +#include + +#include "arch/gpu_types.hh" +#include "config/the_gpu_isa.hh" + +class HsaKernelInfo; + +/* @class HsaCode + * base code object for the set of HSA kernels associated + * with a single application. this class provides the common + * methods for creating, accessing, and storing information + * about kernel and variable symbols, symbol name, memory + * segment sizes, and instruction count, etc. 
+ */ + +class HsaCode +{ + public: + HsaCode(const std::string &name) : readonly_data(nullptr), funcarg_size(0), + _name(name) + { + } + + enum class MemorySegment { + NONE, + FLAT, + GLOBAL, + READONLY, + KERNARG, + GROUP, + PRIVATE, + SPILL, + ARG, + EXTSPACE0 + }; + + const std::string& name() const { return _name; } + int numInsts() const { return _insts.size(); } + std::vector* insts() { return &_insts; } + + void + setReadonlyData(uint8_t *_readonly_data) + { + readonly_data = _readonly_data; + } + + virtual int getSize(MemorySegment segment) const = 0; + virtual void generateHsaKernelInfo(HsaKernelInfo *hsaKernelInfo) const = 0; + + uint8_t *readonly_data; + int funcarg_size; + + protected: + // An array that stores instruction indices (0 through kernel size) + // for a kernel passed to code object constructor as an argument. + std::vector _insts; + + private: + const std::string _name; +}; + +#endif // __HSA_CODE_HH__ diff --git a/src/gpu-compute/hsa_kernel_info.hh b/src/gpu-compute/hsa_kernel_info.hh new file mode 100644 index 000000000..396913dac --- /dev/null +++ b/src/gpu-compute/hsa_kernel_info.hh @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#ifndef __HSA_KERNEL_INFO_HH__ +#define __HSA_KERNEL_INFO_HH__ + +// This file defines the public interface between the HSA emulated +// driver and application programs. 
+ +#include + +static const int HSA_GET_SIZES = 0x4801; +static const int HSA_GET_KINFO = 0x4802; +static const int HSA_GET_STRINGS = 0x4803; +static const int HSA_GET_CODE = 0x4804; +static const int HSA_GET_READONLY_DATA = 0x4805; +static const int HSA_GET_CU_CNT = 0x4806; +static const int HSA_GET_VSZ = 0x4807; + +// Return value (via buffer ptr) for HSA_GET_SIZES +struct HsaDriverSizes +{ + uint32_t num_kernels; + uint32_t string_table_size; + uint32_t code_size; + uint32_t readonly_size; +}; + +// HSA_GET_KINFO returns an array of num_kernels of these structs +struct HsaKernelInfo +{ + // byte offset into string table + uint32_t name_offs; + // byte offset into code array + uint32_t code_offs; + uint32_t static_lds_size; + uint32_t private_mem_size; + uint32_t spill_mem_size; + // Number of s registers + uint32_t sRegCount; + // Number of d registers + uint32_t dRegCount; + // Number of c registers + uint32_t cRegCount; +}; + +#endif // __HSA_KERNEL_INFO_HH__ diff --git a/src/gpu-compute/hsa_object.cc b/src/gpu-compute/hsa_object.cc new file mode 100644 index 000000000..91dfb160e --- /dev/null +++ b/src/gpu-compute/hsa_object.cc @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#include "gpu-compute/hsa_object.hh" + +#include + +#include "gpu-compute/brig_object.hh" + +HsaObject::HsaObject(const std::string &fname) + : readonlyData(nullptr), filename(fname) +{ +} + +HsaObject* +HsaObject::createHsaObject(const std::string &fname) +{ + HsaObject *hsaObj = nullptr; + uint8_t *file_data = nullptr; + int file_length = 0; + + std::ifstream code_file(fname, std::ifstream::ate | std::ifstream::in | + std::ifstream::binary); + + assert(code_file.is_open()); + assert(code_file.good()); + + file_length = code_file.tellg(); + code_file.seekg(0, code_file.beg); + file_data = new uint8_t[file_length]; + code_file.read((char*)file_data, file_length); + code_file.close(); + + for (const auto &tryFile : tryFileFuncs) { + if ((hsaObj = tryFile(fname, file_length, file_data))) { + return hsaObj; + } + } + + delete[] file_data; + fatal("Unknown HSA object type for file: %s.\n", fname); + + return nullptr; +} diff --git 
a/src/gpu-compute/hsa_object.hh b/src/gpu-compute/hsa_object.hh new file mode 100644 index 000000000..1f08f5d80 --- /dev/null +++ b/src/gpu-compute/hsa_object.hh @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Anthony Gutierrez + */ + +#ifndef __HSA_OBJECT_HH__ +#define __HSA_OBJECT_HH__ + +#include +#include +#include + +class HsaCode; + +/* @class HsaObject + * base loader object for HSA kernels. this class provides + * the base method definitions for loading, storing, and + * accessing HSA kernel objects into the simulator. + */ + +class HsaObject +{ + public: + HsaObject(const std::string &fileName); + + static HsaObject* createHsaObject(const std::string &fname); + static std::vector> tryFileFuncs; + + virtual HsaCode* getKernel(const std::string &name) const = 0; + virtual HsaCode* getKernel(int i) const = 0; + virtual HsaCode* getFunction(const std::string &name) const = 0; + virtual int numKernels() const = 0; + + const std::string& name() const { return filename; } + + uint8_t *readonlyData; + + + protected: + const std::string filename; +}; + +#endif // __HSA_OBJECT_HH__ diff --git a/src/gpu-compute/hsail_code.cc b/src/gpu-compute/hsail_code.cc new file mode 100644 index 000000000..b0ddf0161 --- /dev/null +++ b/src/gpu-compute/hsail_code.cc @@ -0,0 +1,453 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#include "gpu-compute/hsail_code.hh" + +#include "arch/gpu_types.hh" +#include "arch/hsail/Brig.h" +#include "arch/hsail/operand.hh" +#include "config/the_gpu_isa.hh" +#include "debug/BRIG.hh" +#include "debug/HSAILObject.hh" +#include "gpu-compute/brig_object.hh" +#include "gpu-compute/gpu_static_inst.hh" +#include "gpu-compute/kernel_cfg.hh" + +using namespace Brig; + +int getBrigDataTypeBytes(BrigType16_t t); + +HsailCode::HsailCode(const std::string &name_str) + : HsaCode(name_str), private_size(-1), readonly_size(-1) +{ +} + +void +HsailCode::init(const BrigDirectiveExecutable *code_dir, const BrigObject *obj, + StorageMap *objStorageMap) +{ + storageMap = objStorageMap; + + // set pointer so that decoding process can find this kernel context when + // needed + obj->currentCode = this; + + if (code_dir->base.kind != BRIG_KIND_DIRECTIVE_FUNCTION && + code_dir->base.kind != BRIG_KIND_DIRECTIVE_KERNEL) { + fatal("unexpected directive kind %d inside kernel/function init\n", + code_dir->base.kind); + } + + DPRINTF(HSAILObject, "Initializing code, first code block entry is: %d\n", + code_dir->firstCodeBlockEntry); + + // clear these 
static vars so we can properly track the max index + // for this kernel + SRegOperand::maxRegIdx = 0; + DRegOperand::maxRegIdx = 0; + CRegOperand::maxRegIdx = 0; + setPrivateSize(0); + + const BrigBase *entryPtr = brigNext((BrigBase*)code_dir); + const BrigBase *endPtr = + obj->getCodeSectionEntry(code_dir->nextModuleEntry); + + int inst_idx = 0; + std::vector instructions; + int funcarg_size_scope = 0; + + // walk through instructions in code section and directives in + // directive section in parallel, processing directives that apply + // when we reach the relevant code point. + while (entryPtr < endPtr) { + switch (entryPtr->kind) { + case BRIG_KIND_DIRECTIVE_VARIABLE: + { + const BrigDirectiveVariable *sym = + (const BrigDirectiveVariable*)entryPtr; + + DPRINTF(HSAILObject,"Initializing code, directive is " + "kind_variable, symbol is: %s\n", + obj->getString(sym->name)); + + StorageElement *se = storageMap->addSymbol(sym, obj); + + if (sym->segment == BRIG_SEGMENT_PRIVATE) { + setPrivateSize(se->size); + } else { // spill + funcarg_size_scope += se->size; + } + } + break; + + case BRIG_KIND_DIRECTIVE_LABEL: + { + const BrigDirectiveLabel *lbl = + (const BrigDirectiveLabel*)entryPtr; + + DPRINTF(HSAILObject,"Initializing code, directive is " + "kind_label, label is: %s \n", + obj->getString(lbl->name)); + + labelMap.addLabel(lbl, inst_idx, obj); + } + break; + + case BRIG_KIND_DIRECTIVE_PRAGMA: + { + DPRINTF(HSAILObject, "Initializing code, directive " + "is kind_pragma\n"); + } + break; + + case BRIG_KIND_DIRECTIVE_COMMENT: + { + DPRINTF(HSAILObject, "Initializing code, directive is " + "kind_comment\n"); + } + break; + + case BRIG_KIND_DIRECTIVE_ARG_BLOCK_START: + { + DPRINTF(HSAILObject, "Initializing code, directive is " + "kind_arg_block_start\n"); + + storageMap->resetOffset(BRIG_SEGMENT_ARG); + funcarg_size_scope = 0; + } + break; + + case BRIG_KIND_DIRECTIVE_ARG_BLOCK_END: + { + DPRINTF(HSAILObject, "Initializing code, directive is " + 
"kind_arg_block_end\n"); + + funcarg_size = funcarg_size < funcarg_size_scope ? + funcarg_size_scope : funcarg_size; + } + break; + + case BRIG_KIND_DIRECTIVE_END: + DPRINTF(HSAILObject, "Initializing code, dircetive is " + "kind_end\n"); + + break; + + default: + if (entryPtr->kind >= BRIG_KIND_INST_BEGIN && + entryPtr->kind <= BRIG_KIND_INST_END) { + + BrigInstBase *instPtr = (BrigInstBase*)entryPtr; + TheGpuISA::MachInst machInst = { instPtr, obj }; + GPUStaticInst *iptr = decoder.decode(machInst); + + if (iptr) { + DPRINTF(HSAILObject, "Initializing code, processing inst " + "#%d idx %d: OPCODE=%d\n", + inst_idx, _insts.size(), instPtr->opcode); + + TheGpuISA::RawMachInst inst_num = decoder.saveInst(iptr); + iptr->instNum(inst_idx); + _insts.push_back(inst_num); + instructions.push_back(iptr); + } + ++inst_idx; + } else if (entryPtr->kind >= BRIG_KIND_OPERAND_BEGIN && + entryPtr->kind < BRIG_KIND_OPERAND_END) { + warn("unexpected operand entry in code segment\n"); + } else { + // there are surely some more cases we will need to handle, + // but we'll deal with them as we find them. 
+ fatal("unexpected directive kind %d inside kernel scope\n", + entryPtr->kind); + } + } + + entryPtr = brigNext(entryPtr); + } + + // compute Control Flow Graph for current kernel + ControlFlowInfo::assignImmediatePostDominators(instructions); + + max_sreg = SRegOperand::maxRegIdx; + max_dreg = DRegOperand::maxRegIdx; + max_creg = CRegOperand::maxRegIdx; + + obj->currentCode = nullptr; +} + +HsailCode::HsailCode(const std::string &name_str, + const BrigDirectiveExecutable *code_dir, + const BrigObject *obj, StorageMap *objStorageMap) + : HsaCode(name_str), private_size(-1), readonly_size(-1) +{ + init(code_dir, obj, objStorageMap); +} + +void +LabelMap::addLabel(const Brig::BrigDirectiveLabel *lblDir, int inst_index, + const BrigObject *obj) +{ + std::string lbl_name = obj->getString(lblDir->name); + Label &lbl = map[lbl_name]; + + if (lbl.defined()) { + fatal("Attempt to redefine existing label %s\n", lbl_name); + } + + lbl.define(lbl_name, inst_index); + DPRINTF(HSAILObject, "label %s = %d\n", lbl_name, inst_index); +} + +Label* +LabelMap::refLabel(const Brig::BrigDirectiveLabel *lblDir, + const BrigObject *obj) +{ + std::string name = obj->getString(lblDir->name); + Label &lbl = map[name]; + lbl.checkName(name); + + return &lbl; +} + +int +getBrigDataTypeBytes(BrigType16_t t) +{ + switch (t) { + case BRIG_TYPE_S8: + case BRIG_TYPE_U8: + case BRIG_TYPE_B8: + return 1; + + case BRIG_TYPE_S16: + case BRIG_TYPE_U16: + case BRIG_TYPE_B16: + case BRIG_TYPE_F16: + return 2; + + case BRIG_TYPE_S32: + case BRIG_TYPE_U32: + case BRIG_TYPE_B32: + case BRIG_TYPE_F32: + return 4; + + case BRIG_TYPE_S64: + case BRIG_TYPE_U64: + case BRIG_TYPE_B64: + case BRIG_TYPE_F64: + return 8; + + case BRIG_TYPE_B1: + + default: + fatal("unhandled symbol data type %d", t); + return 0; + } +} + +StorageElement* +StorageSpace::addSymbol(const BrigDirectiveVariable *sym, + const BrigObject *obj) +{ + const char *sym_name = obj->getString(sym->name); + uint64_t size = 0; + uint64_t offset = 
0; + + if (sym->type & BRIG_TYPE_ARRAY) { + size = getBrigDataTypeBytes(sym->type & ~BRIG_TYPE_ARRAY); + size *= (((uint64_t)sym->dim.hi) << 32 | (uint64_t)sym->dim.lo); + + offset = roundUp(nextOffset, getBrigDataTypeBytes(sym->type & + ~BRIG_TYPE_ARRAY)); + } else { + size = getBrigDataTypeBytes(sym->type); + offset = roundUp(nextOffset, getBrigDataTypeBytes(sym->type)); + } + + nextOffset = offset + size; + + DPRINTF(HSAILObject, "Adding %s SYMBOL %s size %d offset 0x%x, init: %d\n", + segmentNames[segment], sym_name, size, offset, sym->init); + + StorageElement* se = new StorageElement(sym_name, offset, size, sym); + elements.push_back(se); + elements_by_addr.insert(AddrRange(offset, offset + size - 1), se); + elements_by_brigptr[sym] = se; + + return se; +} + +StorageElement* +StorageSpace::findSymbol(std::string name) +{ + for (auto it : elements) { + if (it->name == name) { + return it; + } + } + + return nullptr; +} + +StorageElement* +StorageSpace::findSymbol(uint64_t addr) +{ + assert(elements_by_addr.size() > 0); + + auto se = elements_by_addr.find(addr); + + if (se == elements_by_addr.end()) { + return nullptr; + } else { + return se->second; + } +} + +StorageElement* +StorageSpace::findSymbol(const BrigDirectiveVariable *brigptr) +{ + assert(elements_by_brigptr.size() > 0); + + auto se = elements_by_brigptr.find(brigptr); + + if (se == elements_by_brigptr.end()) { + return nullptr; + } else { + return se->second; + } +} + +StorageMap::StorageMap(StorageMap *outerScope) + : outerScopeMap(outerScope) +{ + for (int i = 0; i < NumSegments; ++i) + space[i] = new StorageSpace((BrigSegment)i); +} + +StorageElement* +StorageMap::addSymbol(const BrigDirectiveVariable *sym, const BrigObject *obj) +{ + BrigSegment8_t segment = sym->segment; + + assert(segment >= Brig::BRIG_SEGMENT_FLAT); + assert(segment < NumSegments); + + return space[segment]->addSymbol(sym, obj); +} + +int +StorageMap::getSize(Brig::BrigSegment segment) +{ + assert(segment > 
Brig::BRIG_SEGMENT_GLOBAL); + assert(segment < NumSegments); + + if (segment != Brig::BRIG_SEGMENT_GROUP && + segment != Brig::BRIG_SEGMENT_READONLY) { + return space[segment]->getSize(); + } else { + int ret = space[segment]->getSize(); + + if (outerScopeMap) { + ret += outerScopeMap->getSize(segment); + } + + return ret; + } +} + +void +StorageMap::resetOffset(Brig::BrigSegment segment) +{ + space[segment]->resetOffset(); +} + +StorageElement* +StorageMap::findSymbol(BrigSegment segment, std::string name) +{ + StorageElement *se = space[segment]->findSymbol(name); + + if (se) + return se; + + if (outerScopeMap) + return outerScopeMap->findSymbol(segment, name); + + return nullptr; +} + +StorageElement* +StorageMap::findSymbol(Brig::BrigSegment segment, uint64_t addr) +{ + StorageSpace *sp = space[segment]; + + if (!sp) { + // there is no memory in segment? + return nullptr; + } + + StorageElement *se = sp->findSymbol(addr); + + if (se) + return se; + + if (outerScopeMap) + return outerScopeMap->findSymbol(segment, addr); + + return nullptr; + +} + +StorageElement* +StorageMap::findSymbol(Brig::BrigSegment segment, + const BrigDirectiveVariable *brigptr) +{ + StorageSpace *sp = space[segment]; + + if (!sp) { + // there is no memory in segment? + return nullptr; + } + + StorageElement *se = sp->findSymbol(brigptr); + + if (se) + return se; + + if (outerScopeMap) + return outerScopeMap->findSymbol(segment, brigptr); + + return nullptr; + +} diff --git a/src/gpu-compute/hsail_code.hh b/src/gpu-compute/hsail_code.hh new file mode 100644 index 000000000..d9fbcc577 --- /dev/null +++ b/src/gpu-compute/hsail_code.hh @@ -0,0 +1,447 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Steve Reinhardt + */ + +#ifndef __HSAIL_CODE_HH__ +#define __HSAIL_CODE_HH__ + +#include +#include +#include +#include +#include + +#include "arch/gpu_decoder.hh" +#include "arch/hsail/Brig.h" +#include "base/addr_range_map.hh" +#include "base/intmath.hh" +#include "config/the_gpu_isa.hh" +#include "gpu-compute/hsa_code.hh" +#include "gpu-compute/hsa_kernel_info.hh" +#include "gpu-compute/misc.hh" + +class BrigObject; +class GPUStaticInst; + +inline int +popcount(uint64_t src, int sz) +{ + int cnt = 0; + + for (int i = 0; i < sz; ++i) { + if (src & 1) + ++cnt; + src >>= 1; + } + + return cnt; +} + +inline int +firstbit(uint64_t src, int sz) +{ + int i; + + for (i = 0; i < sz; ++i) { + if (src & 1) + break; + src >>= 1; + } + + return i; +} + +inline int +lastbit(uint64_t src, int sz) +{ + int i0 = -1; + + for (int i = 0; i < sz; ++i) { + if (src & 1) + i0 = i; + src >>= 1; + } + + return i0; +} + +inline int +signbit(uint64_t src, int sz) +{ + int i0 = -1; + + if (src & (1 << (sz - 1))) { + for (int i = 0; i < sz - 1; ++i) { + if (!(src & 1)) + i0 = i; + src >>= 1; + } + } else { + for (int i = 0; i < sz - 1; ++i) { + if (src & 1) + i0 = i; + src >>= 1; + } + } + + return i0; +} + +inline uint64_t +bitrev(uint64_t src, int sz) +{ + uint64_t r = 0; + + for (int i = 0; i < sz; ++i) { + r <<= 1; + if (src & 1) + r |= 1; + src >>= 1; + } + + return r; +} + +inline uint64_t +mul_hi(uint32_t a, uint32_t b) +{ + return ((uint64_t)a * (uint64_t)b) >> 32; +} + +inline uint64_t +mul_hi(int32_t a, int32_t b) +{ + return ((int64_t)a * (int64_t)b) >> 32; +} + +inline uint64_t +mul_hi(uint64_t a, uint64_t b) +{ + return ((uint64_t)a * (uint64_t)b) >> 32; +} + +inline uint64_t +mul_hi(int64_t a, int64_t b) +{ + return ((int64_t)a * (int64_t)b) >> 32; +} + +inline uint64_t +mul_hi(double a, double b) +{ + return 0; +} + +class Label +{ + public: + std::string name; + int value; + + Label() : value(-1) + { + } + + bool defined() { return value != -1; } + + void + 
checkName(std::string &_name) + { + if (name.empty()) { + name = _name; + } else { + assert(name == _name); + } + } + + void + define(std::string &_name, int _value) + { + assert(!defined()); + assert(_value != -1); + value = _value; + checkName(_name); + } + + int + get() + { + assert(defined()); + return value; + } +}; + +class LabelMap +{ + std::map map; + + public: + LabelMap() { } + + void addLabel(const Brig::BrigDirectiveLabel *lbl, int inst_index, + const BrigObject *obj); + + Label *refLabel(const Brig::BrigDirectiveLabel *lbl, + const BrigObject *obj); +}; + +const int NumSegments = Brig::BRIG_SEGMENT_AMD_GCN; + +extern const char *segmentNames[]; + +class StorageElement +{ + public: + std::string name; + uint64_t offset; + + uint64_t size; + const Brig::BrigDirectiveVariable *brigSymbol; + StorageElement(const char *_name, uint64_t _offset, int _size, + const Brig::BrigDirectiveVariable *sym) + : name(_name), offset(_offset), size(_size), brigSymbol(sym) + { + } +}; + +class StorageSpace +{ + typedef std::map + DirVarToSE_map; + + std::list elements; + AddrRangeMap elements_by_addr; + DirVarToSE_map elements_by_brigptr; + + uint64_t nextOffset; + Brig::BrigSegment segment; + + public: + StorageSpace(Brig::BrigSegment _class) + : nextOffset(0), segment(_class) + { + } + + StorageElement *addSymbol(const Brig::BrigDirectiveVariable *sym, + const BrigObject *obj); + + StorageElement* findSymbol(std::string name); + StorageElement* findSymbol(uint64_t addr); + StorageElement* findSymbol(const Brig::BrigDirectiveVariable *brigptr); + + int getSize() { return nextOffset; } + void resetOffset() { nextOffset = 0; } +}; + +class StorageMap +{ + StorageMap *outerScopeMap; + StorageSpace *space[NumSegments]; + + public: + StorageMap(StorageMap *outerScope = nullptr); + + StorageElement *addSymbol(const Brig::BrigDirectiveVariable *sym, + const BrigObject *obj); + + StorageElement* findSymbol(Brig::BrigSegment segment, std::string name); + StorageElement* 
findSymbol(Brig::BrigSegment segment, uint64_t addr); + + StorageElement* findSymbol(Brig::BrigSegment segment, + const Brig::BrigDirectiveVariable *brigptr); + + // overloaded version to avoid casting + StorageElement* + findSymbol(Brig::BrigSegment8_t segment, std::string name) + { + return findSymbol((Brig::BrigSegment)segment, name); + } + + int getSize(Brig::BrigSegment segment); + void resetOffset(Brig::BrigSegment segment); +}; + +typedef enum +{ + BT_DEFAULT, + BT_B8, + BT_U8, + BT_U16, + BT_U32, + BT_U64, + BT_S8, + BT_S16, + BT_S32, + BT_S64, + BT_F16, + BT_F32, + BT_F64, + BT_NULL +} base_type_e; + +/* @class HsailCode + * the HsailCode class is used to store information + * about HSA kernels stored in the BRIG format. it holds + * all information about a kernel, function, or variable + * symbol and provides methods for accessing that + * information. + */ + +class HsailCode final : public HsaCode +{ + public: + TheGpuISA::Decoder decoder; + + StorageMap *storageMap; + LabelMap labelMap; + uint32_t kernarg_start; + uint32_t kernarg_end; + int32_t private_size; + + int32_t readonly_size; + + // We track the maximum register index used for each register + // class when we load the code so we can size the register files + // appropriately (i.e., one more than the max index). 
+ uint32_t max_creg; // maximum c-register index + uint32_t max_sreg; // maximum s-register index + uint32_t max_dreg; // maximum d-register index + + HsailCode(const std::string &name_str, + const Brig::BrigDirectiveExecutable *code_dir, + const BrigObject *obj, + StorageMap *objStorageMap); + + // this version is used to create a placeholder when + // we encounter a kernel-related directive before the + // kernel itself + HsailCode(const std::string &name_str); + + void init(const Brig::BrigDirectiveExecutable *code_dir, + const BrigObject *obj, StorageMap *objStorageMap); + + void + generateHsaKernelInfo(HsaKernelInfo *hsaKernelInfo) const + { + hsaKernelInfo->sRegCount = max_sreg + 1; + hsaKernelInfo->dRegCount = max_dreg + 1; + hsaKernelInfo->cRegCount = max_creg + 1; + + hsaKernelInfo->static_lds_size = getSize(Brig::BRIG_SEGMENT_GROUP); + + hsaKernelInfo->private_mem_size = + roundUp(getSize(Brig::BRIG_SEGMENT_PRIVATE), 8); + + hsaKernelInfo->spill_mem_size = + roundUp(getSize(Brig::BRIG_SEGMENT_SPILL), 8); + } + + int + getSize(MemorySegment segment) const + { + Brig::BrigSegment brigSeg; + + switch (segment) { + case MemorySegment::NONE: + brigSeg = Brig::BRIG_SEGMENT_NONE; + break; + case MemorySegment::FLAT: + brigSeg = Brig::BRIG_SEGMENT_FLAT; + break; + case MemorySegment::GLOBAL: + brigSeg = Brig::BRIG_SEGMENT_GLOBAL; + break; + case MemorySegment::READONLY: + brigSeg = Brig::BRIG_SEGMENT_READONLY; + break; + case MemorySegment::KERNARG: + brigSeg = Brig::BRIG_SEGMENT_KERNARG; + break; + case MemorySegment::GROUP: + brigSeg = Brig::BRIG_SEGMENT_GROUP; + break; + case MemorySegment::PRIVATE: + brigSeg = Brig::BRIG_SEGMENT_PRIVATE; + break; + case MemorySegment::SPILL: + brigSeg = Brig::BRIG_SEGMENT_SPILL; + break; + case MemorySegment::ARG: + brigSeg = Brig::BRIG_SEGMENT_ARG; + break; + case MemorySegment::EXTSPACE0: + brigSeg = Brig::BRIG_SEGMENT_AMD_GCN; + break; + default: + fatal("Unknown BrigSegment type.\n"); + } + + return getSize(brigSeg); + } 
+ + private: + int + getSize(Brig::BrigSegment segment) const + { + if (segment == Brig::BRIG_SEGMENT_PRIVATE) { + // with the code generated by new HSA compiler the assertion + // does not hold anymore.. + //assert(private_size != -1); + return private_size; + } else { + return storageMap->getSize(segment); + } + } + + public: + StorageElement* + findSymbol(Brig::BrigSegment segment, uint64_t addr) + { + return storageMap->findSymbol(segment, addr); + } + + void + setPrivateSize(int32_t _private_size) + { + private_size = _private_size; + } + + Label* + refLabel(const Brig::BrigDirectiveLabel *lbl, const BrigObject *obj) + { + return labelMap.refLabel(lbl, obj); + } +}; + +#endif // __HSAIL_CODE_HH__ diff --git a/src/gpu-compute/kernel_cfg.cc b/src/gpu-compute/kernel_cfg.cc new file mode 100644 index 000000000..7e0e10912 --- /dev/null +++ b/src/gpu-compute/kernel_cfg.cc @@ -0,0 +1,296 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#include "gpu-compute/kernel_cfg.hh" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gpu-compute/gpu_static_inst.hh" + +void +ControlFlowInfo::assignImmediatePostDominators( + const std::vector& instructions) +{ + ControlFlowInfo cfg(instructions); + cfg.findImmediatePostDominators(); +} + + +ControlFlowInfo::ControlFlowInfo(const std::vector& insts) : + instructions(insts) +{ + createBasicBlocks(); + connectBasicBlocks(); +} + +BasicBlock* +ControlFlowInfo::basicBlock(int inst_num) const { + for (auto& block: basicBlocks) { + int first_block_id = block->firstInstruction->instNum(); + if (inst_num >= first_block_id && + inst_num < first_block_id + block->size) { + return block.get(); + } + } + return nullptr; +} + + +GPUStaticInst* +ControlFlowInfo::lastInstruction(const BasicBlock* block) const +{ + if (block->isExit()) { + return nullptr; + } + + return instructions.at(block->firstInstruction->instNum() + + block->size - 1); +} + +BasicBlock* +ControlFlowInfo::postDominator(const BasicBlock* block) const +{ + if (block->isExit()) { + return nullptr; + } + return 
basicBlock(lastInstruction(block)->ipdInstNum()); +} + +void +ControlFlowInfo::createBasicBlocks() +{ + assert(!instructions.empty()); + std::set leaders; + // first instruction is a leader + leaders.insert(0); + for (int i = 1; i < instructions.size(); i++) { + GPUStaticInst* instruction = instructions[i]; + if (instruction->o_type == Enums::OT_BRANCH) { + const int target_pc = instruction->getTargetPc(); + leaders.insert(target_pc); + leaders.insert(i + 1); + } + } + + size_t block_size = 0; + for (int i = 0; i < instructions.size(); i++) { + if (leaders.find(i) != leaders.end()) { + uint32_t id = basicBlocks.size(); + if (id > 0) { + basicBlocks.back()->size = block_size; + } + block_size = 0; + basicBlocks.emplace_back(new BasicBlock(id, instructions[i])); + } + block_size++; + } + basicBlocks.back()->size = block_size; + // exit basic block + basicBlocks.emplace_back(new BasicBlock(basicBlocks.size(), nullptr)); +} + +void +ControlFlowInfo::connectBasicBlocks() +{ + BasicBlock* exit_bb = basicBlocks.back().get(); + for (auto& bb : basicBlocks) { + if (bb->isExit()) { + break; + } + GPUStaticInst* last = lastInstruction(bb.get()); + if (last->o_type == Enums::OT_RET) { + bb->successorIds.insert(exit_bb->id); + break; + } + if (last->o_type == Enums::OT_BRANCH) { + const uint32_t target_pc = last->getTargetPc(); + BasicBlock* target_bb = basicBlock(target_pc); + bb->successorIds.insert(target_bb->id); + } + + // Unconditional jump instructions have a unique successor + if (!last->unconditionalJumpInstruction()) { + BasicBlock* next_bb = basicBlock(last->instNum() + 1); + bb->successorIds.insert(next_bb->id); + } + } +} + + +// In-place set intersection +static void +intersect(std::set& a, const std::set& b) +{ + std::set::iterator it = a.begin(); + while (it != a.end()) { + it = b.find(*it) != b.end() ? 
++it : a.erase(it); + } +} + + +void +ControlFlowInfo::findPostDominators() +{ + // the only postdominator of the exit block is itself + basicBlocks.back()->postDominatorIds.insert(basicBlocks.back()->id); + //copy all basic blocks to all postdominator lists except for exit block + for (auto& block : basicBlocks) { + if (!block->isExit()) { + for (uint32_t i = 0; i < basicBlocks.size(); i++) { + block->postDominatorIds.insert(i); + } + } + } + + bool change = true; + while (change) { + change = false; + for (int h = basicBlocks.size() - 2; h >= 0; --h) { + size_t num_postdominators = + basicBlocks[h]->postDominatorIds.size(); + for (int s : basicBlocks[h]->successorIds) { + intersect(basicBlocks[h]->postDominatorIds, + basicBlocks[s]->postDominatorIds); + } + basicBlocks[h]->postDominatorIds.insert(h); + change |= (num_postdominators + != basicBlocks[h]->postDominatorIds.size()); + } + } +} + + +// In-place set difference +static void +setDifference(std::set&a, + const std::set& b, uint32_t exception) +{ + for (uint32_t b_elem : b) { + if (b_elem != exception) { + a.erase(b_elem); + } + } +} + +void +ControlFlowInfo::findImmediatePostDominators() +{ + assert(basicBlocks.size() > 1); // Entry and exit blocks must be present + + findPostDominators(); + + for (auto& basicBlock : basicBlocks) { + if (basicBlock->isExit()) { + continue; + } + std::set candidates = basicBlock->postDominatorIds; + candidates.erase(basicBlock->id); + for (uint32_t postDominatorId : basicBlock->postDominatorIds) { + if (postDominatorId != basicBlock->id) { + setDifference(candidates, + basicBlocks[postDominatorId]->postDominatorIds, + postDominatorId); + } + } + assert(candidates.size() == 1); + GPUStaticInst* last_instruction = lastInstruction(basicBlock.get()); + BasicBlock* ipd_block = basicBlocks[*(candidates.begin())].get(); + if (!ipd_block->isExit()) { + GPUStaticInst* ipd_first_inst = ipd_block->firstInstruction; + last_instruction->ipdInstNum(ipd_first_inst->instNum()); + } else { 
+ last_instruction->ipdInstNum(last_instruction->instNum() + 1); + } + } +} + +void +ControlFlowInfo::printPostDominators() const +{ + for (auto& block : basicBlocks) { + std::cout << "PD(" << block->id << ") = {"; + std::copy(block->postDominatorIds.begin(), + block->postDominatorIds.end(), + std::ostream_iterator(std::cout, ", ")); + std::cout << "}" << std::endl; + } +} + +void +ControlFlowInfo::printImmediatePostDominators() const +{ + for (const auto& block : basicBlocks) { + if (block->isExit()) { + continue; + } + std::cout << "IPD(" << block->id << ") = "; + std::cout << postDominator(block.get())->id << ", "; + } + std::cout << std::endl; +} +void +ControlFlowInfo::printBasicBlocks() const +{ + for (GPUStaticInst* inst : instructions) { + int inst_num = inst->instNum(); + std::cout << inst_num << " [" << basicBlock(inst_num)->id + << "]: " << inst->disassemble(); + if (inst->o_type == Enums::OT_BRANCH) { + std::cout << ", PC = " << inst->getTargetPc(); + } + std::cout << std::endl; + } +} + +void +ControlFlowInfo::printBasicBlockDot() const +{ + printf("digraph {\n"); + for (const auto& basic_block : basicBlocks) { + printf("\t"); + for (uint32_t successorId : basic_block->successorIds) { + printf("%d -> %d; ", basic_block->id, successorId); + } + printf("\n"); + } + printf("}\n"); +} diff --git a/src/gpu-compute/kernel_cfg.hh b/src/gpu-compute/kernel_cfg.hh new file mode 100644 index 000000000..74ea861d8 --- /dev/null +++ b/src/gpu-compute/kernel_cfg.hh @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#ifndef __KERNEL_CFG_HH__ +#define __KERNEL_CFG_HH__ + +#include +#include +#include +#include +#include + + +class GPUStaticInst; +class HsailCode; + +struct BasicBlock +{ + BasicBlock(uint32_t num, GPUStaticInst* begin) : + id(num), size(0), firstInstruction(begin) + { + } + + bool + isEntry() const + { + return !id; + } + + bool + isExit() const + { + return !size; + } + + /** + * Unique identifier for the block within a given kernel. + */ + const uint32_t id; + + /** + * Number of instructions contained in the block + */ + size_t size; + + /** + * Pointer to first instruction of the block. + */ + GPUStaticInst* firstInstruction; + + /** + * Identifiers of the blocks that follow (are reachable from) this block. 
+ */ + std::set successorIds; + + /** + * Identifiers of the blocks that will be visited from this block. + */ + std::set postDominatorIds; +}; + +class ControlFlowInfo +{ +public: + + /** + * Compute immediate post-dominator instruction for kernel instructions. + */ + static void assignImmediatePostDominators( + const std::vector& instructions); + +private: + ControlFlowInfo(const std::vector& instructions); + + GPUStaticInst* lastInstruction(const BasicBlock* block) const; + + BasicBlock* basicBlock(int inst_num) const; + + BasicBlock* postDominator(const BasicBlock* block) const; + + void createBasicBlocks(); + + void connectBasicBlocks(); + + void findPostDominators(); + + void findImmediatePostDominators(); + + void printBasicBlocks() const; + + void printBasicBlockDot() const; + + void printPostDominators() const; + + void printImmediatePostDominators() const; + + std::vector> basicBlocks; + std::vector instructions; +}; + +#endif // __KERNEL_CFG_HH__ diff --git a/src/gpu-compute/lds_state.cc b/src/gpu-compute/lds_state.cc new file mode 100644 index 000000000..91ee8009a --- /dev/null +++ b/src/gpu-compute/lds_state.cc @@ -0,0 +1,341 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos, Joe Gross + */ + +#include "gpu-compute/lds_state.hh" + +#include +#include +#include + +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/shader.hh" + +/** + * the default constructor that works with SWIG + */ +LdsState::LdsState(const Params *params) : + MemObject(params), + tickEvent(this), + cuPort(name() + ".port", this), + maximumSize(params->size), + range(params->range), + bankConflictPenalty(params->bankConflictPenalty), + banks(params->banks) +{ + fatal_if(params->banks <= 0, + "Number of LDS banks should be positive number"); + fatal_if((params->banks & (params->banks - 1)) != 0, + "Number of LDS banks should be a power of 2"); + fatal_if(params->size <= 0, + "cannot allocate an LDS with a size less than 1"); + fatal_if(params->size % 2, + "the LDS should be an even number"); +} + +/** + * Needed by the SWIG compiler + */ +LdsState * +LdsStateParams::create() +{ + return new 
LdsState(this);
+}
+
+/**
+ * set the parent and name based on the parent
+ */
+void
+LdsState::setParent(ComputeUnit *x_parent)
+{
+    // check that this gets assigned to the same thing each time
+    fatal_if(!x_parent, "x_parent should not be nullptr");
+    fatal_if(x_parent == parent,
+             "should not be setting the parent twice");
+
+    parent = x_parent;
+    _name = x_parent->name() + ".LdsState";
+}
+
+/**
+ * derive the gpu mem packet from the packet and then count the bank conflicts
+ */
+unsigned
+LdsState::countBankConflicts(PacketPtr packet, unsigned *bankAccesses)
+{
+    Packet::SenderState *baseSenderState = packet->senderState;
+    while (baseSenderState->predecessor) {
+        baseSenderState = baseSenderState->predecessor;
+    }
+    const ComputeUnit::LDSPort::SenderState *senderState =
+        dynamic_cast<ComputeUnit::LDSPort::SenderState *>(baseSenderState);
+
+    fatal_if(!senderState,
+             "did not get the right sort of sender state");
+
+    GPUDynInstPtr gpuDynInst = senderState->getMemInst();
+
+    return countBankConflicts(gpuDynInst, bankAccesses);
+}
+
+// Count the total number of bank conflicts for the local memory packet
+unsigned
+LdsState::countBankConflicts(GPUDynInstPtr gpuDynInst,
+                             unsigned *numBankAccesses)
+{
+    int bank_conflicts = 0;
+    std::vector<int> bank;
+    // the number of LDS banks being touched by the memory instruction
+    int numBanks = std::min(parent->wfSize(), banks);
+    // if the wavefront size is larger than the number of LDS banks, we
+    // need to iterate over all work items to calculate the total
+    // number of bank conflicts
+    int groups = (parent->wfSize() > numBanks) ?
+        (parent->wfSize() / numBanks) : 1;
+    for (int i = 0; i < groups; i++) {
+        // Address Array holding all the work item addresses of an instruction
+        std::vector<Addr> addr_array;
+        addr_array.resize(numBanks, 0);
+        bank.clear();
+        bank.resize(banks, 0);
+        int max_bank = 0;
+
+        // populate the address array for all active work items
+        for (int j = 0; j < numBanks; j++) {
+            if (gpuDynInst->exec_mask[(i*numBanks)+j]) {
+                addr_array[j] = gpuDynInst->addr[(i*numBanks)+j];
+            } else {
+                addr_array[j] = std::numeric_limits<Addr>::max();
+            }
+        }
+
+        if (gpuDynInst->m_op == Enums::MO_LD ||
+            gpuDynInst->m_op == Enums::MO_ST) {
+            // mask identical addresses
+            for (int j = 0; j < numBanks; ++j) {
+                for (int j0 = 0; j0 < j; j0++) {
+                    if (addr_array[j] != std::numeric_limits<Addr>::max()
+                                    && addr_array[j] == addr_array[j0]) {
+                        addr_array[j] = std::numeric_limits<Addr>::max();
+                    }
+                }
+            }
+        }
+        // calculate bank conflicts
+        for (int j = 0; j < numBanks; ++j) {
+            if (addr_array[j] != std::numeric_limits<Addr>::max()) {
+                int bankId = addr_array[j] % banks;
+                bank[bankId]++;
+                max_bank = std::max(max_bank, bank[bankId]);
+                // Count the number of LDS banks accessed.
+                // Since we have masked identical addresses all remaining
+                // accesses will need to be serialized if they access
+                // the same bank (bank conflict).
+                (*numBankAccesses)++;
+            }
+        }
+        bank_conflicts += max_bank;
+    }
+    panic_if(bank_conflicts > parent->wfSize(),
+             "Max bank conflicts should match num of work items per instr");
+    return bank_conflicts;
+}
+
+/**
+ * receive the packet from the CU
+ */
+bool
+LdsState::CuSidePort::recvTimingReq(PacketPtr packet)
+{
+    return ownerLds->processPacket(packet);
+}
+
+GPUDynInstPtr
+LdsState::getDynInstr(PacketPtr packet)
+{
+    ComputeUnit::LDSPort::SenderState *ss =
+        dynamic_cast<ComputeUnit::LDSPort::SenderState *>(
+                     packet->senderState);
+    return ss->getMemInst();
+}
+
+/**
+ * process an incoming packet, add it to the return queue
+ */
+bool
+LdsState::processPacket(PacketPtr packet)
+{
+    unsigned bankAccesses = 0;
+    // the number of conflicts this packet will have when accessing the LDS
+    unsigned bankConflicts = countBankConflicts(packet, &bankAccesses);
+    // count the total number of physical LDS bank accessed
+    parent->ldsBankAccesses += bankAccesses;
+    // count the LDS bank conflicts. A number set to 1 indicates one
+    // access per bank maximum so there are no bank conflicts
+    parent->ldsBankConflictDist.sample(bankConflicts-1);
+
+    GPUDynInstPtr dynInst = getDynInstr(packet);
+    // account for the LDS bank conflict overhead
+    int busLength = (dynInst->m_op == Enums::MO_LD) ? parent->loadBusLength() :
+        (dynInst->m_op == Enums::MO_ST) ? parent->storeBusLength() :
+        parent->loadBusLength();
+    // delay for accessing the LDS
+    Tick processingTime =
+        parent->shader->ticks(bankConflicts * bankConflictPenalty) +
+        parent->shader->ticks(busLength);
+    // choose (delay + last packet in queue) or (now + delay) as the time to
+    // return this
+    Tick doneAt = earliestReturnTime() + processingTime;
+    // then store it for processing
+    return returnQueuePush(std::make_pair(doneAt, packet));
+}
+
+/**
+ * add this to the queue of packets to be returned
+ */
+bool
+LdsState::returnQueuePush(std::pair<Tick, PacketPtr> thePair)
+{
+    // TODO add time limits (e.g.
one packet per cycle) and queue size limits + // and implement flow control + returnQueue.push(thePair); + + // if there is no set wakeup time, look through the queue + if (!tickEvent.scheduled()) { + process(); + } + + return true; +} + +/** + * receive a packet in functional mode + */ +void +LdsState::CuSidePort::recvFunctional(PacketPtr pkt) +{ + fatal("not implemented"); +} + +/** + * receive a retry for a response + */ +void +LdsState::CuSidePort::recvRespRetry() +{ + // TODO verify that this is the right way to do this + assert(ownerLds->isRetryResp()); + ownerLds->setRetryResp(false); + ownerLds->process(); +} + +/** + * receive a retry + */ +void +LdsState::CuSidePort::recvRetry() +{ + fatal("not implemented"); +} + +/** + * look for packets to return at this time + */ +bool +LdsState::process() +{ + Tick now = clockEdge(); + + // send back completed packets + while (!returnQueue.empty() && returnQueue.front().first <= now) { + PacketPtr packet = returnQueue.front().second; + + ComputeUnit::LDSPort::SenderState *ss = + dynamic_cast( + packet->senderState); + + GPUDynInstPtr gpuDynInst = ss->getMemInst(); + + gpuDynInst->initiateAcc(gpuDynInst); + + packet->makeTimingResponse(); + + returnQueue.pop(); + + bool success = cuPort.sendTimingResp(packet); + + if (!success) { + retryResp = true; + panic("have not handled timing responses being NACK'd when sent" + "back"); + } + } + + // determine the next wakeup time + if (!returnQueue.empty()) { + + Tick next = returnQueue.front().first; + + if (tickEvent.scheduled()) { + + if (next < tickEvent.when()) { + + tickEvent.deschedule(); + tickEvent.schedule(next); + } + } else { + tickEvent.schedule(next); + } + } + + return true; +} + +/** + * wake up at this time and perform specified actions + */ +void +LdsState::TickEvent::process() +{ + ldsState->process(); +} + +/** + * + */ +void +LdsState::regStats() +{ +} diff --git a/src/gpu-compute/lds_state.hh b/src/gpu-compute/lds_state.hh new file mode 100644 index 
000000000..89f08a1d3 --- /dev/null +++ b/src/gpu-compute/lds_state.hh @@ -0,0 +1,512 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ *
+ * Author: John Kalamatianos, Joe Gross
+ */
+
+#ifndef __LDS_STATE_HH__
+#define __LDS_STATE_HH__
+
+#include <array>
+#include <queue>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "enums/MemOpType.hh"
+#include "enums/MemType.hh"
+#include "gpu-compute/misc.hh"
+#include "mem/mem_object.hh"
+#include "mem/port.hh"
+#include "params/LdsState.hh"
+
+class ComputeUnit;
+
+/**
+ * this represents a slice of the overall LDS, intended to be associated with an
+ * individual workgroup
+ */
+class LdsChunk
+{
+  public:
+    LdsChunk(const uint32_t x_size):
+        chunk(x_size)
+    {
+    }
+
+    LdsChunk() {}
+
+    /**
+     * a read operation
+     */
+    template<class T>
+    T
+    read(const uint32_t index)
+    {
+        fatal_if(!chunk.size(), "cannot read from an LDS chunk of size 0");
+        fatal_if(index >= chunk.size(), "out-of-bounds access to an LDS chunk");
+        T *p0 = (T *) (&(chunk.at(index)));
+        return *p0;
+    }
+
+    /**
+     * a write operation
+     */
+    template<class T>
+    void
+    write(const uint32_t index, const T value)
+    {
+        fatal_if(!chunk.size(), "cannot write to an LDS chunk of size 0");
+        fatal_if(index >= chunk.size(), "out-of-bounds access to an LDS chunk");
+        T *p0 = (T *) (&(chunk.at(index)));
+        *p0 = value;
+    }
+
+    /**
+     * get the size of this chunk
+     */
+    std::vector<uint8_t>::size_type
+    size() const
+    {
+        return chunk.size();
+    }
+
+  protected:
+    // the actual data store for this slice of the LDS
+    std::vector<uint8_t> chunk;
+};
+
+// Local Data Share (LDS) State per Wavefront (contents of the LDS region
+// allocated to the WorkGroup of this Wavefront)
+class LdsState: public MemObject
+{
+  protected:
+
+    /**
+     * an event to allow event-driven execution
+     */
+    class TickEvent: public Event
+    {
+      protected:
+
+        LdsState *ldsState = nullptr;
+
+        Tick nextTick = 0;
+
+      public:
+
+        TickEvent(LdsState *_ldsState) :
+            ldsState(_ldsState)
+        {
+        }
+
+        virtual void
+        process();
+
+        void
+        schedule(Tick when)
+        {
+            mainEventQueue[0]->schedule(this, when);
+        }
+
+        void
+        deschedule()
+        {
+            mainEventQueue[0]->deschedule(this);
+        }
+    };
+
+    
/** + * CuSidePort is the LDS Port closer to the CU side + */ + class CuSidePort: public SlavePort + { + public: + CuSidePort(const std::string &_name, LdsState *_ownerLds) : + SlavePort(_name, _ownerLds), ownerLds(_ownerLds) + { + } + + protected: + LdsState *ownerLds; + + virtual bool + recvTimingReq(PacketPtr pkt); + + virtual Tick + recvAtomic(PacketPtr pkt) + { + return 0; + } + + virtual void + recvFunctional(PacketPtr pkt); + + virtual void + recvRangeChange() + { + } + + virtual void + recvRetry(); + + virtual void + recvRespRetry(); + + virtual AddrRangeList + getAddrRanges() const + { + AddrRangeList ranges; + ranges.push_back(ownerLds->getAddrRange()); + return ranges; + } + + template + void + loadData(PacketPtr packet); + + template + void + storeData(PacketPtr packet); + + template + void + atomicOperation(PacketPtr packet); + }; + + protected: + + // the lds reference counter + // The key is the workgroup ID and dispatch ID + // The value is the number of wavefronts that reference this LDS, as + // wavefronts are launched, the counter goes up for that workgroup and when + // they return it decreases, once it reaches 0 then this chunk of the LDS is + // returned to the available pool. 
However,it is deallocated on the 1->0 + // transition, not whenever the counter is 0 as it always starts with 0 when + // the workgroup asks for space + std::unordered_map> refCounter; + + // the map that allows workgroups to access their own chunk of the LDS + std::unordered_map> chunkMap; + + // an event to allow the LDS to wake up at a specified time + TickEvent tickEvent; + + // the queue of packets that are going back to the CU after a + // read/write/atomic op + // TODO need to make this have a maximum size to create flow control + std::queue> returnQueue; + + // whether or not there are pending responses + bool retryResp = false; + + bool + process(); + + GPUDynInstPtr + getDynInstr(PacketPtr packet); + + bool + processPacket(PacketPtr packet); + + unsigned + countBankConflicts(PacketPtr packet, unsigned *bankAccesses); + + unsigned + countBankConflicts(GPUDynInstPtr gpuDynInst, + unsigned *numBankAccesses); + + public: + typedef LdsStateParams Params; + + LdsState(const Params *params); + + // prevent copy construction + LdsState(const LdsState&) = delete; + + ~LdsState() + { + parent = nullptr; + } + + const Params * + params() const + { + return dynamic_cast(_params); + } + + bool + isRetryResp() const + { + return retryResp; + } + + void + setRetryResp(const bool value) + { + retryResp = value; + } + + // prevent assignment + LdsState & + operator=(const LdsState &) = delete; + + /** + * use the dynamic wave id to create or just increase the reference count + */ + int + increaseRefCounter(const uint32_t dispatchId, const uint32_t wgId) + { + int refCount = getRefCounter(dispatchId, wgId); + fatal_if(refCount < 0, + "reference count should not be below zero"); + return ++refCounter[dispatchId][wgId]; + } + + /** + * decrease the reference count after making sure it is in the list + * give back this chunk if the ref counter has reached 0 + */ + int + decreaseRefCounter(const uint32_t dispatchId, const uint32_t wgId) + { + int refCount = 
getRefCounter(dispatchId, wgId); + + fatal_if(refCount <= 0, + "reference count should not be below zero or at zero to" + "decrement"); + + refCounter[dispatchId][wgId]--; + + if (refCounter[dispatchId][wgId] == 0) { + releaseSpace(dispatchId, wgId); + return 0; + } else { + return refCounter[dispatchId][wgId]; + } + } + + /** + * return the current reference count for this workgroup id + */ + int + getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const + { + auto dispatchIter = chunkMap.find(dispatchId); + fatal_if(dispatchIter == chunkMap.end(), + "could not locate this dispatch id [%d]", dispatchId); + + auto workgroup = dispatchIter->second.find(wgId); + fatal_if(workgroup == dispatchIter->second.end(), + "could not find this workgroup id within this dispatch id" + " did[%d] wgid[%d]", dispatchId, wgId); + + auto refCountIter = refCounter.find(dispatchId); + if (refCountIter == refCounter.end()) { + fatal("could not locate this dispatch id [%d]", dispatchId); + } else { + auto workgroup = refCountIter->second.find(wgId); + if (workgroup == refCountIter->second.end()) { + fatal("could not find this workgroup id within this dispatch id" + " did[%d] wgid[%d]", dispatchId, wgId); + } else { + return refCounter.at(dispatchId).at(wgId); + } + } + + fatal("should not reach this point"); + return 0; + } + + /** + * assign a parent and request this amount of space be set aside + * for this wgid + */ + LdsChunk * + reserveSpace(const uint32_t dispatchId, const uint32_t wgId, + const uint32_t size) + { + if (chunkMap.find(dispatchId) != chunkMap.end()) { + fatal_if( + chunkMap[dispatchId].find(wgId) != chunkMap[dispatchId].end(), + "duplicate workgroup ID asking for space in the LDS " + "did[%d] wgid[%d]", dispatchId, wgId); + } + + fatal_if(bytesAllocated + size > maximumSize, + "request would ask for more space than is available"); + + bytesAllocated += size; + + chunkMap[dispatchId].emplace(wgId, LdsChunk(size)); + // make an entry for this workgroup + 
refCounter[dispatchId][wgId] = 0; + + return &chunkMap[dispatchId][wgId]; + } + + bool + returnQueuePush(std::pair thePair); + + Tick + earliestReturnTime() const + { + // TODO set to max(lastCommand+1, curTick()) + return returnQueue.empty() ? curTick() : returnQueue.back().first; + } + + void + setParent(ComputeUnit *x_parent); + + void + regStats(); + + // accessors + ComputeUnit * + getParent() const + { + return parent; + } + + std::string + getName() + { + return _name; + } + + int + getBanks() const + { + return banks; + } + + ComputeUnit * + getComputeUnit() const + { + return parent; + } + + int + getBankConflictPenalty() const + { + return bankConflictPenalty; + } + + /** + * get the allocated size for this workgroup + */ + std::size_t + ldsSize(const uint32_t x_wgId) + { + return chunkMap[x_wgId].size(); + } + + AddrRange + getAddrRange() const + { + return range; + } + + virtual BaseSlavePort & + getSlavePort(const std::string& if_name, PortID idx) + { + if (if_name == "cuPort") { + // TODO need to set name dynamically at this point? + return cuPort; + } else { + fatal("cannot resolve the port name " + if_name); + } + } + + /** + * can this much space be reserved for a workgroup? 
+ */ + bool + canReserve(uint32_t x_size) const + { + return bytesAllocated + x_size <= maximumSize; + } + + private: + /** + * give back the space + */ + bool + releaseSpace(const uint32_t x_dispatchId, const uint32_t x_wgId) + { + auto dispatchIter = chunkMap.find(x_dispatchId); + + if (dispatchIter == chunkMap.end()) { + fatal("dispatch id not found [%d]", x_dispatchId); + } else { + auto workgroupIter = dispatchIter->second.find(x_wgId); + if (workgroupIter == dispatchIter->second.end()) { + fatal("workgroup id [%d] not found in dispatch id [%d]", + x_wgId, x_dispatchId); + } + } + + fatal_if(bytesAllocated < chunkMap[x_dispatchId][x_wgId].size(), + "releasing more space than was allocated"); + + bytesAllocated -= chunkMap[x_dispatchId][x_wgId].size(); + chunkMap[x_dispatchId].erase(chunkMap[x_dispatchId].find(x_wgId)); + return true; + } + + // the port that connects this LDS to its owner CU + CuSidePort cuPort; + + ComputeUnit* parent = nullptr; + + std::string _name; + + // the number of bytes currently reserved by all workgroups + int bytesAllocated = 0; + + // the size of the LDS, the most bytes available + int maximumSize; + + // Address range of this memory + AddrRange range; + + // the penalty, in cycles, for each LDS bank conflict + int bankConflictPenalty = 0; + + // the number of banks in the LDS underlying data store + int banks = 0; +}; + +#endif // __LDS_STATE_HH__ diff --git a/src/gpu-compute/local_memory_pipeline.cc b/src/gpu-compute/local_memory_pipeline.cc new file mode 100644 index 000000000..7f919c5f4 --- /dev/null +++ b/src/gpu-compute/local_memory_pipeline.cc @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#include "gpu-compute/local_memory_pipeline.hh"
+
+#include "debug/GPUPort.hh"
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/shader.hh"
+#include "gpu-compute/vector_register_file.hh"
+#include "gpu-compute/wavefront.hh"
+
+LocalMemPipeline::LocalMemPipeline(const ComputeUnitParams* p) :
+    computeUnit(nullptr), lmQueueSize(p->local_mem_queue_size)
+{
+}
+
+void
+LocalMemPipeline::init(ComputeUnit *cu)
+{
+    computeUnit = cu;
+    _name = computeUnit->name() + ".LocalMemPipeline";
+}
+
+void
+LocalMemPipeline::exec()
+{
+    // apply any returned shared (LDS) memory operations
+    GPUDynInstPtr m = !lmReturnedRequests.empty() ?
+        lmReturnedRequests.front() : nullptr;
+
+    bool accessVrf = true;
+    if ((m) && (m->m_op==Enums::MO_LD || MO_A(m->m_op))) {
+        Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId];
+
+        accessVrf =
+            w->computeUnit->vrf[m->simdId]->
+            vrfOperandAccessReady(m->seqNum(), w, m,
+                                  VrfAccessType::WRITE);
+    }
+
+    if (!lmReturnedRequests.empty() && m->latency.rdy() && accessVrf &&
+        computeUnit->locMemToVrfBus.rdy() && (computeUnit->shader->coissue_return
+        || computeUnit->wfWait.at(m->pipeId).rdy())) {
+        if (m->v_type == VT_32 && m->m_type == Enums::M_U8)
+            doSmReturn<uint32_t, uint8_t>(m);
+        else if (m->v_type == VT_32 && m->m_type == Enums::M_U16)
+            doSmReturn<uint32_t, uint16_t>(m);
+        else if (m->v_type == VT_32 && m->m_type == Enums::M_U32)
+            doSmReturn<uint32_t, uint32_t>(m);
+        else if (m->v_type == VT_32 && m->m_type == Enums::M_S8)
+            doSmReturn<uint32_t, int8_t>(m);
+        else if (m->v_type == VT_32 && m->m_type == Enums::M_S16)
+            doSmReturn<uint32_t, int16_t>(m);
+        else if (m->v_type == VT_32 && m->m_type == Enums::M_S32)
+            doSmReturn<uint32_t, int32_t>(m);
+        else if (m->v_type == VT_32 && m->m_type == Enums::M_F16)
+            doSmReturn<uint32_t, Float16>(m);
+        else if (m->v_type == VT_32 && m->m_type == Enums::M_F32)
+            doSmReturn<uint32_t, float>(m);
+        else if (m->v_type == VT_64 && m->m_type == Enums::M_U8)
+            doSmReturn<uint64_t, uint8_t>(m);
+        else if (m->v_type == VT_64 && m->m_type == Enums::M_U16)
+            doSmReturn<uint64_t, uint16_t>(m);
+        else if 
(m->v_type == VT_64 && m->m_type == Enums::M_U32)
+            doSmReturn<uint64_t, uint32_t>(m);
+        else if (m->v_type == VT_64 && m->m_type == Enums::M_U64)
+            doSmReturn<uint64_t, uint64_t>(m);
+        else if (m->v_type == VT_64 && m->m_type == Enums::M_S8)
+            doSmReturn<uint64_t, int8_t>(m);
+        else if (m->v_type == VT_64 && m->m_type == Enums::M_S16)
+            doSmReturn<uint64_t, int16_t>(m);
+        else if (m->v_type == VT_64 && m->m_type == Enums::M_S32)
+            doSmReturn<uint64_t, int32_t>(m);
+        else if (m->v_type == VT_64 && m->m_type == Enums::M_S64)
+            doSmReturn<uint64_t, int64_t>(m);
+        else if (m->v_type == VT_64 && m->m_type == Enums::M_F16)
+            doSmReturn<uint64_t, Float16>(m);
+        else if (m->v_type == VT_64 && m->m_type == Enums::M_F32)
+            doSmReturn<uint64_t, float>(m);
+        else if (m->v_type == VT_64 && m->m_type == Enums::M_F64)
+            doSmReturn<uint64_t, double>(m);
+    }
+
+    // If pipeline has executed a local memory instruction
+    // execute local memory packet and issue the packets
+    // to LDS
+    if (!lmIssuedRequests.empty() && lmReturnedRequests.size() < lmQueueSize) {
+
+        GPUDynInstPtr m = lmIssuedRequests.front();
+
+        bool returnVal = computeUnit->sendToLds(m);
+        if (!returnVal) {
+            DPRINTF(GPUPort, "packet was nack'd and put in retry queue");
+        }
+        lmIssuedRequests.pop();
+    }
+}
+
+template<typename c0, typename c1>
+void
+LocalMemPipeline::doSmReturn(GPUDynInstPtr m)
+{
+    lmReturnedRequests.pop();
+    Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId];
+
+    // Return data to registers
+    if (m->m_op == Enums::MO_LD || MO_A(m->m_op)) {
+        std::vector<uint32_t> regVec;
+        for (int k = 0; k < m->n_reg; ++k) {
+            int dst = m->dst_reg+k;
+
+            if (m->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST)
+                dst = m->dst_reg_vec[k];
+            // virtual->physical VGPR mapping
+            int physVgpr = w->remap(dst,sizeof(c0),1);
+            // save the physical VGPR index
+            regVec.push_back(physVgpr);
+            c1 *p1 = &((c1*)m->d_data)[k * VSZ];
+
+            for (int i = 0; i < VSZ; ++i) {
+                if (m->exec_mask[i]) {
+                    // write the value into the physical VGPR. This is a purely
+                    // functional operation. No timing is modeled.
+ w->computeUnit->vrf[w->simdId]->write(physVgpr, + *p1, i); + } + ++p1; + } + } + + // Schedule the write operation of the load data on the VRF. This simply + // models the timing aspect of the VRF write operation. It does not + // modify the physical VGPR. + loadVrfBankConflictCycles += + w->computeUnit->vrf[w->simdId]->exec(m->seqNum(), w, + regVec, sizeof(c0), m->time); + } + + // Decrement outstanding request count + computeUnit->shader->ScheduleAdd(&w->outstanding_reqs, m->time, -1); + + if (m->m_op == Enums::MO_ST || MO_A(m->m_op) || MO_ANR(m->m_op) + || MO_H(m->m_op)) { + computeUnit->shader->ScheduleAdd(&w->outstanding_reqs_wr_lm, + m->time, -1); + } + + if (m->m_op == Enums::MO_LD || MO_A(m->m_op) || MO_ANR(m->m_op)) { + computeUnit->shader->ScheduleAdd(&w->outstanding_reqs_rd_lm, + m->time, -1); + } + + // Mark write bus busy for appropriate amount of time + computeUnit->locMemToVrfBus.set(m->time); + if (computeUnit->shader->coissue_return == 0) + w->computeUnit->wfWait.at(m->pipeId).set(m->time); +} + +void +LocalMemPipeline::regStats() +{ + loadVrfBankConflictCycles + .name(name() + ".load_vrf_bank_conflict_cycles") + .desc("total number of cycles LDS data are delayed before updating " + "the VRF") + ; +} diff --git a/src/gpu-compute/local_memory_pipeline.hh b/src/gpu-compute/local_memory_pipeline.hh new file mode 100644 index 000000000..a63d867d0 --- /dev/null +++ b/src/gpu-compute/local_memory_pipeline.hh @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Sooraj Puthoor + */ + +#ifndef __LOCAL_MEMORY_PIPELINE_HH__ +#define __LOCAL_MEMORY_PIPELINE_HH__ + +#include +#include + +#include "gpu-compute/misc.hh" +#include "params/ComputeUnit.hh" +#include "sim/stats.hh" + +/* + * @file local_memory_pipeline.hh + * + * The local memory pipeline issues newly created local memory packets + * from pipeline to the LDS. This stage also retires previously issued + * loads and stores that have returned from the LDS. 
+ */ + +class ComputeUnit; +class Wavefront; + +class LocalMemPipeline +{ + public: + LocalMemPipeline(const ComputeUnitParams *params); + void init(ComputeUnit *cu); + void exec(); + + template void doSmReturn(GPUDynInstPtr m); + + std::queue &getLMReqFIFO() { return lmIssuedRequests; } + std::queue &getLMRespFIFO() { return lmReturnedRequests; } + + bool + isLMRespFIFOWrRdy() const + { + return lmReturnedRequests.size() < lmQueueSize; + } + + bool + isLMReqFIFOWrRdy(uint32_t pendReqs=0) const + { + return (lmIssuedRequests.size() + pendReqs) < lmQueueSize; + } + + const std::string& name() const { return _name; } + void regStats(); + + private: + ComputeUnit *computeUnit; + std::string _name; + int lmQueueSize; + Stats::Scalar loadVrfBankConflictCycles; + // Local Memory Request Fifo: all shared memory requests + // are issued to this FIFO from the memory pipelines + std::queue lmIssuedRequests; + + // Local Memory Response Fifo: all responses of shared memory + // requests are sent to this FIFO from LDS + std::queue lmReturnedRequests; +}; + +#endif // __LOCAL_MEMORY_PIPELINE_HH__ diff --git a/src/gpu-compute/misc.hh b/src/gpu-compute/misc.hh new file mode 100644 index 000000000..4f8032832 --- /dev/null +++ b/src/gpu-compute/misc.hh @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#ifndef __MISC_HH__ +#define __MISC_HH__ + +#include +#include + +#include "base/misc.hh" + +class GPUDynInst; + +// wavefront size of the machine +static const int VSZ = 64; + +/* + This check is necessary because std::bitset only provides conversion to + unsigned long or unsigned long long via to_ulong() or to_ullong(). there are + a few places in the code where to_ullong() is used, however if VSZ is larger + than a value the host can support then bitset will throw a runtime exception. + + we should remove all use of to_long() or to_ullong() so we can have VSZ + greater than 64b, however until that is done this assert is required. 
+ */ +static_assert(VSZ <= sizeof(unsigned long long) * 8, + "VSZ is larger than the host can support"); + +typedef std::bitset VectorMask; +typedef std::shared_ptr GPUDynInstPtr; + +class WaitClass +{ + public: + WaitClass() : nxtAvail(0), lookAheadAvail(0), tcnt(0) { } + void init(uint64_t *_tcnt, uint32_t _numStages=0) + { + tcnt = _tcnt; + numStages = _numStages; + } + + void set(uint32_t i) + { + fatal_if(nxtAvail > *tcnt, + "Can't allocate resource because it is busy!!!"); + nxtAvail = *tcnt + i; + } + void preset(uint32_t delay) + { + lookAheadAvail = std::max(lookAheadAvail, delay + (*tcnt) - numStages); + } + bool rdy() const { return *tcnt >= nxtAvail; } + bool prerdy() const { return *tcnt >= lookAheadAvail; } + + private: + // timestamp indicating when resource will be available + uint64_t nxtAvail; + // timestamp indicating when resource will be available including + // pending uses of the resource (when there is a cycle gap between + // rdy() and set() + uint64_t lookAheadAvail; + // current timestamp + uint64_t *tcnt; + // number of stages between checking if a resource is ready and + // setting the resource's utilization + uint32_t numStages; +}; + +class Float16 +{ + public: + uint16_t val; + + Float16() { val = 0; } + + Float16(const Float16 &x) : val(x.val) { } + + Float16(float x) + { + uint32_t ai = *(uint32_t *)&x; + + uint32_t s = (ai >> 31) & 0x1; + uint32_t exp = (ai >> 23) & 0xff; + uint32_t mant = (ai >> 0) & 0x7fffff; + + if (exp == 0 || exp <= 0x70) { + exp = 0; + mant = 0; + } else if (exp == 0xff) { + exp = 0x1f; + } else if (exp >= 0x8f) { + exp = 0x1f; + mant = 0; + } else { + exp = exp - 0x7f + 0x0f; + } + + mant = mant >> 13; + + val = 0; + val |= (s << 15); + val |= (exp << 10); + val |= (mant << 0); + } + + operator float() const + { + uint32_t s = (val >> 15) & 0x1; + uint32_t exp = (val >> 10) & 0x1f; + uint32_t mant = (val >> 0) & 0x3ff; + + if (!exp) { + exp = 0; + mant = 0; + } else if (exp == 0x1f) { + exp = 0xff; + } else 
{ + exp = exp - 0x0f + 0x7f; + } + + uint32_t val1 = 0; + val1 |= (s << 31); + val1 |= (exp << 23); + val1 |= (mant << 13); + + return *(float*)&val1; + } +}; + +#endif // __MISC_HH__ diff --git a/src/gpu-compute/ndrange.hh b/src/gpu-compute/ndrange.hh new file mode 100644 index 000000000..d1ad35d4b --- /dev/null +++ b/src/gpu-compute/ndrange.hh @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#ifndef __NDRANGE_HH__ +#define __NDRANGE_HH__ + +#include "base/types.hh" +#include "gpu-compute/qstruct.hh" + +struct NDRange +{ + // copy of the queue entry provided at dispatch + HsaQueueEntry q; + + // The current workgroup id (3 dimensions) + int wgId[3]; + // The number of workgroups in each dimension + int numWg[3]; + // The total number of workgroups + int numWgTotal; + + // The number of completed work groups + int numWgCompleted; + // The global workgroup ID + uint32_t globalWgId; + + // flag indicating whether all work groups have been launched + bool wg_disp_rem; + // kernel complete + bool execDone; + bool userDoorBellSet; + volatile bool *addrToNotify; + volatile uint32_t *numDispLeft; + int dispatchId; + int curTid; // Current thread id +}; + +#endif // __NDRANGE_HH__ diff --git a/src/gpu-compute/of_scheduling_policy.cc b/src/gpu-compute/of_scheduling_policy.cc new file mode 100644 index 000000000..7f114706a --- /dev/null +++ b/src/gpu-compute/of_scheduling_policy.cc @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Sooraj Puthoor + */ + +#include "gpu-compute/of_scheduling_policy.hh" + +#include "gpu-compute/wavefront.hh" + +Wavefront* +OFSchedulingPolicy::chooseWave() +{ + // Set when policy choose a wave to schedule + bool waveChosen = false; + Wavefront *selectedWave = nullptr; + int selectedWaveID = -1; + uint32_t selectedPosition = 0; + + for (int position = 0; position < scheduleList->size(); ++position) { + Wavefront *curWave = scheduleList->at(position); + uint32_t curWaveID = curWave->wfDynId; + + // Choosed wave with the lowest wave ID + if (selectedWaveID == -1 || curWaveID < selectedWaveID) { + waveChosen = true; + selectedWaveID = curWaveID; + selectedWave = curWave; + selectedPosition = position; + } + } + + // Check to make sure ready list had atleast one schedulable wave + if (waveChosen) { + scheduleList->erase(scheduleList->begin() + selectedPosition); + } else { + panic("Empty ready list"); + } + + return selectedWave; +} + +void +OFSchedulingPolicy::bindList(std::vector *list) +{ + scheduleList = list; +} diff --git a/src/gpu-compute/of_scheduling_policy.hh b/src/gpu-compute/of_scheduling_policy.hh new file mode 100644 index 000000000..684e51a3a --- /dev/null +++ b/src/gpu-compute/of_scheduling_policy.hh @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Sooraj Puthoor + */ + +#ifndef __OF_SCHEDULING_POLICY_HH__ +#define __OF_SCHEDULING_POLICY_HH__ + +#include +#include + +#include "base/misc.hh" + +class Wavefront; + +// Oldest First where age is marked by the wave id +class OFSchedulingPolicy +{ + public: + OFSchedulingPolicy() : scheduleList(nullptr) { } + + Wavefront* chooseWave(); + void bindList(std::vector *list); + + private: + // List of waves which are participating in scheduling. + // This scheduler selects the oldest wave from this list + std::vector *scheduleList; +}; + +#endif // __OF_SCHEDULING_POLICY_HH__ diff --git a/src/gpu-compute/pool_manager.cc b/src/gpu-compute/pool_manager.cc new file mode 100644 index 000000000..b1bc6b1f3 --- /dev/null +++ b/src/gpu-compute/pool_manager.cc @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: John Kalamatianos + */ + +#include "gpu-compute/pool_manager.hh" + +PoolManager::PoolManager(uint32_t minAlloc, uint32_t poolSize) + : _minAllocation(minAlloc), _poolSize(poolSize) +{ + assert(poolSize > 0); +} diff --git a/src/gpu-compute/pool_manager.hh b/src/gpu-compute/pool_manager.hh new file mode 100644 index 000000000..2cb53ce72 --- /dev/null +++ b/src/gpu-compute/pool_manager.hh @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos + */ + +#ifndef __POOL_MANAGER_HH__ +#define __POOL_MANAGER_HH__ + +#include +#include +#include + +// Pool Manager Logic +class PoolManager +{ + public: + PoolManager(uint32_t minAlloc, uint32_t poolSize); + uint32_t minAllocation() { return _minAllocation; } + virtual std::string printRegion() = 0; + virtual uint32_t regionSize(std::pair ®ion) = 0; + virtual bool canAllocate(uint32_t numRegions, uint32_t size) = 0; + + virtual uint32_t allocateRegion(const uint32_t size, + uint32_t *reserved) = 0; + + virtual void freeRegion(uint32_t firstIdx, uint32_t lastIdx) = 0; + uint32_t poolSize() { return _poolSize; } + + private: + // minimum size that can be reserved per allocation + uint32_t _minAllocation; + // pool size in number of elements + uint32_t _poolSize; +}; + +#endif // __POOL_MANAGER_HH__ diff --git a/src/gpu-compute/qstruct.hh b/src/gpu-compute/qstruct.hh new file mode 100644 index 000000000..092303c00 --- /dev/null +++ b/src/gpu-compute/qstruct.hh @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Brad Beckmann, Marc Orr + */ + +#ifndef __Q_STRUCT_HH__ +#define __Q_STRUCT_HH__ + +#include +#include + +// Maximum number of arguments +static const int KER_NUM_ARGS = 32; +// Kernel argument buffer size +static const int KER_ARGS_LENGTH = 512; + +class LdsChunk; +struct NDRange; + +// Be very careful of alignment in this structure. The structure +// must compile to the same layout in both 32-bit and 64-bit mode. 
+struct HsaQueueEntry +{ + // Base pointer for array of instruction pointers + uint64_t code_ptr; + // Grid Size (3 dimensions) + uint32_t gdSize[3]; + // Workgroup Size (3 dimensions) + uint32_t wgSize[3]; + uint16_t sRegCount; + uint16_t dRegCount; + uint16_t cRegCount; + uint64_t privMemStart; + uint32_t privMemPerItem; + uint32_t privMemTotal; + uint64_t spillMemStart; + uint32_t spillMemPerItem; + uint32_t spillMemTotal; + uint64_t roMemStart; + uint32_t roMemTotal; + // Size (in bytes) of LDS + uint32_t ldsSize; + // Virtual Memory Id (unused right now) + uint32_t vmId; + + // Pointer to dependency chain (unused now) + uint64_t depends; + + // pointer to bool + uint64_t addrToNotify; + // pointer to uint32_t + uint64_t numDispLeft; + + // variables to pass arguments when running in standalone mode, + // will be removed when run.py and sh.cpp have been updated to + // use args and offset arrays + uint64_t arg1; + uint64_t arg2; + uint64_t arg3; + uint64_t arg4; + + // variables to pass arguments when running in cpu+gpu mode + uint8_t args[KER_ARGS_LENGTH]; + uint16_t offsets[KER_NUM_ARGS]; + uint16_t num_args; +}; + +// State used to start (or restart) a WF +struct WFContext +{ + // 32 bit values + // barrier state + int bar_cnt[VSZ]; + + // id (which WF in the WG) + int cnt; + + // more barrier state + int max_bar_cnt; + int old_barrier_cnt; + int barrier_cnt; + + // More Program Counter Stuff + uint32_t pc; + + // Program counter of the immediate post-dominator instruction + uint32_t rpc; + + // WG wide state (I don't see how to avoid redundancy here) + int cu_id; + uint32_t wg_id; + uint32_t barrier_id; + + // 64 bit values (these values depend on the wavefront size) + // masks + uint64_t init_mask; + uint64_t exec_mask; + + // private memory; + Addr privBase; + Addr spillBase; + + LdsChunk *ldsChunk; + + /* + * Kernel wide state + * This is a hack. This state should be moved through simulated memory + * during a yield. 
Though not much is being used here, so it's probably + * not a big deal.
HSAQ_SIZE))) = *val; + * (*(int*)HSAQ_WI(n))++; + * } + * } + */ + +#endif // __Q_STRUCT_HH__ diff --git a/src/gpu-compute/rr_scheduling_policy.cc b/src/gpu-compute/rr_scheduling_policy.cc new file mode 100644 index 000000000..5d3591901 --- /dev/null +++ b/src/gpu-compute/rr_scheduling_policy.cc @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Sooraj Puthoor + */ + +#include "gpu-compute/rr_scheduling_policy.hh" + +#include "gpu-compute/wavefront.hh" + +Wavefront* +RRSchedulingPolicy::chooseWave() +{ + Wavefront *selectedWave = nullptr; + + // Check to make sure ready list had atleast one schedulable wave + if (scheduleList->size()) { + // For RR policy, select the wave which is at the + // front of the list. The selected wave is popped + // out from the schedule list immediately after selection + // to avoid starvation. It is the responsibility of the + // module invoking the RR scheduler to make surei scheduling + // eligible waves are added to the back of the schedule + // list + selectedWave = scheduleList->front(); + scheduleList->erase(scheduleList->begin() + 0); + } else { + panic("Empty ready list"); + } + + return selectedWave; +} + +void +RRSchedulingPolicy::bindList(std::vector *list) +{ + scheduleList = list; +} diff --git a/src/gpu-compute/rr_scheduling_policy.hh b/src/gpu-compute/rr_scheduling_policy.hh new file mode 100644 index 000000000..780f294aa --- /dev/null +++ b/src/gpu-compute/rr_scheduling_policy.hh @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Sooraj Puthoor + */ + +#ifndef __RR_SCHEDULING_POLICY_HH__ +#define __RR_SCHEDULING_POLICY_HH__ + +#include + +#include +#include +#include + +#include "base/misc.hh" + +class Wavefront; + +// Round-Robin pick among the list of ready waves +class RRSchedulingPolicy +{ + public: + RRSchedulingPolicy() : scheduleList(nullptr) { } + + Wavefront* chooseWave(); + void bindList(std::vector *list); + + private: + // List of waves which are participating in scheduling. + // This scheduler selects one wave from this list based on + // round robin policy + std::vector *scheduleList; +}; + +#endif // __RR_SCHEDULING_POLICY_HH__ diff --git a/src/gpu-compute/schedule_stage.cc b/src/gpu-compute/schedule_stage.cc new file mode 100644 index 000000000..068136026 --- /dev/null +++ b/src/gpu-compute/schedule_stage.cc @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Sooraj Puthoor + */ + +#include "gpu-compute/schedule_stage.hh" + +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_static_inst.hh" +#include "gpu-compute/vector_register_file.hh" +#include "gpu-compute/wavefront.hh" + +ScheduleStage::ScheduleStage(const ComputeUnitParams *p) + : numSIMDs(p->num_SIMDs), + numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes) +{ + for (int j = 0; j < numSIMDs + numMemUnits; ++j) { + Scheduler newScheduler(p); + scheduler.push_back(newScheduler); + } +} + +ScheduleStage::~ScheduleStage() +{ + scheduler.clear(); + waveStatusList.clear(); +} + +void +ScheduleStage::init(ComputeUnit *cu) +{ + computeUnit = cu; + _name = computeUnit->name() + ".ScheduleStage"; + + for (int j = 0; j < numSIMDs + numMemUnits; ++j) { + scheduler[j].bindList(&computeUnit->readyList[j]); + } + + for (int j = 0; j < numSIMDs; ++j) { + waveStatusList.push_back(&computeUnit->waveStatusList[j]); + } + + dispatchList = &computeUnit->dispatchList; +} + +void +ScheduleStage::arbitrate() +{ + // iterate over all Memory pipelines + for (int j = numSIMDs; j < numSIMDs + numMemUnits; ++j) { + if (dispatchList->at(j).first) { + Wavefront *waveToMemPipe = dispatchList->at(j).first; + // iterate over all execution pipelines + for (int i = 0; i < numSIMDs + numMemUnits; ++i) { + if ((i != j) && (dispatchList->at(i).first)) { + Wavefront *waveToExePipe = dispatchList->at(i).first; + // if the two selected wavefronts are mapped to the same + // SIMD unit then they share the VRF + if (waveToMemPipe->simdId == waveToExePipe->simdId) { + int simdId = waveToMemPipe->simdId; + // Read VRF port arbitration: + // If there are read VRF port conflicts between the + // a memory and another instruction we drop the other + // instruction. 
We don't need to check for write VRF + // port conflicts because the memory instruction either + // does not need to write to the VRF (store) or will + // write to the VRF when the data comes back (load) in + // which case the arbiter of the memory pipes will + // resolve any conflicts + if (computeUnit->vrf[simdId]-> + isReadConflict(waveToMemPipe->wfSlotId, + waveToExePipe->wfSlotId)) { + // FIXME: The "second" member variable is never + // used in the model. I am setting it to READY + // simply to follow the protocol of setting it + // when the WF has an instruction ready to issue + waveStatusList[simdId]->at(waveToExePipe->wfSlotId) + .second = READY; + + dispatchList->at(i).first = nullptr; + dispatchList->at(i).second = EMPTY; + break; + } + } + } + } + } + } +} + +void +ScheduleStage::exec() +{ + for (int j = 0; j < numSIMDs + numMemUnits; ++j) { + uint32_t readyListSize = computeUnit->readyList[j].size(); + + // If no wave is ready to be scheduled on the execution resource + // then skip scheduling for this execution resource + if (!readyListSize) { + continue; + } + + Wavefront *waveToBeDispatched = scheduler[j].chooseWave(); + dispatchList->at(j).first = waveToBeDispatched; + waveToBeDispatched->updateResources(); + dispatchList->at(j).second = FILLED; + + waveStatusList[waveToBeDispatched->simdId]->at( + waveToBeDispatched->wfSlotId).second = BLOCKED; + + assert(computeUnit->readyList[j].size() == readyListSize - 1); + } + // arbitrate over all shared resources among instructions being issued + // simultaneously + arbitrate(); +} + +void +ScheduleStage::regStats() +{ +} diff --git a/src/gpu-compute/schedule_stage.hh b/src/gpu-compute/schedule_stage.hh new file mode 100644 index 000000000..26eb9a25b --- /dev/null +++ b/src/gpu-compute/schedule_stage.hh @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Sooraj Puthoor + */ + +#ifndef __SCHEDULE_STAGE_HH__ +#define __SCHEDULE_STAGE_HH__ + +#include +#include + +#include "gpu-compute/exec_stage.hh" +#include "gpu-compute/scheduler.hh" +#include "gpu-compute/scoreboard_check_stage.hh" + +// Schedule or execution arbitration stage. +// From the pool of ready waves in the ready list, +// one wave is selected for each execution resource. 
+// The selection is made based on a scheduling policy + +class ComputeUnit; +class Wavefront; + +struct ComputeUnitParams; + +class ScheduleStage +{ + public: + ScheduleStage(const ComputeUnitParams *params); + ~ScheduleStage(); + void init(ComputeUnit *cu); + void exec(); + void arbitrate(); + // Stats related variables and methods + std::string name() { return _name; } + void regStats(); + + private: + ComputeUnit *computeUnit; + uint32_t numSIMDs; + uint32_t numMemUnits; + + // Each execution resource will have its own + // scheduler and a dispatch list + std::vector scheduler; + + // Stores the status of waves. A READY implies the + // wave is ready to be scheduled this cycle and + // is already present in the readyList + std::vector>*> + waveStatusList; + + // List of waves which will be dispatched to + // each execution resource. A FILLED implies + // dispatch list is non-empty and + // execution unit has something to execute + // this cycle. Currently, the dispatch list of + // an execution resource can hold only one wave because + // an execution resource can execute only one wave in a cycle. + std::vector> *dispatchList; + + std::string _name; +}; + +#endif // __SCHEDULE_STAGE_HH__ diff --git a/src/gpu-compute/scheduler.cc b/src/gpu-compute/scheduler.cc new file mode 100644 index 000000000..1cd0bfe55 --- /dev/null +++ b/src/gpu-compute/scheduler.cc @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Sooraj Puthoor + */ + +#include "gpu-compute/scheduler.hh" + +Scheduler::Scheduler(const ComputeUnitParams *p) +{ + if (p->execPolicy == "OLDEST-FIRST") { + schedPolicy = SCHED_POLICY::OF_POLICY; + } else if (p->execPolicy == "ROUND-ROBIN") { + schedPolicy = SCHED_POLICY::RR_POLICY; + } else { + fatal("Unimplemented scheduling policy"); + } +} + +Wavefront* +Scheduler::chooseWave() +{ + if (schedPolicy == SCHED_POLICY::OF_POLICY) { + return OFSchedPolicy.chooseWave(); + } else if (schedPolicy == SCHED_POLICY::RR_POLICY) { + return RRSchedPolicy.chooseWave(); + } else { + fatal("Unimplemented scheduling policy"); + } +} + +void +Scheduler::bindList(std::vector *list) +{ + if (schedPolicy == SCHED_POLICY::OF_POLICY) { + OFSchedPolicy.bindList(list); + } else if (schedPolicy == SCHED_POLICY::RR_POLICY) { + RRSchedPolicy.bindList(list); + } else { + fatal("Unimplemented scheduling policy"); + } +} diff --git a/src/gpu-compute/scheduler.hh b/src/gpu-compute/scheduler.hh new file mode 100644 index 000000000..148ec9425 --- /dev/null +++ b/src/gpu-compute/scheduler.hh @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Sooraj Puthoor + */ + +#ifndef __SCHEDULER_HH__ +#define __SCHEDULER_HH__ + +#include "gpu-compute/of_scheduling_policy.hh" +#include "gpu-compute/rr_scheduling_policy.hh" +#include "gpu-compute/scheduling_policy.hh" +#include "params/ComputeUnit.hh" + +enum SCHED_POLICY +{ + OF_POLICY = 0, + RR_POLICY +}; + +class Scheduler +{ + public: + Scheduler(const ComputeUnitParams *params); + Wavefront *chooseWave(); + void bindList(std::vector *list); + + private: + SCHED_POLICY schedPolicy; + SchedulingPolicy RRSchedPolicy; + SchedulingPolicy OFSchedPolicy; +}; + +#endif // __SCHEDULER_HH__ diff --git a/src/gpu-compute/scheduling_policy.hh b/src/gpu-compute/scheduling_policy.hh new file mode 100644 index 000000000..b5e923c62 --- /dev/null +++ b/src/gpu-compute/scheduling_policy.hh @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
#ifndef __SCHEDULING_POLICY_HH__
#define __SCHEDULING_POLICY_HH__

#include <vector>

class Wavefront;

/**
 * Static-polymorphism wrapper around a concrete scheduling policy.
 * Impl must provide chooseWave() and bindList(); calls are forwarded
 * with no virtual dispatch.
 */
template<typename Impl>
class SchedulingPolicy
{
  public:
    // return the wave the policy picks from its bound list
    Wavefront* chooseWave() { return policyImpl.chooseWave(); }

    // attach the list of schedulable waves the policy draws from
    void
    bindList(std::vector<Wavefront*> *list)
    {
        return policyImpl.bindList(list);
    }

  private:
    Impl policyImpl;
};

#endif // __SCHEDULING_POLICY_HH__
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Sooraj Puthoor + */ + +#include "gpu-compute/scoreboard_check_stage.hh" + +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_static_inst.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/wavefront.hh" +#include "params/ComputeUnit.hh" + +ScoreboardCheckStage::ScoreboardCheckStage(const ComputeUnitParams *p) + : numSIMDs(p->num_SIMDs), + numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes), + numGlbMemPipes(p->num_global_mem_pipes), + numShrMemPipes(p->num_shared_mem_pipes), + vectorAluInstAvail(nullptr), + lastGlbMemSimd(-1), + lastShrMemSimd(-1), glbMemInstAvail(nullptr), + shrMemInstAvail(nullptr) +{ +} + +ScoreboardCheckStage::~ScoreboardCheckStage() +{ + readyList.clear(); + waveStatusList.clear(); + shrMemInstAvail = nullptr; + glbMemInstAvail = nullptr; +} + +void +ScoreboardCheckStage::init(ComputeUnit *cu) +{ + computeUnit = cu; + _name = computeUnit->name() + ".ScoreboardCheckStage"; + + for (int unitId = 0; unitId < numSIMDs + numMemUnits; ++unitId) { + readyList.push_back(&computeUnit->readyList[unitId]); + } + + for (int unitId = 0; unitId < numSIMDs; ++unitId) { + waveStatusList.push_back(&computeUnit->waveStatusList[unitId]); + } + + vectorAluInstAvail = &computeUnit->vectorAluInstAvail; + glbMemInstAvail= &computeUnit->glbMemInstAvail; + shrMemInstAvail= &computeUnit->shrMemInstAvail; +} + +void +ScoreboardCheckStage::initStatistics() +{ + lastGlbMemSimd = -1; + 
lastShrMemSimd = -1; + *glbMemInstAvail = 0; + *shrMemInstAvail = 0; + + for (int unitId = 0; unitId < numSIMDs; ++unitId) + vectorAluInstAvail->at(unitId) = false; +} + +void +ScoreboardCheckStage::collectStatistics(Wavefront *curWave, int unitId) +{ + if (curWave->instructionBuffer.empty()) + return; + + // track which vector SIMD unit has at least one WV with a vector + // ALU as the oldest instruction in its Instruction buffer + vectorAluInstAvail->at(unitId) = vectorAluInstAvail->at(unitId) || + curWave->isOldestInstALU(); + + // track how many vector SIMD units have at least one WV with a + // vector Global memory instruction as the oldest instruction + // in its Instruction buffer + if ((curWave->isOldestInstGMem() || curWave->isOldestInstPrivMem() || + curWave->isOldestInstFlatMem()) && lastGlbMemSimd != unitId && + *glbMemInstAvail <= 1) { + (*glbMemInstAvail)++; + lastGlbMemSimd = unitId; + } + + // track how many vector SIMD units have at least one WV with a + // vector shared memory (LDS) instruction as the oldest instruction + // in its Instruction buffer + // TODO: parametrize the limit of the LDS units + if (curWave->isOldestInstLMem() && (*shrMemInstAvail <= numShrMemPipes) && + lastShrMemSimd != unitId) { + (*shrMemInstAvail)++; + lastShrMemSimd = unitId; + } +} + +void +ScoreboardCheckStage::exec() +{ + initStatistics(); + + // reset the ready list for all execution units; it will be + // constructed every cycle since resource availability may change + for (int unitId = 0; unitId < numSIMDs + numMemUnits; ++unitId) { + readyList[unitId]->clear(); + } + + // iterate over the Wavefronts of all SIMD units + for (int unitId = 0; unitId < numSIMDs; ++unitId) { + for (int wvId = 0; wvId < computeUnit->shader->n_wf; ++wvId) { + // reset the ready status of each wavefront + waveStatusList[unitId]->at(wvId).second = BLOCKED; + Wavefront *curWave = waveStatusList[unitId]->at(wvId).first; + collectStatistics(curWave, unitId); + + if 
(curWave->ready(Wavefront::I_ALU)) { + readyList[unitId]->push_back(curWave); + waveStatusList[unitId]->at(wvId).second = READY; + } else if (curWave->ready(Wavefront::I_GLOBAL)) { + if (computeUnit->cedeSIMD(unitId, wvId)) { + continue; + } + + readyList[computeUnit->GlbMemUnitId()]->push_back(curWave); + waveStatusList[unitId]->at(wvId).second = READY; + } else if (curWave->ready(Wavefront::I_SHARED)) { + readyList[computeUnit->ShrMemUnitId()]->push_back(curWave); + waveStatusList[unitId]->at(wvId).second = READY; + } else if (curWave->ready(Wavefront::I_FLAT)) { + readyList[computeUnit->GlbMemUnitId()]->push_back(curWave); + waveStatusList[unitId]->at(wvId).second = READY; + } else if (curWave->ready(Wavefront::I_PRIVATE)) { + readyList[computeUnit->GlbMemUnitId()]->push_back(curWave); + waveStatusList[unitId]->at(wvId).second = READY; + } + } + } +} + +void +ScoreboardCheckStage::regStats() +{ +} diff --git a/src/gpu-compute/scoreboard_check_stage.hh b/src/gpu-compute/scoreboard_check_stage.hh new file mode 100644 index 000000000..099597afb --- /dev/null +++ b/src/gpu-compute/scoreboard_check_stage.hh @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Sooraj Puthoor + */ + +#ifndef __SCOREBOARD_CHECK_STAGE_HH__ +#define __SCOREBOARD_CHECK_STAGE_HH__ + +#include +#include +#include +#include + +class ComputeUnit; +class Wavefront; + +struct ComputeUnitParams; + +enum WAVE_STATUS +{ + BLOCKED = 0, + READY +}; + +/* + * Scoreboard check stage. + * All wavefronts are analyzed to see if they are ready + * to be executed this cycle. Both structural and data + * hazards are considered while marking a wave "ready" + * for execution. After analysis, the ready waves are + * added to readyList. 
+ */ +class ScoreboardCheckStage +{ + public: + ScoreboardCheckStage(const ComputeUnitParams* params); + ~ScoreboardCheckStage(); + void init(ComputeUnit *cu); + void exec(); + + // Stats related variables and methods + const std::string& name() const { return _name; } + void regStats(); + + private: + void collectStatistics(Wavefront *curWave, int unitId); + void initStatistics(); + ComputeUnit *computeUnit; + uint32_t numSIMDs; + uint32_t numMemUnits; + uint32_t numGlbMemPipes; + uint32_t numShrMemPipes; + + // flag per vector SIMD unit that is set when there is at least one + // WF that has a vector ALU instruction as the oldest in its + // Instruction Buffer + std::vector *vectorAluInstAvail; + int lastGlbMemSimd; + int lastShrMemSimd; + + int *glbMemInstAvail; + int *shrMemInstAvail; + // List of waves which are ready to be scheduled. + // Each execution resource has a ready list + std::vector*> readyList; + + // Stores the status of waves. A READY implies the + // wave is ready to be scheduled this cycle and + // is already present in the readyList + std::vector>*> + waveStatusList; + + std::string _name; +}; + +#endif // __SCOREBOARD_CHECK_STAGE_HH__ diff --git a/src/gpu-compute/shader.cc b/src/gpu-compute/shader.cc new file mode 100644 index 000000000..e8d7946ff --- /dev/null +++ b/src/gpu-compute/shader.cc @@ -0,0 +1,412 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Steve Reinhardt + */ + +#include "gpu-compute/shader.hh" + +#include + +#include "arch/x86/linux/linux.hh" +#include "base/chunk_generator.hh" +#include "debug/GPUDisp.hh" +#include "debug/GPUMem.hh" +#include "debug/HSAIL.hh" +#include "gpu-compute/dispatcher.hh" +#include "gpu-compute/gpu_static_inst.hh" +#include "gpu-compute/qstruct.hh" +#include "gpu-compute/wavefront.hh" +#include "mem/packet.hh" +#include "mem/ruby/system/RubySystem.hh" +#include "sim/sim_exit.hh" + +Shader::Shader(const Params *p) : SimObject(p), + clock(p->clk_domain->clockPeriod()), cpuThread(nullptr), gpuTc(nullptr), + cpuPointer(p->cpu_pointer), tickEvent(this), timingSim(p->timing), + hsail_mode(SIMT), impl_kern_boundary_sync(p->impl_kern_boundary_sync), + separate_acquire_release(p->separate_acquire_release), coissue_return(1), + trace_vgpr_all(1), n_cu((p->CUs).size()), n_wf(p->n_wf), + globalMemSize(p->globalmem), nextSchedCu(0), sa_n(0), tick_cnt(0), + box_tick_cnt(0), start_tick_cnt(0) +{ + + cuList.resize(n_cu); + + for (int i = 0; i < n_cu; ++i) { + cuList[i] = p->CUs[i]; + assert(i == cuList[i]->cu_id); + cuList[i]->shader = this; + } +} + +Addr +Shader::mmap(int length) +{ + + Addr start; + + // round up length to the next page + length = roundUp(length, TheISA::PageBytes); + + if (X86Linux64::mmapGrowsDown()) { + DPRINTF(HSAIL, "GROWS DOWN"); + start = gpuTc->getProcessPtr()->mmap_end -length; + gpuTc->getProcessPtr()->mmap_end = start; + } else { + DPRINTF(HSAIL, "GROWS UP"); + start = gpuTc->getProcessPtr()->mmap_end; + gpuTc->getProcessPtr()->mmap_end += length; + + // assertion to make sure we don't overwrite the stack (it grows down) + assert(gpuTc->getProcessPtr()->mmap_end < + gpuTc->getProcessPtr()->stack_base - + gpuTc->getProcessPtr()->max_stack_size); + + } + + DPRINTF(HSAIL,"Shader::mmap start= %#x, %#x\n", start, length); + + gpuTc->getProcessPtr()->allocateMem(start,length); + + return start; +} + +void +Shader::init() +{ + // grab the 
threadContext of the thread running on the CPU + assert(cpuPointer); + gpuTc = cpuPointer->getContext(0); + assert(gpuTc); +} + +Shader::~Shader() +{ + for (int j = 0; j < n_cu; ++j) + delete cuList[j]; +} + +void +Shader::updateThreadContext(int tid) { + // thread context of the thread which dispatched work + assert(cpuPointer); + gpuTc = cpuPointer->getContext(tid); + assert(gpuTc); +} + +void +Shader::hostWakeUp(BaseCPU *cpu) { + if (cpuPointer == cpu) { + if (gpuTc->status() == ThreadContext::Suspended) + cpu->activateContext(gpuTc->threadId()); + } else { + //Make sure both dispatcher and shader are trying to + //wakeup same host. Hack here to enable kernel launch + //from multiple CPUs + panic("Dispatcher wants to wakeup a different host"); + } +} + +Shader* +ShaderParams::create() +{ + return new Shader(this); +} + +void +Shader::exec() +{ + tick_cnt = curTick(); + box_tick_cnt = curTick() - start_tick_cnt; + + // apply any scheduled adds + for (int i = 0; i < sa_n; ++i) { + if (sa_when[i] <= tick_cnt) { + *sa_val[i] += sa_x[i]; + sa_val.erase(sa_val.begin() + i); + sa_x.erase(sa_x.begin() + i); + sa_when.erase(sa_when.begin() + i); + --sa_n; + --i; + } + } + + // clock all of the cu's + for (int i = 0; i < n_cu; ++i) + cuList[i]->exec(); +} + +bool +Shader::dispatch_workgroups(NDRange *ndr) +{ + bool scheduledSomething = false; + int cuCount = 0; + int curCu = nextSchedCu; + + while (cuCount < n_cu) { + //Every time we try a CU, update nextSchedCu + nextSchedCu = (nextSchedCu + 1) % n_cu; + + // dispatch workgroup iff the following two conditions are met: + // (a) wg_rem is true - there are unassigned workgroups in the grid + // (b) there are enough free slots in cu cuList[i] for this wg + if (ndr->wg_disp_rem && cuList[curCu]->ReadyWorkgroup(ndr)) { + scheduledSomething = true; + DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d\n", curCu); + + // ticks() member function translates cycles to simulation ticks. 
+ if (!tickEvent.scheduled()) { + schedule(tickEvent, curTick() + this->ticks(1)); + } + + cuList[curCu]->StartWorkgroup(ndr); + ndr->wgId[0]++; + ndr->globalWgId++; + if (ndr->wgId[0] * ndr->q.wgSize[0] >= ndr->q.gdSize[0]) { + ndr->wgId[0] = 0; + ndr->wgId[1]++; + + if (ndr->wgId[1] * ndr->q.wgSize[1] >= ndr->q.gdSize[1]) { + ndr->wgId[1] = 0; + ndr->wgId[2]++; + + if (ndr->wgId[2] * ndr->q.wgSize[2] >= ndr->q.gdSize[2]) { + ndr->wg_disp_rem = false; + break; + } + } + } + } + + ++cuCount; + curCu = nextSchedCu; + } + + return scheduledSomething; +} + +void +Shader::handshake(GpuDispatcher *_dispatcher) +{ + dispatcher = _dispatcher; +} + +void +Shader::doFunctionalAccess(RequestPtr req, MemCmd cmd, void *data, + bool suppress_func_errors, int cu_id) +{ + unsigned block_size = RubySystem::getBlockSizeBytes(); + unsigned size = req->getSize(); + + Addr tmp_addr; + BaseTLB::Mode trans_mode; + + if (cmd == MemCmd::ReadReq) { + trans_mode = BaseTLB::Read; + } else if (cmd == MemCmd::WriteReq) { + trans_mode = BaseTLB::Write; + } else { + fatal("unexcepted MemCmd\n"); + } + + tmp_addr = req->getVaddr(); + Addr split_addr = roundDown(tmp_addr + size - 1, block_size); + + assert(split_addr <= tmp_addr || split_addr - tmp_addr < block_size); + + // Misaligned access + if (split_addr > tmp_addr) { + RequestPtr req1, req2; + req->splitOnVaddr(split_addr, req1, req2); + + + PacketPtr pkt1 = new Packet(req2, cmd); + PacketPtr pkt2 = new Packet(req1, cmd); + + functionalTLBAccess(pkt1, cu_id, trans_mode); + functionalTLBAccess(pkt2, cu_id, trans_mode); + + PacketPtr new_pkt1 = new Packet(pkt1->req, cmd); + PacketPtr new_pkt2 = new Packet(pkt2->req, cmd); + + new_pkt1->dataStatic(data); + new_pkt2->dataStatic((uint8_t*)data + req1->getSize()); + + if (suppress_func_errors) { + new_pkt1->setSuppressFuncError(); + new_pkt2->setSuppressFuncError(); + } + + // fixme: this should be cuList[cu_id] if cu_id != n_cu + // The latter requires a memPort in the dispatcher + 
cuList[0]->memPort[0]->sendFunctional(new_pkt1); + cuList[0]->memPort[0]->sendFunctional(new_pkt2); + + delete new_pkt1; + delete new_pkt2; + delete pkt1; + delete pkt2; + } else { + PacketPtr pkt = new Packet(req, cmd); + functionalTLBAccess(pkt, cu_id, trans_mode); + PacketPtr new_pkt = new Packet(pkt->req, cmd); + new_pkt->dataStatic(data); + + if (suppress_func_errors) { + new_pkt->setSuppressFuncError(); + }; + + // fixme: this should be cuList[cu_id] if cu_id != n_cu + // The latter requires a memPort in the dispatcher + cuList[0]->memPort[0]->sendFunctional(new_pkt); + + delete new_pkt; + delete pkt; + } +} + +bool +Shader::busy() +{ + for (int i_cu = 0; i_cu < n_cu; ++i_cu) { + if (!cuList[i_cu]->isDone()) { + return true; + } + } + + return false; +} + +void +Shader::ScheduleAdd(uint32_t *val,Tick when,int x) +{ + sa_val.push_back(val); + sa_when.push_back(tick_cnt + when); + sa_x.push_back(x); + ++sa_n; +} + +Shader::TickEvent::TickEvent(Shader *_shader) + : Event(CPU_Tick_Pri), shader(_shader) +{ +} + + +void +Shader::TickEvent::process() +{ + if (shader->busy()) { + shader->exec(); + shader->schedule(this, curTick() + shader->ticks(1)); + } +} + +const char* +Shader::TickEvent::description() const +{ + return "Shader tick"; +} + +void +Shader::AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id, + MemCmd cmd, bool suppress_func_errors) +{ + uint8_t *data_buf = (uint8_t*)ptr; + + for (ChunkGenerator gen(address, size, RubySystem::getBlockSizeBytes()); + !gen.done(); gen.next()) { + Request *req = new Request(0, gen.addr(), gen.size(), 0, + cuList[0]->masterId(), 0, 0, 0); + + doFunctionalAccess(req, cmd, data_buf, suppress_func_errors, cu_id); + data_buf += gen.size(); + delete req; + } +} + +void +Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id) +{ + AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, false); +} + +void +Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id, + bool 
suppress_func_errors) +{ + AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, suppress_func_errors); +} + +void +Shader::WriteMem(uint64_t address, void *ptr,uint32_t size, int cu_id) +{ + AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq, false); +} + +void +Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id, + bool suppress_func_errors) +{ + AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq, + suppress_func_errors); +} + +/* + * Send a packet through the appropriate TLB functional port. + * If cu_id=n_cu, then this is the dispatcher's TLB. + * Otherwise it's the TLB of the cu_id compute unit. + */ +void +Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode) +{ + // update senderState. Need to know the gpuTc and the TLB mode + pkt->senderState = + new TheISA::GpuTLB::TranslationState(mode, gpuTc, false); + + if (cu_id == n_cu) { + dispatcher->tlbPort->sendFunctional(pkt); + } else { + // even when the perLaneTLB flag is turned on + // it's ok tp send all accesses through lane 0 + // since the lane # is not known here, + // This isn't important since these are functional accesses. + cuList[cu_id]->tlbPort[0]->sendFunctional(pkt); + } + + /* safe_cast the senderState */ + TheISA::GpuTLB::TranslationState *sender_state = + safe_cast(pkt->senderState); + + delete sender_state->tlbEntry; + delete pkt->senderState; +} diff --git a/src/gpu-compute/shader.hh b/src/gpu-compute/shader.hh new file mode 100644 index 000000000..91ea8aae0 --- /dev/null +++ b/src/gpu-compute/shader.hh @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Steve Reinhardt + */ + +#ifndef __SHADER_HH__ +#define __SHADER_HH__ + +#include +#include + +#include "arch/isa.hh" +#include "arch/isa_traits.hh" +#include "base/types.hh" +#include "cpu/simple/atomic.hh" +#include "cpu/simple/timing.hh" +#include "cpu/simple_thread.hh" +#include "cpu/thread_context.hh" +#include "cpu/thread_state.hh" +#include "enums/MemOpType.hh" +#include "enums/MemType.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_tlb.hh" +#include "gpu-compute/lds_state.hh" +#include "gpu-compute/qstruct.hh" +#include "mem/page_table.hh" +#include "mem/port.hh" +#include "mem/request.hh" +#include "params/Shader.hh" +#include "sim/faults.hh" +#include "sim/process.hh" +#include "sim/sim_object.hh" + +class BaseTLB; +class GpuDispatcher; + +namespace TheISA +{ + class GpuTLB; +} + +static const int LDS_SIZE = 65536; + +// Class Shader: This describes a single shader instance. Most +// configurations will only have a single shader. + +class Shader : public SimObject +{ + protected: + // Shader's clock period in terms of number of ticks of curTime, + // aka global simulation clock + Tick clock; + + public: + typedef ShaderParams Params; + enum hsail_mode_e {SIMT,VECTOR_SCALAR}; + + // clock related functions ; maps to-and-from + // Simulation ticks and shader clocks. + Tick frequency() const { return SimClock::Frequency / clock; } + + Tick ticks(int numCycles) const { return (Tick)clock * numCycles; } + + Tick getClock() const { return clock; } + Tick curCycle() const { return curTick() / clock; } + Tick tickToCycles(Tick val) const { return val / clock;} + + + SimpleThread *cpuThread; + ThreadContext *gpuTc; + BaseCPU *cpuPointer; + + class TickEvent : public Event + { + private: + Shader *shader; + + public: + TickEvent(Shader*); + void process(); + const char* description() const; + }; + + TickEvent tickEvent; + + // is this simulation going to be timing mode in the memory? 
+ bool timingSim; + hsail_mode_e hsail_mode; + + // If set, issue acq packet @ kernel launch + int impl_kern_boundary_sync; + // If set, generate a separate packet for acquire/release on + // ld_acquire/st_release/atomic operations + int separate_acquire_release; + // If set, fetch returns may be coissued with instructions + int coissue_return; + // If set, always dump all 64 gprs to trace + int trace_vgpr_all; + // Number of cu units in the shader + int n_cu; + // Number of wavefront slots per cu + int n_wf; + // The size of global memory + int globalMemSize; + + /* + * Bytes/work-item for call instruction + * The number of arguments for an hsail function will + * vary. We simply determine the maximum # of arguments + * required by any hsail function up front before the + * simulation (during parsing of the Brig) and record + * that number here. + */ + int funcargs_size; + + // Tracks CU that rr dispatcher should attempt scheduling + int nextSchedCu; + + // Size of scheduled add queue + uint32_t sa_n; + + // Pointer to value to be increments + std::vector sa_val; + // When to do the increment + std::vector sa_when; + // Amount to increment by + std::vector sa_x; + + // List of Compute Units (CU's) + std::vector cuList; + + uint64_t tick_cnt; + uint64_t box_tick_cnt; + uint64_t start_tick_cnt; + + GpuDispatcher *dispatcher; + + Shader(const Params *p); + ~Shader(); + virtual void init(); + + // Run shader + void exec(); + + // Check to see if shader is busy + bool busy(); + + // Schedule a 32-bit value to be incremented some time in the future + void ScheduleAdd(uint32_t *val, Tick when, int x); + bool processTimingPacket(PacketPtr pkt); + + void AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id, + MemCmd cmd, bool suppress_func_errors); + + void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id); + + void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id, + bool suppress_func_errors); + + void WriteMem(uint64_t address, void *ptr, 
uint32_t sz, int cu_id); + + void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id, + bool suppress_func_errors); + + void doFunctionalAccess(RequestPtr req, MemCmd cmd, void *data, + bool suppress_func_errors, int cu_id); + + void + registerCU(int cu_id, ComputeUnit *compute_unit) + { + cuList[cu_id] = compute_unit; + } + + void handshake(GpuDispatcher *dispatcher); + bool dispatch_workgroups(NDRange *ndr); + Addr mmap(int length); + void functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode); + void updateThreadContext(int tid); + void hostWakeUp(BaseCPU *cpu); +}; + +#endif // __SHADER_HH__ diff --git a/src/gpu-compute/simple_pool_manager.cc b/src/gpu-compute/simple_pool_manager.cc new file mode 100644 index 000000000..0e35ab9cc --- /dev/null +++ b/src/gpu-compute/simple_pool_manager.cc @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos + */ + +#include "gpu-compute/simple_pool_manager.hh" + +#include "base/misc.hh" + +// return the min number of elements that the manager can reserve given +// a request for "size" elements +uint32_t +SimplePoolManager::minAllocatedElements(uint32_t size) +{ + fatal_if(size <= 0 || size > poolSize(), "Illegal VGPR region size=%d\n", + size); + + return size % minAllocation() > 0 ? + (minAllocation() - (size % minAllocation())) + size : size; +} + +std::string +SimplePoolManager::printRegion() +{ + std::string _cout; + if (_reservedGroups == 0) + _cout = "VRF is empty\n"; + else if (_reservedGroups > 0) { + uint32_t reservedEntries = _reservedGroups * _regionSize; + _cout = "VRF reserves " + std::to_string(reservedEntries) + " VGPRs\n"; + } + + return _cout; +} + +bool +SimplePoolManager::canAllocate(uint32_t numRegions, uint32_t size) +{ + assert(numRegions * minAllocatedElements(size) <= poolSize()); + + return _reservedGroups == 0; +} + +void +SimplePoolManager::freeRegion(uint32_t firstIdx, uint32_t lastIdx) +{ + assert(_reservedGroups > 0); + --_reservedGroups; + + if (!_reservedGroups) + _nxtFreeIdx = 0; +} + +uint32_t +SimplePoolManager::allocateRegion(const uint32_t size, + uint32_t *reservedPoolSize) +{ + uint32_t actualSize = minAllocatedElements(size); + uint32_t startIdx = _nxtFreeIdx; + _nxtFreeIdx += actualSize; + _regionSize = actualSize; + assert(_nxtFreeIdx < poolSize()); + 
*reservedPoolSize = actualSize; + ++_reservedGroups; + + return startIdx; +} + +uint32_t +SimplePoolManager::regionSize(std::pair ®ion) +{ + bool wrapAround = (region.first > region.second); + if (!wrapAround) { + return region.second - region.first + 1; + } else { + return region.second + poolSize() - region.first + 1; + } +} diff --git a/src/gpu-compute/simple_pool_manager.hh b/src/gpu-compute/simple_pool_manager.hh new file mode 100644 index 000000000..1d4174da8 --- /dev/null +++ b/src/gpu-compute/simple_pool_manager.hh @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos + */ + +#ifndef __SIMPLE_POOL_MANAGER_HH__ +#define __SIMPLE_POOL_MANAGER_HH__ + +#include +#include + +#include "gpu-compute/pool_manager.hh" + +// Simple Pool Manager: allows one region per pool. No region merging is +// supported. +class SimplePoolManager : public PoolManager +{ + public: + SimplePoolManager(uint32_t minAlloc, uint32_t poolSize) + : PoolManager(minAlloc, poolSize), _regionSize(0), _nxtFreeIdx(0), + _reservedGroups(0) + { + } + + uint32_t minAllocatedElements(uint32_t size); + std::string printRegion(); + bool canAllocate(uint32_t numRegions, uint32_t size); + uint32_t allocateRegion(const uint32_t size, uint32_t *reservedPoolSize); + void freeRegion(uint32_t firstIdx, uint32_t lastIdx); + uint32_t regionSize(std::pair ®ion); + + private: + // actual size of a region (normalized to the minimum size that can + // be reserved) + uint32_t _regionSize; + // next index to allocate a region + uint8_t _nxtFreeIdx; + // number of groups that reserve a region + uint32_t _reservedGroups; +}; + +#endif // __SIMPLE_POOL_MANAGER_HH__ diff --git a/src/gpu-compute/tlb_coalescer.cc b/src/gpu-compute/tlb_coalescer.cc new file mode 100644 index 000000000..835d7b740 --- /dev/null +++ b/src/gpu-compute/tlb_coalescer.cc @@ -0,0 +1,583 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Lisa Hsu + */ + +#include "gpu-compute/tlb_coalescer.hh" + +#include + +#include "debug/GPUTLB.hh" + +TLBCoalescer::TLBCoalescer(const Params *p) : MemObject(p), + clock(p->clk_domain->clockPeriod()), TLBProbesPerCycle(p->probesPerCycle), + coalescingWindow(p->coalescingWindow), + disableCoalescing(p->disableCoalescing), probeTLBEvent(this), + cleanupEvent(this) +{ + // create the slave ports based on the number of connected ports + for (size_t i = 0; i < p->port_slave_connection_count; ++i) { + cpuSidePort.push_back(new CpuSidePort(csprintf("%s-port%d", name(), i), + this, i)); + } + + // create the master ports based on the number of connected ports + for (size_t i = 0; i < p->port_master_connection_count; ++i) { + memSidePort.push_back(new MemSidePort(csprintf("%s-port%d", name(), i), + this, i)); + } +} + +BaseSlavePort& +TLBCoalescer::getSlavePort(const std::string &if_name, PortID idx) +{ + if (if_name == "slave") { + if (idx >= static_cast(cpuSidePort.size())) { + panic("TLBCoalescer::getSlavePort: unknown index %d\n", idx); + } + + return *cpuSidePort[idx]; + } else { + panic("TLBCoalescer::getSlavePort: unknown port %s\n", if_name); + } +} + +BaseMasterPort& +TLBCoalescer::getMasterPort(const std::string &if_name, PortID idx) +{ + if (if_name == "master") { + if (idx >= static_cast(memSidePort.size())) { + panic("TLBCoalescer::getMasterPort: unknown index %d\n", idx); + } + + return *memSidePort[idx]; + } else { + panic("TLBCoalescer::getMasterPort: unknown port %s\n", if_name); + } +} + +/* + * This method returns true if the + * can be coalesced with and false otherwise. + * A given set of rules is checked. + * The rules can potentially be modified based on the TLB level. 
+ */ +bool +TLBCoalescer::canCoalesce(PacketPtr incoming_pkt, PacketPtr coalesced_pkt) +{ + if (disableCoalescing) + return false; + + TheISA::GpuTLB::TranslationState *incoming_state = + safe_cast(incoming_pkt->senderState); + + TheISA::GpuTLB::TranslationState *coalesced_state = + safe_cast(coalesced_pkt->senderState); + + // Rule 1: Coalesce requests only if they + // fall within the same virtual page + Addr incoming_virt_page_addr = roundDown(incoming_pkt->req->getVaddr(), + TheISA::PageBytes); + + Addr coalesced_virt_page_addr = roundDown(coalesced_pkt->req->getVaddr(), + TheISA::PageBytes); + + if (incoming_virt_page_addr != coalesced_virt_page_addr) + return false; + + //* Rule 2: Coalesce requests only if they + // share a TLB Mode, i.e. they are both read + // or write requests. + BaseTLB::Mode incoming_mode = incoming_state->tlbMode; + BaseTLB::Mode coalesced_mode = coalesced_state->tlbMode; + + if (incoming_mode != coalesced_mode) + return false; + + // when we can coalesce a packet update the reqCnt + // that is the number of packets represented by + // this coalesced packet + if (!incoming_state->prefetch) + coalesced_state->reqCnt.back() += incoming_state->reqCnt.back(); + + return true; +} + +/* + * We need to update the physical addresses of all the translation requests + * that were coalesced into the one that just returned. + */ +void +TLBCoalescer::updatePhysAddresses(PacketPtr pkt) +{ + Addr virt_page_addr = roundDown(pkt->req->getVaddr(), TheISA::PageBytes); + + DPRINTF(GPUTLB, "Update phys. addr. 
for %d coalesced reqs for page %#x\n", + issuedTranslationsTable[virt_page_addr].size(), virt_page_addr); + + TheISA::GpuTLB::TranslationState *sender_state = + safe_cast(pkt->senderState); + + TheISA::GpuTlbEntry *tlb_entry = sender_state->tlbEntry; + assert(tlb_entry); + Addr first_entry_vaddr = tlb_entry->vaddr; + Addr first_entry_paddr = tlb_entry->paddr; + int page_size = tlb_entry->size(); + bool uncacheable = tlb_entry->uncacheable; + int first_hit_level = sender_state->hitLevel; + bool valid = tlb_entry->valid; + + // Get the physical page address of the translated request + // Using the page_size specified in the TLBEntry allows us + // to support different page sizes. + Addr phys_page_paddr = pkt->req->getPaddr(); + phys_page_paddr &= ~(page_size - 1); + + for (int i = 0; i < issuedTranslationsTable[virt_page_addr].size(); ++i) { + PacketPtr local_pkt = issuedTranslationsTable[virt_page_addr][i]; + TheISA::GpuTLB::TranslationState *sender_state = + safe_cast( + local_pkt->senderState); + + // we are sending the packet back, so pop the reqCnt associated + // with this level in the TLB hiearchy + if (!sender_state->prefetch) + sender_state->reqCnt.pop_back(); + + /* + * Only the first packet from this coalesced request has been + * translated. Grab the translated phys. page addr and update the + * physical addresses of the remaining packets with the appropriate + * page offsets. + */ + if (i) { + Addr paddr = phys_page_paddr; + paddr |= (local_pkt->req->getVaddr() & (page_size - 1)); + local_pkt->req->setPaddr(paddr); + + if (uncacheable) + local_pkt->req->setFlags(Request::UNCACHEABLE); + + // update senderState->tlbEntry, so we can insert + // the correct TLBEentry in the TLBs above. 
+ sender_state->tlbEntry = + new TheISA::GpuTlbEntry(0, first_entry_vaddr, first_entry_paddr, + valid); + + // update the hitLevel for all uncoalesced reqs + // so that each packet knows where it hit + // (used for statistics in the CUs) + sender_state->hitLevel = first_hit_level; + } + + SlavePort *return_port = sender_state->ports.back(); + sender_state->ports.pop_back(); + + // Translation is done - Convert to a response pkt if necessary and + // send the translation back + if (local_pkt->isRequest()) { + local_pkt->makeTimingResponse(); + } + + return_port->sendTimingResp(local_pkt); + } + + // schedule clean up for end of this cycle + // This is a maximum priority event and must be on + // the same cycle as GPUTLB cleanup event to prevent + // race conditions with an IssueProbeEvent caused by + // MemSidePort::recvReqRetry + cleanupQueue.push(virt_page_addr); + + if (!cleanupEvent.scheduled()) + schedule(cleanupEvent, curTick()); +} + +// Receive translation requests, create a coalesced request, +// and send them to the TLB (TLBProbesPerCycle) +bool +TLBCoalescer::CpuSidePort::recvTimingReq(PacketPtr pkt) +{ + // first packet of a coalesced request + PacketPtr first_packet = nullptr; + // true if we are able to do coalescing + bool didCoalesce = false; + // number of coalesced reqs for a given window + int coalescedReq_cnt = 0; + + TheISA::GpuTLB::TranslationState *sender_state = + safe_cast(pkt->senderState); + + // push back the port to remember the path back + sender_state->ports.push_back(this); + + bool update_stats = !sender_state->prefetch; + + if (update_stats) { + // if reqCnt is empty then this packet does not represent + // multiple uncoalesced reqs(pkts) but just a single pkt. 
+ // If it does though then the reqCnt for each level in the + // hierarchy accumulates the total number of reqs this packet + // represents + int req_cnt = 1; + + if (!sender_state->reqCnt.empty()) + req_cnt = sender_state->reqCnt.back(); + + sender_state->reqCnt.push_back(req_cnt); + + // update statistics + coalescer->uncoalescedAccesses++; + req_cnt = sender_state->reqCnt.back(); + DPRINTF(GPUTLB, "receiving pkt w/ req_cnt %d\n", req_cnt); + coalescer->queuingCycles -= (curTick() * req_cnt); + coalescer->localqueuingCycles -= curTick(); + } + + // FIXME if you want to coalesce not based on the issueTime + // of the packets (i.e., from the compute unit's perspective) + // but based on when they reached this coalescer then + // remove the following if statement and use curTick() or + // coalescingWindow for the tick_index. + if (!sender_state->issueTime) + sender_state->issueTime = curTick(); + + // The tick index is used as a key to the coalescerFIFO hashmap. + // It is shared by all candidates that fall within the + // given coalescingWindow. + int64_t tick_index = sender_state->issueTime / coalescer->coalescingWindow; + + if (coalescer->coalescerFIFO.count(tick_index)) { + coalescedReq_cnt = coalescer->coalescerFIFO[tick_index].size(); + } + + // see if we can coalesce the incoming pkt with another + // coalesced request with the same tick_index + for (int i = 0; i < coalescedReq_cnt; ++i) { + first_packet = coalescer->coalescerFIFO[tick_index][i][0]; + + if (coalescer->canCoalesce(pkt, first_packet)) { + coalescer->coalescerFIFO[tick_index][i].push_back(pkt); + + DPRINTF(GPUTLB, "Coalesced req %i w/ tick_index %d has %d reqs\n", + i, tick_index, + coalescer->coalescerFIFO[tick_index][i].size()); + + didCoalesce = true; + break; + } + } + + // if this is the first request for this tick_index + // or we did not manage to coalesce, update stats + // and make necessary allocations. 
+ if (!coalescedReq_cnt || !didCoalesce) { + if (update_stats) + coalescer->coalescedAccesses++; + + std::vector new_array; + new_array.push_back(pkt); + coalescer->coalescerFIFO[tick_index].push_back(new_array); + + DPRINTF(GPUTLB, "coalescerFIFO[%d] now has %d coalesced reqs after " + "push\n", tick_index, + coalescer->coalescerFIFO[tick_index].size()); + } + + //schedule probeTLBEvent next cycle to send the + //coalesced requests to the TLB + if (!coalescer->probeTLBEvent.scheduled()) { + coalescer->schedule(coalescer->probeTLBEvent, + curTick() + coalescer->ticks(1)); + } + + return true; +} + +void +TLBCoalescer::CpuSidePort::recvReqRetry() +{ + assert(false); +} + +void +TLBCoalescer::CpuSidePort::recvFunctional(PacketPtr pkt) +{ + + TheISA::GpuTLB::TranslationState *sender_state = + safe_cast(pkt->senderState); + + bool update_stats = !sender_state->prefetch; + + if (update_stats) + coalescer->uncoalescedAccesses++; + + // If there is a pending timing request for this virtual address + // print a warning message. This is a temporary caveat of + // the current simulator where atomic and timing requests can + // coexist. FIXME remove this check/warning in the future. + Addr virt_page_addr = roundDown(pkt->req->getVaddr(), TheISA::PageBytes); + int map_count = coalescer->issuedTranslationsTable.count(virt_page_addr); + + if (map_count) { + DPRINTF(GPUTLB, "Warning! Functional access to addr %#x sees timing " + "req. pending\n", virt_page_addr); + } + + coalescer->memSidePort[0]->sendFunctional(pkt); +} + +AddrRangeList +TLBCoalescer::CpuSidePort::getAddrRanges() const +{ + // currently not checked by the master + AddrRangeList ranges; + + return ranges; +} + +bool +TLBCoalescer::MemSidePort::recvTimingResp(PacketPtr pkt) +{ + // a translation completed and returned + coalescer->updatePhysAddresses(pkt); + + return true; +} + +void +TLBCoalescer::MemSidePort::recvReqRetry() +{ + //we've receeived a retry. 
Schedule a probeTLBEvent + if (!coalescer->probeTLBEvent.scheduled()) + coalescer->schedule(coalescer->probeTLBEvent, + curTick() + coalescer->ticks(1)); +} + +void +TLBCoalescer::MemSidePort::recvFunctional(PacketPtr pkt) +{ + fatal("Memory side recvFunctional() not implemented in TLB coalescer.\n"); +} + +TLBCoalescer::IssueProbeEvent::IssueProbeEvent(TLBCoalescer * _coalescer) + : Event(CPU_Tick_Pri), coalescer(_coalescer) +{ +} + +const char* +TLBCoalescer::IssueProbeEvent::description() const +{ + return "Probe the TLB below"; +} + +/* + * Here we scan the coalescer FIFO and issue the max + * number of permitted probes to the TLB below. We + * permit bypassing of coalesced requests for the same + * tick_index. + * + * We do not access the next tick_index unless we've + * drained the previous one. The coalesced requests + * that are successfully sent are moved to the + * issuedTranslationsTable table (the table which keeps + * track of the outstanding reqs) + */ +void +TLBCoalescer::IssueProbeEvent::process() +{ + // number of TLB probes sent so far + int sent_probes = 0; + // rejected denotes a blocking event + bool rejected = false; + + // It is set to true either when the recvTiming of the TLB below + // returns false or when there is another outstanding request for the + // same virt. page. + + DPRINTF(GPUTLB, "triggered TLBCoalescer IssueProbeEvent\n"); + + for (auto iter = coalescer->coalescerFIFO.begin(); + iter != coalescer->coalescerFIFO.end() && !rejected; ) { + int coalescedReq_cnt = iter->second.size(); + int i = 0; + int vector_index = 0; + + DPRINTF(GPUTLB, "coalescedReq_cnt is %d for tick_index %d\n", + coalescedReq_cnt, iter->first); + + while (i < coalescedReq_cnt) { + ++i; + PacketPtr first_packet = iter->second[vector_index][0]; + + // compute virtual page address for this request + Addr virt_page_addr = roundDown(first_packet->req->getVaddr(), + TheISA::PageBytes); + + // is there another outstanding request for the same page addr? 
+ int pending_reqs = + coalescer->issuedTranslationsTable.count(virt_page_addr); + + if (pending_reqs) { + DPRINTF(GPUTLB, "Cannot issue - There are pending reqs for " + "page %#x\n", virt_page_addr); + + ++vector_index; + rejected = true; + + continue; + } + + // send the coalesced request for virt_page_addr + if (!coalescer->memSidePort[0]->sendTimingReq(first_packet)) { + DPRINTF(GPUTLB, "Failed to send TLB request for page %#x", + virt_page_addr); + + // No need for a retries queue since we are already buffering + // the coalesced request in coalescerFIFO. + rejected = true; + ++vector_index; + } else { + TheISA::GpuTLB::TranslationState *tmp_sender_state = + safe_cast + (first_packet->senderState); + + bool update_stats = !tmp_sender_state->prefetch; + + if (update_stats) { + // req_cnt is total number of packets represented + // by the one we just sent counting all the way from + // the top of TLB hiearchy (i.e., from the CU) + int req_cnt = tmp_sender_state->reqCnt.back(); + coalescer->queuingCycles += (curTick() * req_cnt); + + DPRINTF(GPUTLB, "%s sending pkt w/ req_cnt %d\n", + coalescer->name(), req_cnt); + + // pkt_cnt is number of packets we coalesced into the one + // we just sent but only at this coalescer level + int pkt_cnt = iter->second[vector_index].size(); + coalescer->localqueuingCycles += (curTick() * pkt_cnt); + } + + DPRINTF(GPUTLB, "Successfully sent TLB request for page %#x", + virt_page_addr); + + //copy coalescedReq to issuedTranslationsTable + coalescer->issuedTranslationsTable[virt_page_addr] + = iter->second[vector_index]; + + //erase the entry of this coalesced req + iter->second.erase(iter->second.begin() + vector_index); + + if (iter->second.empty()) + assert(i == coalescedReq_cnt); + + sent_probes++; + if (sent_probes == coalescer->TLBProbesPerCycle) + return; + } + } + + //if there are no more coalesced reqs for this tick_index + //erase the hash_map with the first iterator + if (iter->second.empty()) { + 
coalescer->coalescerFIFO.erase(iter++); + } else { + ++iter; + } + } +} + +TLBCoalescer::CleanupEvent::CleanupEvent(TLBCoalescer* _coalescer) + : Event(Maximum_Pri), coalescer(_coalescer) +{ +} + +const char* +TLBCoalescer::CleanupEvent::description() const +{ + return "Cleanup issuedTranslationsTable hashmap"; +} + +void +TLBCoalescer::CleanupEvent::process() +{ + while (!coalescer->cleanupQueue.empty()) { + Addr cleanup_addr = coalescer->cleanupQueue.front(); + coalescer->cleanupQueue.pop(); + coalescer->issuedTranslationsTable.erase(cleanup_addr); + + DPRINTF(GPUTLB, "Cleanup - Delete coalescer entry with key %#x\n", + cleanup_addr); + } +} + +void +TLBCoalescer::regStats() +{ + uncoalescedAccesses + .name(name() + ".uncoalesced_accesses") + .desc("Number of uncoalesced TLB accesses") + ; + + coalescedAccesses + .name(name() + ".coalesced_accesses") + .desc("Number of coalesced TLB accesses") + ; + + queuingCycles + .name(name() + ".queuing_cycles") + .desc("Number of cycles spent in queue") + ; + + localqueuingCycles + .name(name() + ".local_queuing_cycles") + .desc("Number of cycles spent in queue for all incoming reqs") + ; + + localLatency + .name(name() + ".local_latency") + .desc("Avg. latency over all incoming pkts") + ; + + localLatency = localqueuingCycles / uncoalescedAccesses; +} + + +TLBCoalescer* +TLBCoalescerParams::create() +{ + return new TLBCoalescer(this); +} + diff --git a/src/gpu-compute/tlb_coalescer.hh b/src/gpu-compute/tlb_coalescer.hh new file mode 100644 index 000000000..09210148b --- /dev/null +++ b/src/gpu-compute/tlb_coalescer.hh @@ -0,0 +1,252 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Lisa Hsu + */ + +#ifndef __TLB_COALESCER_HH__ +#define __TLB_COALESCER_HH__ + +#include +#include +#include +#include + +#include "arch/generic/tlb.hh" +#include "arch/isa.hh" +#include "arch/isa_traits.hh" +#include "arch/x86/pagetable.hh" +#include "arch/x86/regs/segment.hh" +#include "base/misc.hh" +#include "base/statistics.hh" +#include "gpu-compute/gpu_tlb.hh" +#include "mem/mem_object.hh" +#include "mem/port.hh" +#include "mem/request.hh" +#include "params/TLBCoalescer.hh" + +class BaseTLB; +class Packet; +class ThreadContext; + +/** + * The TLBCoalescer is a MemObject sitting on the front side (CPUSide) of + * each TLB. It receives packets and issues coalesced requests to the + * TLB below it. It controls how requests are coalesced (the rules) + * and the permitted number of TLB probes per cycle (i.e., how many + * coalesced requests it feeds the TLB per cycle). + */ +class TLBCoalescer : public MemObject +{ + protected: + // TLB clock: will inherit clock from shader's clock period in terms + // of nuber of ticks of curTime (aka global simulation clock) + // The assignment of TLB clock from shader clock is done in the + // python config files. + int clock; + + public: + typedef TLBCoalescerParams Params; + TLBCoalescer(const Params *p); + ~TLBCoalescer() { } + + // Number of TLB probes per cycle. Parameterizable - default 2. + int TLBProbesPerCycle; + + // Consider coalescing across that many ticks. + // Paraemterizable - default 1. + int coalescingWindow; + + // Each coalesced request consists of multiple packets + // that all fall within the same virtual page + typedef std::vector coalescedReq; + + // disables coalescing when true + bool disableCoalescing; + + /* + * This is a hash map with as a key. + * It contains a vector of coalescedReqs per . + * Requests are buffered here until they can be issued to + * the TLB, at which point they are copied to the + * issuedTranslationsTable hash map. 
+ * + * In terms of coalescing, we coalesce requests in a given + * window of x cycles by using tick_index = issueTime/x as a + * key, where x = coalescingWindow. issueTime is the issueTime + * of the pkt from the ComputeUnit's perspective, but another + * option is to change it to curTick(), so we coalesce based + * on the receive time. + */ + typedef std::unordered_map> CoalescingFIFO; + + CoalescingFIFO coalescerFIFO; + + /* + * issuedTranslationsTabler: a hash_map indexed by virtual page + * address. Each hash_map entry has a vector of PacketPtr associated + * with it denoting the different packets that share an outstanding + * coalesced translation request for the same virtual page. + * + * The rules that determine which requests we can coalesce are + * specified in the canCoalesce() method. + */ + typedef std::unordered_map CoalescingTable; + + CoalescingTable issuedTranslationsTable; + + // number of packets the coalescer receives + Stats::Scalar uncoalescedAccesses; + // number packets the coalescer send to the TLB + Stats::Scalar coalescedAccesses; + + // Number of cycles the coalesced requests spend waiting in + // coalescerFIFO. For each packet the coalescer receives we take into + // account the number of all uncoalesced requests this pkt "represents" + Stats::Scalar queuingCycles; + + // On average how much time a request from the + // uncoalescedAccesses that reaches the TLB + // spends waiting? + Stats::Scalar localqueuingCycles; + // localqueuingCycles/uncoalescedAccesses + Stats::Formula localLatency; + + bool canCoalesce(PacketPtr pkt1, PacketPtr pkt2); + void updatePhysAddresses(PacketPtr pkt); + void regStats(); + + // Clock related functions. Maps to-and-from + // Simulation ticks and object clocks. 
+ Tick frequency() const { return SimClock::Frequency / clock; } + Tick ticks(int numCycles) const { return (Tick)clock * numCycles; } + Tick curCycle() const { return curTick() / clock; } + Tick tickToCycles(Tick val) const { return val / clock;} + + class CpuSidePort : public SlavePort + { + public: + CpuSidePort(const std::string &_name, TLBCoalescer *tlb_coalescer, + PortID _index) + : SlavePort(_name, tlb_coalescer), coalescer(tlb_coalescer), + index(_index) { } + + protected: + TLBCoalescer *coalescer; + int index; + + virtual bool recvTimingReq(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt) { return 0; } + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRangeChange() { } + virtual void recvReqRetry(); + + virtual void + recvRespRetry() + { + fatal("recvRespRetry() is not implemented in the TLB coalescer.\n"); + } + + virtual AddrRangeList getAddrRanges() const; + }; + + class MemSidePort : public MasterPort + { + public: + MemSidePort(const std::string &_name, TLBCoalescer *tlb_coalescer, + PortID _index) + : MasterPort(_name, tlb_coalescer), coalescer(tlb_coalescer), + index(_index) { } + + std::deque retries; + + protected: + TLBCoalescer *coalescer; + int index; + + virtual bool recvTimingResp(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt) { return 0; } + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRangeChange() { } + virtual void recvReqRetry(); + + virtual void + recvRespRetry() + { + fatal("recvRespRetry() not implemented in TLB coalescer"); + } + }; + + // Coalescer slave ports on the cpu Side + std::vector cpuSidePort; + // Coalescer master ports on the memory side + std::vector memSidePort; + + BaseMasterPort& getMasterPort(const std::string &if_name, PortID idx); + BaseSlavePort& getSlavePort(const std::string &if_name, PortID idx); + + class IssueProbeEvent : public Event + { + private: + TLBCoalescer *coalescer; + + public: + IssueProbeEvent(TLBCoalescer *_coalescer); + void process(); + const 
char *description() const; + }; + + // this event issues the TLB probes + IssueProbeEvent probeTLBEvent; + + // the cleanupEvent is scheduled after a TLBEvent triggers + // in order to free memory and do the required clean-up + class CleanupEvent : public Event + { + private: + TLBCoalescer *coalescer; + + public: + CleanupEvent(TLBCoalescer *_coalescer); + void process(); + const char* description() const; + }; + + // schedule cleanup + CleanupEvent cleanupEvent; + + // this FIFO queue keeps track of the virt. page + // addresses that are pending cleanup + std::queue cleanupQueue; +}; + +#endif // __TLB_COALESCER_HH__ diff --git a/src/gpu-compute/vector_register_file.cc b/src/gpu-compute/vector_register_file.cc new file mode 100644 index 000000000..8b7dc0691 --- /dev/null +++ b/src/gpu-compute/vector_register_file.cc @@ -0,0 +1,251 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos + */ + +#include "gpu-compute/vector_register_file.hh" + +#include + +#include "base/misc.hh" +#include "gpu-compute/code_enums.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/simple_pool_manager.hh" +#include "gpu-compute/wavefront.hh" +#include "params/VectorRegisterFile.hh" + +VectorRegisterFile::VectorRegisterFile(const VectorRegisterFileParams *p) + : SimObject(p), + manager(new SimplePoolManager(p->min_alloc, p->num_regs_per_simd)), + simdId(p->simd_id), numRegsPerSimd(p->num_regs_per_simd), + vgprState(new VecRegisterState()) +{ + fatal_if(numRegsPerSimd % 2, "VRF size is illegal\n"); + fatal_if(simdId < 0, "Illegal SIMD id for VRF"); + + fatal_if(numRegsPerSimd % p->min_alloc, "Min VGPR region allocation is not " + "multiple of VRF size\n"); + + busy.clear(); + busy.resize(numRegsPerSimd, 0); + nxtBusy.clear(); + nxtBusy.resize(numRegsPerSimd, 0); + + vgprState->init(numRegsPerSimd); +} + +void +VectorRegisterFile::setParent(ComputeUnit *_computeUnit) +{ + computeUnit = _computeUnit; + vgprState->setParent(computeUnit); +} + +uint8_t +VectorRegisterFile::regNxtBusy(int idx, uint32_t operandSize) const +{ + uint8_t status = nxtBusy.at(idx); + + if (operandSize > 4) { + status = status | (nxtBusy.at((idx + 1) % numRegs())); + } + + return status; +} + +uint8_t +VectorRegisterFile::regBusy(int idx, 
uint32_t operandSize) const +{ + uint8_t status = busy.at(idx); + + if (operandSize > 4) { + status = status | (busy.at((idx + 1) % numRegs())); + } + + return status; +} + +void +VectorRegisterFile::preMarkReg(int regIdx, uint32_t operandSize, uint8_t value) +{ + nxtBusy.at(regIdx) = value; + + if (operandSize > 4) { + nxtBusy.at((regIdx + 1) % numRegs()) = value; + } +} + +void +VectorRegisterFile::markReg(int regIdx, uint32_t operandSize, uint8_t value) +{ + busy.at(regIdx) = value; + + if (operandSize > 4) { + busy.at((regIdx + 1) % numRegs()) = value; + } +} + +bool +VectorRegisterFile::operandsReady(Wavefront *w, GPUDynInstPtr ii) const +{ + for (int i = 0; i < ii->getNumOperands(); ++i) { + if (ii->isVectorRegister(i)) { + uint32_t vgprIdx = ii->getRegisterIndex(i); + uint32_t pVgpr = w->remap(vgprIdx, ii->getOperandSize(i), 1); + + if (regBusy(pVgpr, ii->getOperandSize(i)) == 1) { + if (ii->isDstOperand(i)) { + w->numTimesBlockedDueWAXDependencies++; + } else if (ii->isSrcOperand(i)) { + w->numTimesBlockedDueRAWDependencies++; + } + + return false; + } + + if (regNxtBusy(pVgpr, ii->getOperandSize(i)) == 1) { + if (ii->isDstOperand(i)) { + w->numTimesBlockedDueWAXDependencies++; + } else if (ii->isSrcOperand(i)) { + w->numTimesBlockedDueRAWDependencies++; + } + + return false; + } + } + } + + return true; +} + +void +VectorRegisterFile::exec(GPUDynInstPtr ii, Wavefront *w) +{ + bool loadInstr = IS_OT_READ(ii->opType()); + bool atomicInstr = IS_OT_ATOMIC(ii->opType()); + + bool loadNoArgInstr = loadInstr && !ii->isArgLoad(); + + // iterate over all register destination operands + for (int i = 0; i < ii->getNumOperands(); ++i) { + if (ii->isVectorRegister(i) && ii->isDstOperand(i)) { + uint32_t physReg = w->remap(ii->getRegisterIndex(i), + ii->getOperandSize(i), 1); + + // mark the destination vector register as busy + markReg(physReg, ii->getOperandSize(i), 1); + // clear the in-flight status of the destination vector register + preMarkReg(physReg, 
ii->getOperandSize(i), 0); + + // FIXME: if we ever model correct timing behavior + // for load argument instructions then we should not + // set the destination register as busy now but when + // the data returns. Loads and Atomics should free + // their destination registers when the data returns, + // not now + if (!atomicInstr && !loadNoArgInstr) { + uint32_t pipeLen = ii->getOperandSize(i) <= 4 ? + computeUnit->spBypassLength() : + computeUnit->dpBypassLength(); + + // schedule an event for marking the register as ready + computeUnit->registerEvent(w->simdId, physReg, + ii->getOperandSize(i), + computeUnit->shader->tick_cnt + + computeUnit->shader->ticks(pipeLen), + 0); + } + } + } +} + +int +VectorRegisterFile::exec(uint64_t dynamic_id, Wavefront *w, + std::vector ®Vec, uint32_t operandSize, + uint64_t timestamp) +{ + int delay = 0; + + panic_if(regVec.size() <= 0, "Illegal VGPR vector size=%d\n", + regVec.size()); + + for (int i = 0; i < regVec.size(); ++i) { + // mark the destination VGPR as free when the timestamp expires + computeUnit->registerEvent(w->simdId, regVec[i], operandSize, + computeUnit->shader->tick_cnt + timestamp + + computeUnit->shader->ticks(delay), 0); + } + + return delay; +} + +void +VectorRegisterFile::updateResources(Wavefront *w, GPUDynInstPtr ii) +{ + // iterate over all register destination operands + for (int i = 0; i < ii->getNumOperands(); ++i) { + if (ii->isVectorRegister(i) && ii->isDstOperand(i)) { + uint32_t physReg = w->remap(ii->getRegisterIndex(i), + ii->getOperandSize(i), 1); + // set the in-flight status of the destination vector register + preMarkReg(physReg, ii->getOperandSize(i), 1); + } + } +} + +bool +VectorRegisterFile::vrfOperandAccessReady(uint64_t dynamic_id, Wavefront *w, + GPUDynInstPtr ii, + VrfAccessType accessType) +{ + bool ready = true; + + return ready; +} + +bool +VectorRegisterFile::vrfOperandAccessReady(Wavefront *w, GPUDynInstPtr ii, + VrfAccessType accessType) +{ + bool ready = true; + + return 
ready; +} + +VectorRegisterFile* +VectorRegisterFileParams::create() +{ + return new VectorRegisterFile(this); +} diff --git a/src/gpu-compute/vector_register_file.hh b/src/gpu-compute/vector_register_file.hh new file mode 100644 index 000000000..1cb011a1e --- /dev/null +++ b/src/gpu-compute/vector_register_file.hh @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: John Kalamatianos + */ + +#ifndef __VECTOR_REGISTER_FILE_HH__ +#define __VECTOR_REGISTER_FILE_HH__ + +#include + +#include "base/statistics.hh" +#include "base/types.hh" +#include "gpu-compute/vector_register_state.hh" +#include "sim/sim_object.hh" + +class ComputeUnit; +class Shader; +class SimplePoolManager; +class Wavefront; + +struct VectorRegisterFileParams; + +enum class VrfAccessType : uint8_t +{ + READ = 0x01, + WRITE = 0x02, + RD_WR = READ | WRITE +}; + +// Vector Register File +class VectorRegisterFile : public SimObject +{ + public: + VectorRegisterFile(const VectorRegisterFileParams *p); + + void setParent(ComputeUnit *_computeUnit); + + // Read a register + template + T + read(int regIdx, int threadId=0) + { + T p0 = vgprState->read(regIdx, threadId); + + return p0; + } + + // Write a register + template + void + write(int regIdx, T value, int threadId=0) + { + vgprState->write(regIdx, value, threadId); + } + + uint8_t regBusy(int idx, uint32_t operandSize) const; + uint8_t regNxtBusy(int idx, uint32_t operandSize) const; + + int numRegs() const { return numRegsPerSimd; } + + void markReg(int regIdx, uint32_t operandSize, uint8_t value); + void preMarkReg(int regIdx, uint32_t operandSize, uint8_t value); + + virtual void exec(GPUDynInstPtr ii, Wavefront *w); + + virtual int exec(uint64_t dynamic_id, Wavefront *w, + std::vector ®Vec, uint32_t operandSize, + uint64_t timestamp); + + bool operandsReady(Wavefront *w, GPUDynInstPtr ii) const; + virtual void updateEvents() { } + virtual void updateResources(Wavefront *w, GPUDynInstPtr ii); + + virtual bool + isReadConflict(int memWfId, int exeWfId) const + { + return false; + } + + virtual bool + isWriteConflict(int memWfId, int exeWfId) const + { + return false; + } + + virtual bool vrfOperandAccessReady(uint64_t dynamic_id, Wavefront *w, + GPUDynInstPtr ii, + VrfAccessType accessType); + + virtual bool vrfOperandAccessReady(Wavefront *w, GPUDynInstPtr ii, + VrfAccessType accessType); + + 
SimplePoolManager *manager; + + protected: + ComputeUnit* computeUnit; + int simdId; + + // flag indicating if a register is busy + std::vector busy; + // flag indicating if a register will be busy (by instructions + // in the SIMD pipeline) + std::vector nxtBusy; + + // numer of registers (bank size) per simd unit (bank) + int numRegsPerSimd; + + // vector register state + VecRegisterState *vgprState; +}; + +#endif // __VECTOR_REGISTER_FILE_HH__ diff --git a/src/gpu-compute/vector_register_state.cc b/src/gpu-compute/vector_register_state.cc new file mode 100644 index 000000000..f231b0579 --- /dev/null +++ b/src/gpu-compute/vector_register_state.cc @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: John Kalamatianos
 */

#include "gpu-compute/vector_register_state.hh"

#include "gpu-compute/compute_unit.hh"

// Construct empty register state; storage is sized later via init() and
// the owning CU is wired up by setParent().
VecRegisterState::VecRegisterState() : computeUnit(nullptr)
{
    // NOTE(review): both vectors are freshly default-constructed, so these
    // clear() calls appear redundant; left untouched.
    s_reg.clear();
    d_reg.clear();
}

// Attach to the owning compute unit and derive a hierarchical name from it.
void
VecRegisterState::setParent(ComputeUnit *_computeUnit)
{
    computeUnit = _computeUnit;
    _name = computeUnit->name() + ".VecRegState";
}

// Size both the single- and double-precision banks to _size registers.
void
VecRegisterState::init(uint32_t _size)
{
    s_reg.resize(_size);
    d_reg.resize(_size);
}
diff --git a/src/gpu-compute/vector_register_state.hh b/src/gpu-compute/vector_register_state.hh
new file mode 100644
index 000000000..a233b9acc
--- /dev/null
+++ b/src/gpu-compute/vector_register_state.hh
@@ -0,0 +1,101 @@
/*
 * Copyright (c) 2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3.
Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos + */ + +#ifndef __VECTOR_REGISTER_STATE_HH__ +#define __VECTOR_REGISTER_STATE_HH__ + +#include +#include +#include +#include + +#include "gpu-compute/misc.hh" + +class ComputeUnit; + +// Vector Register State per SIMD unit (contents of the vector +// registers in the VRF of the SIMD) +class VecRegisterState +{ + public: + VecRegisterState(); + void init(uint32_t _size); + + const std::string& name() const { return _name; } + void setParent(ComputeUnit *_computeUnit); + void regStats() { } + + // Access methods + template + T + read(int regIdx, int threadId=0) { + T *p0; + assert(sizeof(T) == 4 || sizeof(T) == 8); + if (sizeof(T) == 4) { + p0 = (T*)(&s_reg[regIdx][threadId]); + } else { + p0 = (T*)(&d_reg[regIdx][threadId]); + } + + return *p0; + } + + template + void + write(unsigned int regIdx, T value, int threadId=0) { + T *p0; + assert(sizeof(T) == 4 || sizeof(T) == 8); + if (sizeof(T) == 4) { + p0 = (T*)(&s_reg[regIdx][threadId]); + } else { + p0 
= (T*)(&d_reg[regIdx][threadId]); + } + + *p0 = value; + } + + // (Single Precision) Vector Register File size. + int regSize() { return s_reg.size(); } + + private: + ComputeUnit *computeUnit; + std::string _name; + // 32-bit Single Precision Vector Register State + std::vector> s_reg; + // 64-bit Double Precision Vector Register State + std::vector> d_reg; +}; + +#endif // __VECTOR_REGISTER_STATE_HH__ diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc new file mode 100644 index 000000000..0aa033db1 --- /dev/null +++ b/src/gpu-compute/wavefront.cc @@ -0,0 +1,925 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Lisa Hsu
 */

#include "gpu-compute/wavefront.hh"

#include "debug/GPUExec.hh"
#include "debug/WavefrontStack.hh"
#include "gpu-compute/code_enums.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/vector_register_file.hh"

Wavefront*
WavefrontParams::create()
{
    return new Wavefront(this);
}

// Construct a wavefront in the S_STOPPED state with all outstanding-request
// counters, barrier bookkeeping, and fetch state zeroed.
Wavefront::Wavefront(const Params *p)
  : SimObject(p), callArgMem(nullptr)
{
    last_trace = 0;
    simdId = p->simdId;
    wfSlotId = p->wf_slot_id;

    status = S_STOPPED;
    reservedVectorRegs = 0;
    startVgprIndex = 0;
    outstanding_reqs = 0;
    mem_reqs_in_pipe = 0;
    outstanding_reqs_wr_gm = 0;
    outstanding_reqs_wr_lm = 0;
    outstanding_reqs_rd_gm = 0;
    outstanding_reqs_rd_lm = 0;
    rd_lm_reqs_in_pipe = 0;
    rd_gm_reqs_in_pipe = 0;
    wr_lm_reqs_in_pipe = 0;
    wr_gm_reqs_in_pipe = 0;

    barrier_cnt = 0;
    old_barrier_cnt = 0;
    stalledAtBarrier = false;

    mem_trace_busy = 0;
    // sentinel "never seen" timestamps for the VGPR trace counters
    old_vgpr_tcnt = 0xffffffffffffffffll;
    old_dgpr_tcnt = 0xffffffffffffffffll;

    pendingFetch = false;
    dropFetch = false;
    condRegState = new ConditionRegisterState();
    maxSpVgprs = 0;
    maxDpVgprs = 0;
}

// Register per-wavefront statistics.
void
Wavefront::regStats()
{
    srcRegOpDist
        .init(0, 4, 2)
        .name(name() + ".src_reg_operand_dist")
        .desc("number of executed instructions with N source register "
              "operands")
        ;

    dstRegOpDist
        .init(0, 3, 2)
        .name(name() + ".dst_reg_operand_dist")
        .desc("number of executed instructions with N destination register "
              "operands")
        ;

    // FIXME: the name of the WF needs to be unique
    numTimesBlockedDueWAXDependencies
        .name(name() + ".timesBlockedDueWAXDependencies")
        .desc("number of times the wf's instructions are blocked due to WAW "
              "or WAR dependencies")
        ;

    // FIXME: the name of the WF needs to be unique
    numTimesBlockedDueRAWDependencies
        .name(name() + ".timesBlockedDueRAWDependencies")
        .desc("number of times the wf's instructions are blocked due to RAW "
              "dependencies")
        ;

    // FIXME: the name of the WF needs to be unique
    numTimesBlockedDueVrfPortAvail
        .name(name() + ".timesBlockedDueVrfPortAvail")
        .desc("number of times instructions are blocked due to VRF port "
              "availability")
        ;
}

// Reset VGPR allocation bookkeeping before a (re)dispatch.
void
Wavefront::init()
{
    reservedVectorRegs = 0;
    startVgprIndex = 0;
}

// Size the condition register file and record the SP/DP VGPR limits.
void
Wavefront::resizeRegFiles(int num_cregs, int num_sregs, int num_dregs)
{
    condRegState->init(num_cregs);
    maxSpVgprs = num_sregs;
    maxDpVgprs = num_dregs;
}

Wavefront::~Wavefront()
{
    if (callArgMem)
        delete callArgMem;
}

// Begin execution of a dispatched wavefront.
void
Wavefront::start(uint64_t _wfDynId,uint64_t _base_ptr)
{
    wfDynId = _wfDynId;
    base_ptr = _base_ptr;
    status = S_RUNNING;
}

// Does ii access global memory (including private-segment and flat ops)?
bool
Wavefront::isGmInstruction(GPUDynInstPtr ii)
{
    if (IS_OT_READ_PM(ii->opType()) || IS_OT_WRITE_PM(ii->opType()) ||
        IS_OT_ATOMIC_PM(ii->opType())) {
        return true;
    }

    if (IS_OT_READ_GM(ii->opType()) || IS_OT_WRITE_GM(ii->opType()) ||
        IS_OT_ATOMIC_GM(ii->opType())) {

        return true;
    }

    if (IS_OT_FLAT(ii->opType())) {
        return true;
    }

    return false;
}

// Does ii access local (LDS) memory?
bool
Wavefront::isLmInstruction(GPUDynInstPtr ii)
{
    if (IS_OT_READ_LM(ii->opType()) || IS_OT_WRITE_LM(ii->opType()) ||
        IS_OT_ATOMIC_LM(ii->opType())) {
        return true;
    }

    return false;
}

// Is the oldest buffered instruction an ALU-class op (incl. NOP/RET/
// BRANCH/LDAS/kernarg read)? Only meaningful while the wave is running.
bool
Wavefront::isOldestInstALU()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && (ii->opType() == Enums::OT_NOP ||
        ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH ||
        ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) ||
        ii->opType() == Enums::OT_KERN_READ)) {
        return true;
    }

    return false;
}

// Is the oldest buffered instruction a barrier?
bool
Wavefront::isOldestInstBarrier()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && ii->opType() == Enums::OT_BARRIER) {
        return true;
    }

    return false;
}

// Is the oldest buffered instruction a global-memory op?
bool
Wavefront::isOldestInstGMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && (IS_OT_READ_GM(ii->opType()) ||
        IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()))) {

        return true;
    }

    return false;
}

// Is the oldest buffered instruction a local-memory (LDS) op?
bool
Wavefront::isOldestInstLMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && (IS_OT_READ_LM(ii->opType()) ||
        IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()))) {

        return true;
    }

    return false;
}

// Is the oldest buffered instruction a private-segment memory op?
bool
Wavefront::isOldestInstPrivMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && (IS_OT_READ_PM(ii->opType()) ||
        IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()))) {

        return true;
    }

    return false;
}

// Is the oldest buffered instruction a flat memory op?
bool
Wavefront::isOldestInstFlatMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && IS_OT_FLAT(ii->opType())) {

        return true;
    }

    return false;
}

// Return true if the Wavefront's instruction
// buffer has branch instruction.
bool
Wavefront::instructionBufferHasBranch()
{
    for (auto it : instructionBuffer) {
        GPUDynInstPtr ii = it;

        if (ii->opType() == Enums::OT_RET ||
            ii->opType() == Enums::OT_BRANCH) {
            return true;
        }
    }

    return false;
}

// Remap HSAIL register to physical VGPR.
// HSAIL register = virtual register assigned to an operand by HLC compiler
uint32_t
Wavefront::remap(uint32_t vgprIndex, uint32_t size, uint8_t mode)
{
    assert((vgprIndex < reservedVectorRegs) && (reservedVectorRegs > 0));
    // add the offset from where the VGPRs of the wavefront have been assigned
    uint32_t physicalVgprIndex = startVgprIndex + vgprIndex;
    // HSAIL double precision (DP) register: calculate the physical VGPR index
    // assuming that DP registers are placed after SP ones in the VRF. The DP
    // and SP VGPR name spaces in HSAIL mode are separate so we need to adjust
    // the DP VGPR index before mapping it to the physical VRF address space
    if (mode == 1 && size > 4) {
        physicalVgprIndex = startVgprIndex + maxSpVgprs + (2 * vgprIndex);
    }

    assert((startVgprIndex <= physicalVgprIndex) &&
           (startVgprIndex + reservedVectorRegs - 1) >= physicalVgprIndex);

    // calculate absolute physical VGPR index
    return physicalVgprIndex % computeUnit->vrf[simdId]->numRegs();
}

// Return true if this wavefront is ready
// to execute an instruction of the specified type.
int
Wavefront::ready(itype_e type)
{
    // Check to make sure wave is running
    if (status == S_STOPPED || status == S_RETURNING ||
        instructionBuffer.empty()) {
        return 0;
    }

    // Is the wave waiting at a barrier
    if (stalledAtBarrier) {
        if (!computeUnit->AllAtBarrier(barrier_id,barrier_cnt,
                        computeUnit->getRefCounter(dispatchid, wg_id))) {
            // Are all threads at barrier?
+ return 0; + } + old_barrier_cnt = barrier_cnt; + stalledAtBarrier = false; + } + + // Read instruction + GPUDynInstPtr ii = instructionBuffer.front(); + + bool ready_inst M5_VAR_USED = false; + bool glbMemBusRdy = false; + bool glbMemIssueRdy = false; + if (type == I_GLOBAL || type == I_FLAT || type == I_PRIVATE) { + for (int j=0; j < computeUnit->numGlbMemUnits; ++j) { + if (computeUnit->vrfToGlobalMemPipeBus[j].prerdy()) + glbMemBusRdy = true; + if (computeUnit->wfWait[j].prerdy()) + glbMemIssueRdy = true; + } + } + bool locMemBusRdy = false; + bool locMemIssueRdy = false; + if (type == I_SHARED) { + for (int j=0; j < computeUnit->numLocMemUnits; ++j) { + if (computeUnit->vrfToLocalMemPipeBus[j].prerdy()) + locMemBusRdy = true; + if (computeUnit->wfWait[j].prerdy()) + locMemIssueRdy = true; + } + } + + // The following code is very error prone and the entire process for + // checking readiness will be fixed eventually. In the meantime, let's + // make sure that we do not silently let an instruction type slip + // through this logic and always return not ready. 
+ if (!(ii->opType() == Enums::OT_BARRIER || ii->opType() == Enums::OT_NOP || + ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH || + ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) || + ii->opType() == Enums::OT_KERN_READ || + ii->opType() == Enums::OT_ARG || + IS_OT_READ_GM(ii->opType()) || IS_OT_WRITE_GM(ii->opType()) || + IS_OT_ATOMIC_GM(ii->opType()) || IS_OT_READ_LM(ii->opType()) || + IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()) || + IS_OT_READ_PM(ii->opType()) || IS_OT_WRITE_PM(ii->opType()) || + IS_OT_ATOMIC_PM(ii->opType()) || IS_OT_FLAT(ii->opType()))) { + panic("next instruction: %s is of unknown type\n", ii->disassemble()); + } + + DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Checking Read for Inst : %s\n", + computeUnit->cu_id, simdId, wfSlotId, ii->disassemble()); + + if (type == I_ALU && ii->opType() == Enums::OT_BARRIER) { + // Here for ALU instruction (barrier) + if (!computeUnit->wfWait[simdId].prerdy()) { + // Is wave slot free? + return 0; + } + + // Are there in pipe or outstanding memory requests? + if ((outstanding_reqs + mem_reqs_in_pipe) > 0) { + return 0; + } + + ready_inst = true; + } else if (type == I_ALU && ii->opType() == Enums::OT_NOP) { + // Here for ALU instruction (nop) + if (!computeUnit->wfWait[simdId].prerdy()) { + // Is wave slot free? + return 0; + } + + ready_inst = true; + } else if (type == I_ALU && ii->opType() == Enums::OT_RET) { + // Here for ALU instruction (return) + if (!computeUnit->wfWait[simdId].prerdy()) { + // Is wave slot free? + return 0; + } + + // Are there in pipe or outstanding memory requests? 
+ if ((outstanding_reqs + mem_reqs_in_pipe) > 0) { + return 0; + } + + ready_inst = true; + } else if (type == I_ALU && (ii->opType() == Enums::OT_BRANCH || + ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) || + ii->opType() == Enums::OT_KERN_READ || + ii->opType() == Enums::OT_ARG)) { + // Here for ALU instruction (all others) + if (!computeUnit->wfWait[simdId].prerdy()) { + // Is alu slot free? + return 0; + } + if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii, + VrfAccessType::RD_WR)) { + return 0; + } + + if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) { + return 0; + } + ready_inst = true; + } else if (type == I_GLOBAL && (IS_OT_READ_GM(ii->opType()) || + IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()))) { + // Here Global memory instruction + if (IS_OT_READ_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType())) { + // Are there in pipe or outstanding global memory write requests? + if ((outstanding_reqs_wr_gm + wr_gm_reqs_in_pipe) > 0) { + return 0; + } + } + + if (IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()) || + IS_OT_HIST_GM(ii->opType())) { + // Are there in pipe or outstanding global memory read requests? + if ((outstanding_reqs_rd_gm + rd_gm_reqs_in_pipe) > 0) + return 0; + } + + if (!glbMemIssueRdy) { + // Is WV issue slot free? + return 0; + } + + if (!glbMemBusRdy) { + // Is there an available VRF->Global memory read bus? + return 0; + } + + if (!computeUnit->globalMemoryPipe. + isGMReqFIFOWrRdy(rd_gm_reqs_in_pipe + wr_gm_reqs_in_pipe)) { + // Can we insert a new request to the Global Mem Request FIFO? + return 0; + } + // can we schedule source & destination operands on the VRF? 
+ if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii, + VrfAccessType::RD_WR)) { + return 0; + } + if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) { + return 0; + } + ready_inst = true; + } else if (type == I_SHARED && (IS_OT_READ_LM(ii->opType()) || + IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()))) { + // Here for Shared memory instruction + if (IS_OT_READ_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType())) { + if ((outstanding_reqs_wr_lm + wr_lm_reqs_in_pipe) > 0) { + return 0; + } + } + + if (IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()) || + IS_OT_HIST_LM(ii->opType())) { + if ((outstanding_reqs_rd_lm + rd_lm_reqs_in_pipe) > 0) { + return 0; + } + } + + if (!locMemBusRdy) { + // Is there an available VRF->LDS read bus? + return 0; + } + if (!locMemIssueRdy) { + // Is wave slot free? + return 0; + } + + if (!computeUnit->localMemoryPipe. + isLMReqFIFOWrRdy(rd_lm_reqs_in_pipe + wr_lm_reqs_in_pipe)) { + // Can we insert a new request to the LDS Request FIFO? + return 0; + } + // can we schedule source & destination operands on the VRF? + if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii, + VrfAccessType::RD_WR)) { + return 0; + } + if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) { + return 0; + } + ready_inst = true; + } else if (type == I_PRIVATE && (IS_OT_READ_PM(ii->opType()) || + IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()))) { + // Here for Private memory instruction ------------------------ // + if (IS_OT_READ_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType())) { + if ((outstanding_reqs_wr_gm + wr_gm_reqs_in_pipe) > 0) { + return 0; + } + } + + if (IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()) || + IS_OT_HIST_PM(ii->opType())) { + if ((outstanding_reqs_rd_gm + rd_gm_reqs_in_pipe) > 0) { + return 0; + } + } + + if (!glbMemBusRdy) { + // Is there an available VRF->Global memory read bus? + return 0; + } + + if (!glbMemIssueRdy) { + // Is wave slot free? 
+ return 0; + } + + if (!computeUnit->globalMemoryPipe. + isGMReqFIFOWrRdy(rd_gm_reqs_in_pipe + wr_gm_reqs_in_pipe)) { + // Can we insert a new request to the Global Mem Request FIFO? + return 0; + } + // can we schedule source & destination operands on the VRF? + if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii, + VrfAccessType::RD_WR)) { + return 0; + } + if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) { + return 0; + } + ready_inst = true; + } else if (type == I_FLAT && IS_OT_FLAT(ii->opType())) { + if (!glbMemBusRdy) { + // Is there an available VRF->Global memory read bus? + return 0; + } + + if (!locMemBusRdy) { + // Is there an available VRF->LDS read bus? + return 0; + } + + if (!glbMemIssueRdy) { + // Is wave slot free? + return 0; + } + + if (!locMemIssueRdy) { + return 0; + } + if (!computeUnit->globalMemoryPipe. + isGMReqFIFOWrRdy(rd_gm_reqs_in_pipe + wr_gm_reqs_in_pipe)) { + // Can we insert a new request to the Global Mem Request FIFO? + return 0; + } + + if (!computeUnit->localMemoryPipe. + isLMReqFIFOWrRdy(rd_lm_reqs_in_pipe + wr_lm_reqs_in_pipe)) { + // Can we insert a new request to the LDS Request FIFO? + return 0; + } + // can we schedule source & destination operands on the VRF? + if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii, + VrfAccessType::RD_WR)) { + return 0; + } + // are all the operands ready? (RAW, WAW and WAR depedencies met?) 
+ if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) { + return 0; + } + ready_inst = true; + } else { + return 0; + } + + assert(ready_inst); + + DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n", computeUnit->cu_id, + simdId, wfSlotId, ii->disassemble()); + + return 1; +} + +void +Wavefront::updateResources() +{ + // Get current instruction + GPUDynInstPtr ii = instructionBuffer.front(); + assert(ii); + computeUnit->vrf[simdId]->updateResources(this, ii); + // Single precision ALU or Branch or Return or Special instruction + if (ii->opType() == Enums::OT_ALU || ii->opType() == Enums::OT_SPECIAL || + ii->opType() == Enums::OT_BRANCH || IS_OT_LDAS(ii->opType()) || + // FIXME: Kernel argument loads are currently treated as ALU operations + // since we don't send memory packets at execution. If we fix that then + // we should map them to one of the memory pipelines + ii->opType()==Enums::OT_KERN_READ || + ii->opType()==Enums::OT_ARG || + ii->opType()==Enums::OT_RET) { + computeUnit->aluPipe[simdId].preset(computeUnit->shader-> + ticks(computeUnit->spBypassLength())); + // this is to enforce a fixed number of cycles per issue slot per SIMD + computeUnit->wfWait[simdId].preset(computeUnit->shader-> + ticks(computeUnit->issuePeriod)); + } else if (ii->opType() == Enums::OT_BARRIER) { + computeUnit->wfWait[simdId].preset(computeUnit->shader-> + ticks(computeUnit->issuePeriod)); + } else if (ii->opType() == Enums::OT_FLAT_READ) { + assert(Enums::SC_NONE != ii->executedAs()); + mem_reqs_in_pipe++; + rd_gm_reqs_in_pipe++; + if ( Enums::SC_SHARED == ii->executedAs() ) { + computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. + preset(computeUnit->shader->ticks(4)); + computeUnit->wfWait[computeUnit->ShrMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else { + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + preset(computeUnit->shader->ticks(4)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. 
+ preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } + } else if (ii->opType() == Enums::OT_FLAT_WRITE) { + assert(Enums::SC_NONE != ii->executedAs()); + mem_reqs_in_pipe++; + wr_gm_reqs_in_pipe++; + if (Enums::SC_SHARED == ii->executedAs()) { + computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. + preset(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->ShrMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else { + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + preset(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } + } else if (IS_OT_READ_GM(ii->opType())) { + mem_reqs_in_pipe++; + rd_gm_reqs_in_pipe++; + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + preset(computeUnit->shader->ticks(4)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_WRITE_GM(ii->opType())) { + mem_reqs_in_pipe++; + wr_gm_reqs_in_pipe++; + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + preset(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_ATOMIC_GM(ii->opType())) { + mem_reqs_in_pipe++; + wr_gm_reqs_in_pipe++; + rd_gm_reqs_in_pipe++; + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + preset(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_READ_LM(ii->opType())) { + mem_reqs_in_pipe++; + rd_lm_reqs_in_pipe++; + computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. + preset(computeUnit->shader->ticks(4)); + computeUnit->wfWait[computeUnit->ShrMemUnitId()]. 
+ preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_WRITE_LM(ii->opType())) { + mem_reqs_in_pipe++; + wr_lm_reqs_in_pipe++; + computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. + preset(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->ShrMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_ATOMIC_LM(ii->opType())) { + mem_reqs_in_pipe++; + wr_lm_reqs_in_pipe++; + rd_lm_reqs_in_pipe++; + computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. + preset(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->ShrMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_READ_PM(ii->opType())) { + mem_reqs_in_pipe++; + rd_gm_reqs_in_pipe++; + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + preset(computeUnit->shader->ticks(4)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_WRITE_PM(ii->opType())) { + mem_reqs_in_pipe++; + wr_gm_reqs_in_pipe++; + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + preset(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_ATOMIC_PM(ii->opType())) { + mem_reqs_in_pipe++; + wr_gm_reqs_in_pipe++; + rd_gm_reqs_in_pipe++; + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + preset(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. 
+ preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } +} + +void +Wavefront::exec() +{ + // ---- Exit if wavefront is inactive ----------------------------- // + + if (status == S_STOPPED || status == S_RETURNING || + instructionBuffer.empty()) { + return; + } + + // Get current instruction + + GPUDynInstPtr ii = instructionBuffer.front(); + + const uint32_t old_pc = pc(); + DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s " + "(pc: %i)\n", computeUnit->cu_id, simdId, wfSlotId, wfDynId, + ii->disassemble(), old_pc); + ii->execute(); + // access the VRF + computeUnit->vrf[simdId]->exec(ii, this); + srcRegOpDist.sample(ii->numSrcRegOperands()); + dstRegOpDist.sample(ii->numDstRegOperands()); + computeUnit->numInstrExecuted++; + computeUnit->execRateDist.sample(computeUnit->totalCycles.value() - + computeUnit->lastExecCycle[simdId]); + computeUnit->lastExecCycle[simdId] = computeUnit->totalCycles.value(); + if (pc() == old_pc) { + uint32_t new_pc = old_pc + 1; + // PC not modified by instruction, proceed to next or pop frame + pc(new_pc); + if (new_pc == rpc()) { + popFromReconvergenceStack(); + discardFetch(); + } else { + instructionBuffer.pop_front(); + } + } + + if (computeUnit->shader->hsail_mode==Shader::SIMT) { + const int num_active_lanes = execMask().count(); + computeUnit->controlFlowDivergenceDist.sample(num_active_lanes); + computeUnit->numVecOpsExecuted += num_active_lanes; + if (isGmInstruction(ii)) { + computeUnit->activeLanesPerGMemInstrDist.sample(num_active_lanes); + } else if (isLmInstruction(ii)) { + computeUnit->activeLanesPerLMemInstrDist.sample(num_active_lanes); + } + } + + // ---- Update Vector ALU pipeline and other resources ------------------ // + // Single precision ALU or Branch or Return or Special instruction + if (ii->opType() == Enums::OT_ALU || ii->opType() == Enums::OT_SPECIAL || + ii->opType() == Enums::OT_BRANCH || IS_OT_LDAS(ii->opType()) || + // FIXME: Kernel argument loads are currently treated as 
ALU operations + // since we don't send memory packets at execution. If we fix that then + // we should map them to one of the memory pipelines + ii->opType() == Enums::OT_KERN_READ || + ii->opType() == Enums::OT_ARG || + ii->opType() == Enums::OT_RET) { + computeUnit->aluPipe[simdId].set(computeUnit->shader-> + ticks(computeUnit->spBypassLength())); + + // this is to enforce a fixed number of cycles per issue slot per SIMD + computeUnit->wfWait[simdId].set(computeUnit->shader-> + ticks(computeUnit->issuePeriod)); + } else if (ii->opType() == Enums::OT_BARRIER) { + computeUnit->wfWait[simdId].set(computeUnit->shader-> + ticks(computeUnit->issuePeriod)); + } else if (ii->opType() == Enums::OT_FLAT_READ) { + assert(Enums::SC_NONE != ii->executedAs()); + + if (Enums::SC_SHARED == ii->executedAs()) { + computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. + set(computeUnit->shader->ticks(4)); + computeUnit->wfWait[computeUnit->ShrMemUnitId()]. + set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else { + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + set(computeUnit->shader->ticks(4)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } + } else if (ii->opType() == Enums::OT_FLAT_WRITE) { + assert(Enums::SC_NONE != ii->executedAs()); + if (Enums::SC_SHARED == ii->executedAs()) { + computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. + set(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->ShrMemUnitId()]. + set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else { + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + set(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } + } else if (IS_OT_READ_GM(ii->opType())) { + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. 
+ set(computeUnit->shader->ticks(4)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_WRITE_GM(ii->opType())) { + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + set(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_ATOMIC_GM(ii->opType())) { + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + set(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_READ_LM(ii->opType())) { + computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. + set(computeUnit->shader->ticks(4)); + computeUnit->wfWait[computeUnit->ShrMemUnitId()]. + set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_WRITE_LM(ii->opType())) { + computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. + set(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->ShrMemUnitId()]. + set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_ATOMIC_LM(ii->opType())) { + computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. + set(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->ShrMemUnitId()]. 
+ set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } +} + +bool +Wavefront::waitingAtBarrier(int lane) +{ + return bar_cnt[lane] < max_bar_cnt; +} + +void +Wavefront::pushToReconvergenceStack(uint32_t pc, uint32_t rpc, + const VectorMask& mask) +{ + assert(mask.count()); + reconvergenceStack.emplace(new ReconvergenceStackEntry(pc, rpc, mask)); +} + +void +Wavefront::popFromReconvergenceStack() +{ + assert(!reconvergenceStack.empty()); + + DPRINTF(WavefrontStack, "[%2d, %2d, %2d, %2d] %s %3i => ", + computeUnit->cu_id, simdId, wfSlotId, wfDynId, + execMask().to_string().c_str(), pc()); + + reconvergenceStack.pop(); + + DPRINTF(WavefrontStack, "%3i %s\n", pc(), + execMask().to_string().c_str()); + +} + +void +Wavefront::discardFetch() +{ + instructionBuffer.clear(); + dropFetch |=pendingFetch; +} + +uint32_t +Wavefront::pc() const +{ + return reconvergenceStack.top()->pc; +} + +uint32_t +Wavefront::rpc() const +{ + return reconvergenceStack.top()->rpc; +} + +VectorMask +Wavefront::execMask() const +{ + return reconvergenceStack.top()->execMask; +} + +bool +Wavefront::execMask(int lane) const +{ + return reconvergenceStack.top()->execMask[lane]; +} + + +void +Wavefront::pc(uint32_t new_pc) +{ + reconvergenceStack.top()->pc = new_pc; +} diff --git a/src/gpu-compute/wavefront.hh b/src/gpu-compute/wavefront.hh new file mode 100644 index 000000000..0abab8e83 --- /dev/null +++ b/src/gpu-compute/wavefront.hh @@ -0,0 +1,368 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Lisa Hsu + */ + +#ifndef __WAVEFRONT_HH__ +#define __WAVEFRONT_HH__ + +#include +#include +#include +#include +#include + +#include "base/misc.hh" +#include "base/types.hh" +#include "gpu-compute/condition_register_state.hh" +#include "gpu-compute/lds_state.hh" +#include "gpu-compute/misc.hh" +#include "params/Wavefront.hh" +#include "sim/sim_object.hh" + +static const int MAX_NUM_INSTS_PER_WF = 12; + +/* + * Arguments for the hsail opcode call, are user defined and variable length. + * The hardware/finalizer can support arguments in hardware or use memory to + * pass arguments. 
For now, let's assume that an unlimited number of arguments + * are supported in hardware (the compiler inlines functions whenver it can + * anyways, so unless someone is interested in the implications of linking/ + * library functions, I think this is a reasonable assumption given the typical + * size of an OpenCL kernel). + * + * Note that call args are different than kernel arguments: + * * All work-items in a kernel refer the same set of kernel arguments + * * Each work-item has it's on set of call args. So a call argument at + * address 0x4 is different for work-item 0 and work-item 1. + * + * Ok, the table below shows an example of how we organize the call arguments in + * the CallArgMem class. + * + * int foo(int arg1, double arg2) + * ___________________________________________________ + * | 0: return.0 | 4: return.1 | ... | 252: return.63 | + * |---------------------------------------------------| + * | 256: arg1.0 | 260: arg1.1 | ... | 508: arg1.63 | + * |---------------------------------------------------| + * | 512: arg2.0 | 520: arg2.1 | ... | 1016: arg2.63 | + * ___________________________________________________ + */ +class CallArgMem +{ + public: + // pointer to buffer for storing function arguments + uint8_t *mem; + // size of function args + int funcArgsSizePerItem; + + template + int + getLaneOffset(int lane, int addr) + { + return addr * VSZ + sizeof(CType) * lane; + } + + CallArgMem(int func_args_size_per_item) + : funcArgsSizePerItem(func_args_size_per_item) + { + mem = (uint8_t*)malloc(funcArgsSizePerItem * VSZ); + } + + ~CallArgMem() + { + free(mem); + } + + template + uint8_t* + getLaneAddr(int lane, int addr) + { + return mem + getLaneOffset(lane, addr); + } + + template + void + setLaneAddr(int lane, int addr, CType val) + { + *((CType*)(mem + getLaneOffset(lane, addr))) = val; + } +}; + +/** + * A reconvergence stack entry conveys the necessary state to implement + * control flow divergence. 
+ */ +class ReconvergenceStackEntry { + + public: + ReconvergenceStackEntry(uint32_t new_pc, uint32_t new_rpc, + VectorMask new_mask) : pc(new_pc), rpc(new_rpc), + execMask(new_mask) { + } + + /** + * PC of current instruction. + */ + uint32_t pc; + /** + * PC of the immediate post-dominator instruction, i.e., the value of + * @a pc for the first instruction that will be executed by the wavefront + * when a reconvergence point is reached. + */ + uint32_t rpc; + /** + * Execution mask. + */ + VectorMask execMask; +}; + +class Wavefront : public SimObject +{ + public: + enum itype_e {I_ALU,I_GLOBAL,I_SHARED,I_FLAT,I_PRIVATE}; + enum status_e {S_STOPPED,S_RETURNING,S_RUNNING}; + + // Base pointer for array of instruction pointers + uint64_t base_ptr; + + uint32_t old_barrier_cnt; + uint32_t barrier_cnt; + uint32_t barrier_id; + uint32_t barrier_slots; + status_e status; + // HW slot id where the WF is mapped to inside a SIMD unit + int wfSlotId; + int kern_id; + // SIMD unit where the WV has been scheduled + int simdId; + // pointer to parent CU + ComputeUnit *computeUnit; + + std::deque instructionBuffer; + + bool pendingFetch; + bool dropFetch; + + // Condition Register State (for HSAIL simulations only) + class ConditionRegisterState *condRegState; + // number of single precision VGPRs required by WF + uint32_t maxSpVgprs; + // number of double precision VGPRs required by WF + uint32_t maxDpVgprs; + // map virtual to physical vector register + uint32_t remap(uint32_t vgprIndex, uint32_t size, uint8_t mode=0); + void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs); + bool isGmInstruction(GPUDynInstPtr ii); + bool isLmInstruction(GPUDynInstPtr ii); + bool isOldestInstGMem(); + bool isOldestInstLMem(); + bool isOldestInstPrivMem(); + bool isOldestInstFlatMem(); + bool isOldestInstALU(); + bool isOldestInstBarrier(); + // used for passing spill address to DDInstGPU + uint64_t last_addr[VSZ]; + uint32_t workitemid[3][VSZ]; + uint32_t workitemFlatId[VSZ]; + 
uint32_t workgroupid[3]; + uint32_t workgroupsz[3]; + uint32_t gridsz[3]; + uint32_t wg_id; + uint32_t wg_sz; + uint32_t dynwaveid; + uint32_t maxdynwaveid; + uint32_t dispatchid; + // outstanding global+local memory requests + uint32_t outstanding_reqs; + // memory requests between scoreboard + // and execute stage not yet executed + uint32_t mem_reqs_in_pipe; + // outstanding global memory write requests + uint32_t outstanding_reqs_wr_gm; + // outstanding local memory write requests + uint32_t outstanding_reqs_wr_lm; + // outstanding global memory read requests + uint32_t outstanding_reqs_rd_gm; + // outstanding local memory read requests + uint32_t outstanding_reqs_rd_lm; + uint32_t rd_lm_reqs_in_pipe; + uint32_t rd_gm_reqs_in_pipe; + uint32_t wr_lm_reqs_in_pipe; + uint32_t wr_gm_reqs_in_pipe; + + int mem_trace_busy; + uint64_t last_trace; + // number of vector registers reserved by WF + int reservedVectorRegs; + // Index into the Vector Register File's namespace where the WF's registers + // will live while the WF is executed + uint32_t startVgprIndex; + + // Old value of destination gpr (for trace) + uint32_t old_vgpr[VSZ]; + // Id of destination gpr (for trace) + uint32_t old_vgpr_id; + // Tick count of last old_vgpr copy + uint64_t old_vgpr_tcnt; + + // Old value of destination gpr (for trace) + uint64_t old_dgpr[VSZ]; + // Id of destination gpr (for trace) + uint32_t old_dgpr_id; + // Tick count of last old_vgpr copy + uint64_t old_dgpr_tcnt; + + // Execution mask at wavefront start + VectorMask init_mask; + + // number of barriers this WF has joined + int bar_cnt[VSZ]; + int max_bar_cnt; + // Flag to stall a wave on barrier + bool stalledAtBarrier; + + // a pointer to the fraction of the LDS allocated + // to this workgroup (thus this wavefront) + LdsChunk *ldsChunk; + + // A pointer to the spill area + Addr spillBase; + // The size of the spill area + uint32_t spillSizePerItem; + // The vector width of the spill area + uint32_t spillWidth; + + // A 
pointer to the private memory area + Addr privBase; + // The size of the private memory area + uint32_t privSizePerItem; + + // A pointer ot the read-only memory area + Addr roBase; + // size of the read-only memory area + uint32_t roSize; + + // pointer to buffer for storing kernel arguments + uint8_t *kernelArgs; + // unique WF id over all WFs executed across all CUs + uint64_t wfDynId; + + // number of times instruction issue for this wavefront is blocked + // due to VRF port availability + Stats::Scalar numTimesBlockedDueVrfPortAvail; + // number of times an instruction of a WF is blocked from being issued + // due to WAR and WAW dependencies + Stats::Scalar numTimesBlockedDueWAXDependencies; + // number of times an instruction of a WF is blocked from being issued + // due to WAR and WAW dependencies + Stats::Scalar numTimesBlockedDueRAWDependencies; + // distribution of executed instructions based on their register + // operands; this is used to highlight the load on the VRF + Stats::Distribution srcRegOpDist; + Stats::Distribution dstRegOpDist; + + // Functions to operate on call argument memory + // argument memory for hsail call instruction + CallArgMem *callArgMem; + void + initCallArgMem(int func_args_size_per_item) + { + callArgMem = new CallArgMem(func_args_size_per_item); + } + + template + CType + readCallArgMem(int lane, int addr) + { + return *((CType*)(callArgMem->getLaneAddr(lane, addr))); + } + + template + void + writeCallArgMem(int lane, int addr, CType val) + { + callArgMem->setLaneAddr(lane, addr, val); + } + + typedef WavefrontParams Params; + Wavefront(const Params *p); + ~Wavefront(); + virtual void init(); + + void + setParent(ComputeUnit *cu) + { + computeUnit = cu; + } + + void start(uint64_t _wfDynId, uint64_t _base_ptr); + + void exec(); + void updateResources(); + int ready(itype_e type); + bool instructionBufferHasBranch(); + void regStats(); + VectorMask get_pred() { return execMask() & init_mask; } + + bool waitingAtBarrier(int 
lane); + + void pushToReconvergenceStack(uint32_t pc, uint32_t rpc, + const VectorMask& exec_mask); + + void popFromReconvergenceStack(); + + uint32_t pc() const; + + uint32_t rpc() const; + + VectorMask execMask() const; + + bool execMask(int lane) const; + + void pc(uint32_t new_pc); + + void discardFetch(); + + private: + /** + * Stack containing Control Flow Graph nodes (i.e., kernel instructions) + * to be visited by the wavefront, and the associated execution masks. The + * reconvergence stack grows every time the wavefront reaches a divergence + * point (branch instruction), and shrinks every time the wavefront + * reaches a reconvergence point (immediate post-dominator instruction). + */ + std::stack> reconvergenceStack; +}; + +#endif // __WAVEFRONT_HH__ -- cgit v1.2.3