author     Tony Gutierrez <anthony.gutierrez@amd.com>  2016-01-19 14:28:22 -0500
committer  Tony Gutierrez <anthony.gutierrez@amd.com>  2016-01-19 14:28:22 -0500
commit     1a7d3f9fcb76a68540dd948f91413533a383bfde (patch)
tree       867510a147cd095f19499d26b7c02d27de4cae9d /src/gpu-compute/GPU.py
parent     28e353e0403ea379d244a418e8dc8ee0b48187cf (diff)
gpu-compute: AMD's baseline GPU model
Diffstat (limited to 'src/gpu-compute/GPU.py')
-rw-r--r--  src/gpu-compute/GPU.py  310
1 file changed, 310 insertions(+), 0 deletions(-)
diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py
new file mode 100644
index 000000000..bd95f6335
--- /dev/null
+++ b/src/gpu-compute/GPU.py
@@ -0,0 +1,310 @@
+#
+# Copyright (c) 2015 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Author: Steve Reinhardt
+#
+
+from ClockedObject import ClockedObject
+from Device import DmaDevice
+from m5.defines import buildEnv
+from m5.params import *
+from m5.proxy import *
+from m5.SimObject import SimObject
+from MemObject import MemObject
+from Process import EmulatedDriver
+from Bridge import Bridge
+from LdsState import LdsState
+
+class PrefetchType(Enum): vals = [
+ 'PF_CU',
+ 'PF_PHASE',
+ 'PF_WF',
+ 'PF_STRIDE',
+ 'PF_END',
+ ]
+
+class VectorRegisterFile(SimObject):
+ type = 'VectorRegisterFile'
+ cxx_class = 'VectorRegisterFile'
+ cxx_header = 'gpu-compute/vector_register_file.hh'
+
+ simd_id = Param.Int(0, 'SIMD ID associated with this VRF')
+ num_regs_per_simd = Param.Int(2048, 'number of vector registers per SIMD')
+ min_alloc = Param.Int(4, 'min number of VGPRs allocated per WF')
+
+class Wavefront(SimObject):
+ type = 'Wavefront'
+ cxx_class = 'Wavefront'
+ cxx_header = 'gpu-compute/wavefront.hh'
+
+ simdId = Param.Int('SIMD id (0-ComputeUnit.num_SIMDs)')
+ wf_slot_id = Param.Int('wavefront id (0-ComputeUnit.max_wfs)')
+
+class ComputeUnit(MemObject):
+ type = 'ComputeUnit'
+ cxx_class = 'ComputeUnit'
+ cxx_header = 'gpu-compute/compute_unit.hh'
+
+    wavefronts = VectorParam.Wavefront('Wavefronts belonging to this CU')
+ wfSize = Param.Int(64, 'Wavefront size (in work items)')
+ num_SIMDs = Param.Int(4, 'number of SIMD units per CU')
+
+ spbypass_pipe_length = Param.Int(4, 'vector ALU Single Precision bypass '\
+ 'latency')
+
+ dpbypass_pipe_length = Param.Int(8, 'vector ALU Double Precision bypass '\
+ 'latency')
+
+ issue_period = Param.Int(4, 'number of cycles per issue period')
+    num_global_mem_pipes = Param.Int(1, 'number of global memory pipes per CU')
+    num_shared_mem_pipes = Param.Int(1, 'number of shared memory pipes per CU')
+ n_wf = Param.Int(1, 'Number of wavefront slots per SIMD')
+ mem_req_latency = Param.Int(9, "Latency for request from the cu to ruby. "\
+ "Represents the pipeline to reach the TCP and "\
+ "specified in GPU clock cycles")
+ mem_resp_latency = Param.Int(9, "Latency for responses from ruby to the "\
+ "cu. Represents the pipeline between the TCP "\
+ "and cu as well as TCP data array access. "\
+ "Specified in GPU clock cycles")
+ system = Param.System(Parent.any, "system object")
+ cu_id = Param.Int('CU id')
+ vrf_to_coalescer_bus_width = Param.Int(32, "VRF->Coalescer data bus width "\
+ "in bytes")
+ coalescer_to_vrf_bus_width = Param.Int(32, "Coalescer->VRF data bus width "\
+ "in bytes")
+
+ memory_port = VectorMasterPort("Port to the memory system")
+ translation_port = VectorMasterPort('Port to the TLB hierarchy')
+    sqc_port = MasterPort("Port to the SQC (I-cache)")
+ sqc_tlb_port = MasterPort("Port to the TLB for the SQC (I-cache)")
+ perLaneTLB = Param.Bool(False, "enable per-lane TLB")
+    prefetch_depth = Param.Int(0, "Number of prefetches triggered at a time "\
+                                  "(0 turns off prefetching)")
+ prefetch_stride = Param.Int(1, "Fixed Prefetch Stride (1 means next-page)")
+ prefetch_prev_type = Param.PrefetchType('PF_PHASE', "Prefetch the stride "\
+ "from last mem req in lane of "\
+ "CU|Phase|Wavefront")
+    execPolicy = Param.String("OLDEST-FIRST", "WF execution selection policy")
+    xactCasMode = Param.Bool(False, "Behavior of xact_cas_load magic instr.")
+ debugSegFault = Param.Bool(False, "enable debugging GPU seg faults")
+ functionalTLB = Param.Bool(False, "Assume TLB causes no delay")
+
+ localMemBarrier = Param.Bool(False, "Assume Barriers do not wait on "\
+ "kernel end")
+
+    countPages = Param.Bool(False, "Generate a per-CU file of all pages "\
+                            "touched and how many times each was touched")
+ global_mem_queue_size = Param.Int(256, "Number of entries in the global "
+ "memory pipeline's queues")
+ local_mem_queue_size = Param.Int(256, "Number of entries in the local "
+ "memory pipeline's queues")
+ ldsBus = Bridge() # the bridge between the CU and its LDS
+ ldsPort = MasterPort("The port that goes to the LDS")
+ localDataStore = Param.LdsState("the LDS for this CU")
+
+ vector_register_file = VectorParam.VectorRegisterFile("Vector register "\
+ "file")
+
+class Shader(ClockedObject):
+ type = 'Shader'
+ cxx_class = 'Shader'
+ cxx_header = 'gpu-compute/shader.hh'
+
+    CUs = VectorParam.ComputeUnit('Compute units in this shader')
+ n_wf = Param.Int(1, 'Number of wavefront slots per SIMD')
+ impl_kern_boundary_sync = Param.Bool(True, """Insert acq/rel packets into
+ ruby at kernel boundaries""")
+ separate_acquire_release = Param.Bool(False,
+ """Do ld_acquire/st_release generate separate requests for the
+ acquire and release?""")
+ globalmem = Param.MemorySize('64kB', 'Memory size')
+ timing = Param.Bool(False, 'timing memory accesses')
+
+ cpu_pointer = Param.BaseCPU(NULL, "pointer to base CPU")
+    translation = Param.Bool(False, "address translation")
+
+class ClDriver(EmulatedDriver):
+ type = 'ClDriver'
+ cxx_header = 'gpu-compute/cl_driver.hh'
+ codefile = VectorParam.String('code file name(s)')
+
+class GpuDispatcher(DmaDevice):
+ type = 'GpuDispatcher'
+ cxx_header = 'gpu-compute/dispatcher.hh'
+ # put at 8GB line for now
+ pio_addr = Param.Addr(0x200000000, "Device Address")
+ pio_latency = Param.Latency('1ns', "Programmed IO latency")
+ shader_pointer = Param.Shader('pointer to shader')
+ translation_port = MasterPort('Port to the dispatcher TLB')
+ cpu = Param.BaseCPU("CPU to wake up on kernel completion")
+
+ cl_driver = Param.ClDriver('pointer to driver')
+
+class OpType(Enum): vals = [
+ 'OT_NULL',
+ 'OT_ALU',
+ 'OT_SPECIAL',
+ 'OT_GLOBAL_READ',
+ 'OT_GLOBAL_WRITE',
+ 'OT_GLOBAL_ATOMIC',
+ 'OT_GLOBAL_HIST',
+ 'OT_GLOBAL_LDAS',
+ 'OT_SHARED_READ',
+ 'OT_SHARED_WRITE',
+ 'OT_SHARED_ATOMIC',
+ 'OT_SHARED_HIST',
+ 'OT_SHARED_LDAS',
+ 'OT_PRIVATE_READ',
+ 'OT_PRIVATE_WRITE',
+ 'OT_PRIVATE_ATOMIC',
+ 'OT_PRIVATE_HIST',
+ 'OT_PRIVATE_LDAS',
+ 'OT_SPILL_READ',
+ 'OT_SPILL_WRITE',
+ 'OT_SPILL_ATOMIC',
+ 'OT_SPILL_HIST',
+ 'OT_SPILL_LDAS',
+ 'OT_READONLY_READ',
+ 'OT_READONLY_WRITE',
+ 'OT_READONLY_ATOMIC',
+ 'OT_READONLY_HIST',
+ 'OT_READONLY_LDAS',
+ 'OT_FLAT_READ',
+ 'OT_FLAT_WRITE',
+ 'OT_FLAT_ATOMIC',
+ 'OT_FLAT_HIST',
+ 'OT_FLAT_LDAS',
+ 'OT_KERN_READ',
+ 'OT_BRANCH',
+
+    # Note: only OT_BOTH_MEMFENCE appears to be supported in the 1.0F
+    # version of the compiler.
+ 'OT_SHARED_MEMFENCE',
+ 'OT_GLOBAL_MEMFENCE',
+ 'OT_BOTH_MEMFENCE',
+
+ 'OT_BARRIER',
+ 'OT_PRINT',
+ 'OT_RET',
+ 'OT_NOP',
+ 'OT_ARG'
+ ]
+
+class MemType(Enum): vals = [
+ 'M_U8',
+ 'M_U16',
+ 'M_U32',
+ 'M_U64',
+ 'M_S8',
+ 'M_S16',
+ 'M_S32',
+ 'M_S64',
+ 'M_F16',
+ 'M_F32',
+ 'M_F64',
+ ]
+
+class MemOpType(Enum): vals = [
+ 'MO_LD',
+ 'MO_ST',
+ 'MO_LDAS',
+ 'MO_LDA',
+ 'MO_AAND',
+ 'MO_AOR',
+ 'MO_AXOR',
+ 'MO_ACAS',
+ 'MO_AEXCH',
+ 'MO_AADD',
+ 'MO_ASUB',
+ 'MO_AINC',
+ 'MO_ADEC',
+ 'MO_AMAX',
+ 'MO_AMIN',
+ 'MO_ANRAND',
+ 'MO_ANROR',
+ 'MO_ANRXOR',
+ 'MO_ANRCAS',
+ 'MO_ANREXCH',
+ 'MO_ANRADD',
+ 'MO_ANRSUB',
+ 'MO_ANRINC',
+ 'MO_ANRDEC',
+ 'MO_ANRMAX',
+ 'MO_ANRMIN',
+ 'MO_HAND',
+ 'MO_HOR',
+ 'MO_HXOR',
+ 'MO_HCAS',
+ 'MO_HEXCH',
+ 'MO_HADD',
+ 'MO_HSUB',
+ 'MO_HINC',
+ 'MO_HDEC',
+ 'MO_HMAX',
+ 'MO_HMIN',
+ 'MO_UNDEF'
+ ]
+
+class StorageClassType(Enum): vals = [
+ 'SC_SPILL',
+ 'SC_GLOBAL',
+ 'SC_SHARED',
+ 'SC_PRIVATE',
+ 'SC_READONLY',
+ 'SC_KERNARG',
+ 'SC_NONE',
+ ]
+
+class RegisterType(Enum): vals = [
+ 'RT_VECTOR',
+ 'RT_SCALAR',
+ 'RT_CONDITION',
+ 'RT_HARDWARE',
+ 'RT_NONE',
+ ]
+
+class GenericMemoryOrder(Enum): vals = [
+ 'MEMORY_ORDER_NONE',
+ 'MEMORY_ORDER_RELAXED',
+ 'MEMORY_ORDER_SC_ACQUIRE',
+ 'MEMORY_ORDER_SC_RELEASE',
+ 'MEMORY_ORDER_SC_ACQUIRE_RELEASE',
+ ]
+
+class GenericMemoryScope(Enum): vals = [
+ 'MEMORY_SCOPE_NONE',
+ 'MEMORY_SCOPE_WORKITEM',
+ 'MEMORY_SCOPE_WAVEFRONT',
+ 'MEMORY_SCOPE_WORKGROUP',
+ 'MEMORY_SCOPE_DEVICE',
+ 'MEMORY_SCOPE_SYSTEM',
+ ]
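
For orientation, here is a minimal sketch of how the SimObjects declared in this file might be composed in a gem5 configuration script. It is not part of this patch: the object counts are placeholders chosen to match the defaults above, and the port wiring, Ruby memory system, GpuDispatcher, and ClDriver setup that a full APU config also needs are omitted.

# Hypothetical usage sketch (not from this patch). Assumes a gem5 build that
# includes the GPU model, so these classes are visible via m5.objects.
from m5.objects import *

n_simds = 4       # matches the num_SIMDs default above
wfs_per_simd = 1  # matches the n_wf default above

# One Wavefront SimObject per (SIMD, wavefront-slot) pair.
wavefronts = [Wavefront(simdId=s, wf_slot_id=w)
              for s in range(n_simds)
              for w in range(wfs_per_simd)]

# One vector register file per SIMD unit.
vrfs = [VectorRegisterFile(simd_id=s) for s in range(n_simds)]

# The CU owns its wavefronts, VRFs, and LDS; its memory, TLB, SQC, and LDS
# ports would be connected elsewhere in the config.
cu = ComputeUnit(cu_id=0,
                 num_SIMDs=n_simds,
                 n_wf=wfs_per_simd,
                 wavefronts=wavefronts,
                 vector_register_file=vrfs,
                 localDataStore=LdsState())

# The shader aggregates the compute units.
shader = Shader(CUs=[cu], n_wf=wfs_per_simd, timing=True)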