diff options
author | Tony Gutierrez <anthony.gutierrez@amd.com> | 2016-10-26 22:47:11 -0400 |
---|---|---|
committer | Tony Gutierrez <anthony.gutierrez@amd.com> | 2016-10-26 22:47:11 -0400 |
commit | 7ac38849abaf6aeccf39137bc8acb9e44d192e82 (patch) | |
tree | 7658e9d741604b310f871756cf051558b30e115e /src/gpu-compute | |
parent | e1ad8035a379cea98ecef92e78d2894f60b2eedd (diff) | |
download | gem5-7ac38849abaf6aeccf39137bc8acb9e44d192e82.tar.xz |
gpu-compute: remove inst enums and use bit flag for attributes
this patch removes the GPUStaticInst enums that were defined in GPU.py.
instead, a simple set of attribute flags that can be set in the base
instruction class is used. this will help unify the attributes of HSAIL
and machine ISA instructions within the model itself.
because the static instruction now carries the attributes, a GPUDynInst
must carry a pointer to a valid GPUStaticInst so a new static kernel launch
instruction is added, which carries the attributes needed to perform
the kernel launch.
Diffstat (limited to 'src/gpu-compute')
-rw-r--r-- | src/gpu-compute/GPU.py | 108 | ||||
-rw-r--r-- | src/gpu-compute/GPUStaticInstFlags.py | 111 | ||||
-rw-r--r-- | src/gpu-compute/SConscript | 1 | ||||
-rw-r--r-- | src/gpu-compute/code_enums.hh | 116 | ||||
-rw-r--r-- | src/gpu-compute/compute_unit.cc | 26 | ||||
-rw-r--r-- | src/gpu-compute/compute_unit.hh | 1 | ||||
-rw-r--r-- | src/gpu-compute/global_memory_pipeline.cc | 23 | ||||
-rw-r--r-- | src/gpu-compute/gpu_dyn_inst.cc | 382 | ||||
-rw-r--r-- | src/gpu-compute/gpu_dyn_inst.hh | 219 | ||||
-rw-r--r-- | src/gpu-compute/gpu_static_inst.cc | 6 | ||||
-rw-r--r-- | src/gpu-compute/gpu_static_inst.hh | 167 | ||||
-rw-r--r-- | src/gpu-compute/kernel_cfg.cc | 10 | ||||
-rw-r--r-- | src/gpu-compute/lds_state.cc | 7 | ||||
-rw-r--r-- | src/gpu-compute/lds_state.hh | 1 | ||||
-rw-r--r-- | src/gpu-compute/local_memory_pipeline.cc | 9 | ||||
-rw-r--r-- | src/gpu-compute/shader.hh | 1 | ||||
-rw-r--r-- | src/gpu-compute/vector_register_file.cc | 5 | ||||
-rw-r--r-- | src/gpu-compute/wavefront.cc | 207 |
18 files changed, 802 insertions, 598 deletions
diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py index f580a09f7..b672f616c 100644 --- a/src/gpu-compute/GPU.py +++ b/src/gpu-compute/GPU.py @@ -171,56 +171,6 @@ class GpuDispatcher(DmaDevice): cl_driver = Param.ClDriver('pointer to driver') -class OpType(Enum): vals = [ - 'OT_NULL', - 'OT_ALU', - 'OT_SPECIAL', - 'OT_GLOBAL_READ', - 'OT_GLOBAL_WRITE', - 'OT_GLOBAL_ATOMIC', - 'OT_GLOBAL_HIST', - 'OT_GLOBAL_LDAS', - 'OT_SHARED_READ', - 'OT_SHARED_WRITE', - 'OT_SHARED_ATOMIC', - 'OT_SHARED_HIST', - 'OT_SHARED_LDAS', - 'OT_PRIVATE_READ', - 'OT_PRIVATE_WRITE', - 'OT_PRIVATE_ATOMIC', - 'OT_PRIVATE_HIST', - 'OT_PRIVATE_LDAS', - 'OT_SPILL_READ', - 'OT_SPILL_WRITE', - 'OT_SPILL_ATOMIC', - 'OT_SPILL_HIST', - 'OT_SPILL_LDAS', - 'OT_READONLY_READ', - 'OT_READONLY_WRITE', - 'OT_READONLY_ATOMIC', - 'OT_READONLY_HIST', - 'OT_READONLY_LDAS', - 'OT_FLAT_READ', - 'OT_FLAT_WRITE', - 'OT_FLAT_ATOMIC', - 'OT_FLAT_HIST', - 'OT_FLAT_LDAS', - 'OT_KERN_READ', - 'OT_BRANCH', - - # note: Only the OT_BOTH_MEMFENCE seems to be supported in the 1.0F version - # of the compiler. 
- 'OT_SHARED_MEMFENCE', - 'OT_GLOBAL_MEMFENCE', - 'OT_BOTH_MEMFENCE', - - 'OT_BARRIER', - 'OT_PRINT', - 'OT_RET', - 'OT_NOP', - 'OT_ARG' - ] - class MemType(Enum): vals = [ 'M_U8', 'M_U16', @@ -235,47 +185,6 @@ class MemType(Enum): vals = [ 'M_F64', ] -class MemOpType(Enum): vals = [ - 'MO_LD', - 'MO_ST', - 'MO_LDAS', - 'MO_LDA', - 'MO_AAND', - 'MO_AOR', - 'MO_AXOR', - 'MO_ACAS', - 'MO_AEXCH', - 'MO_AADD', - 'MO_ASUB', - 'MO_AINC', - 'MO_ADEC', - 'MO_AMAX', - 'MO_AMIN', - 'MO_ANRAND', - 'MO_ANROR', - 'MO_ANRXOR', - 'MO_ANRCAS', - 'MO_ANREXCH', - 'MO_ANRADD', - 'MO_ANRSUB', - 'MO_ANRINC', - 'MO_ANRDEC', - 'MO_ANRMAX', - 'MO_ANRMIN', - 'MO_HAND', - 'MO_HOR', - 'MO_HXOR', - 'MO_HCAS', - 'MO_HEXCH', - 'MO_HADD', - 'MO_HSUB', - 'MO_HINC', - 'MO_HDEC', - 'MO_HMAX', - 'MO_HMIN', - 'MO_UNDEF' - ] - class StorageClassType(Enum): vals = [ 'SC_SPILL', 'SC_GLOBAL', @@ -293,20 +202,3 @@ class RegisterType(Enum): vals = [ 'RT_HARDWARE', 'RT_NONE', ] - -class GenericMemoryOrder(Enum): vals = [ - 'MEMORY_ORDER_NONE', - 'MEMORY_ORDER_RELAXED', - 'MEMORY_ORDER_SC_ACQUIRE', - 'MEMORY_ORDER_SC_RELEASE', - 'MEMORY_ORDER_SC_ACQUIRE_RELEASE', - ] - -class GenericMemoryScope(Enum): vals = [ - 'MEMORY_SCOPE_NONE', - 'MEMORY_SCOPE_WORKITEM', - 'MEMORY_SCOPE_WAVEFRONT', - 'MEMORY_SCOPE_WORKGROUP', - 'MEMORY_SCOPE_DEVICE', - 'MEMORY_SCOPE_SYSTEM', - ] diff --git a/src/gpu-compute/GPUStaticInstFlags.py b/src/gpu-compute/GPUStaticInstFlags.py new file mode 100644 index 000000000..453fdced2 --- /dev/null +++ b/src/gpu-compute/GPUStaticInstFlags.py @@ -0,0 +1,111 @@ +# Copyright (c) 2016 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
+# +# Authors: Anthony Gutierrez + +from m5.params import * + +class GPUStaticInstFlags(Enum): + wrapper_name = 'GPUStaticInstFlags' + wrapper_is_struct = True + enum_name = 'Flags' + + vals = [ + # Op types + 'ALU', # ALU op + 'Branch', # Branch instruction + 'Nop', # No-op (no effect at all) + 'Return', # Return instruction + 'UnconditionalJump', # + 'SpecialOp', # Special op + 'Waitcnt', # Is a waitcnt instruction + + # Memory ops + 'MemBarrier', # Barrier instruction + 'MemFence', # Memory fence instruction + 'MemoryRef', # References memory (load, store, or atomic) + 'Flat', # Flat memory op + 'Load', # Reads from memory + 'Store', # Writes to memory + + # Atomic ops + 'AtomicReturn', # Atomic instruction that returns data + 'AtomicNoReturn', # Atomic instruction that doesn't return data + + # Instruction attributes + 'Scalar', # A scalar (not vector) operation + 'ReadsSCC', # The instruction reads SCC + 'WritesSCC', # The instruction writes SCC + 'ReadsVCC', # The instruction reads VCC + 'WritesVCC', # The instruction writes VCC + + # Atomic OP types + 'AtomicAnd', + 'AtomicOr', + 'AtomicXor', + 'AtomicCAS', + 'AtomicExch', + 'AtomicAdd', + 'AtomicSub', + 'AtomicInc', + 'AtomicDec', + 'AtomicMax', + 'AtomicMin', + + # Memory order flags + 'RelaxedOrder', + 'Acquire', # Has acquire semantics + 'Release', # Has release semantics + 'AcquireRelease', # Has acquire and release semantics + 'NoOrder', # Has no ordering restrictions + + # Segment access flags + 'ArgSegment', # Accesses the arg segment + 'GlobalSegment', # Accesses global memory + 'GroupSegment', # Accesses local memory (LDS), aka shared memory + 'KernArgSegment', # Accesses the kernel argument segment + 'PrivateSegment', # Accesses the private segment + 'ReadOnlySegment', # Accesses read only memory + 'SpillSegment', # Accesses the spill segment + 'NoSegment', # Does not have an associated segment + + # Scope flags + 'WorkitemScope', + 'WavefrontScope', + 'WorkgroupScope', + 'DeviceScope', + 
'SystemScope', + 'NoScope', # Does not have an associated scope + + # Coherence flags + 'GloballyCoherent', # Coherent with other workitems on same device + 'SystemCoherent' # Coherent with a different device, or the host + ] diff --git a/src/gpu-compute/SConscript b/src/gpu-compute/SConscript index 88c1cf036..8cf1ed8cf 100644 --- a/src/gpu-compute/SConscript +++ b/src/gpu-compute/SConscript @@ -41,6 +41,7 @@ if not env['BUILD_GPU']: Return() SimObject('GPU.py') +SimObject('GPUStaticInstFlags.py') SimObject('LdsState.py') SimObject('X86GPUTLB.py') diff --git a/src/gpu-compute/code_enums.hh b/src/gpu-compute/code_enums.hh deleted file mode 100644 index 6cd9bfe26..000000000 --- a/src/gpu-compute/code_enums.hh +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Copyright (c) 2015 Advanced Micro Devices, Inc. - * All rights reserved. - * - * For use for simulation and test purposes only - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * - * Author: Anthony Gutierrez - */ - -#ifndef __CODE_ENUMS_HH__ -#define __CODE_ENUMS_HH__ - -#define IS_OT_GLOBAL(a) ((a)>=Enums::OT_GLOBAL_READ \ - && (a)<=Enums::OT_GLOBAL_LDAS) -#define IS_OT_SHARED(a) ((a)>=Enums::OT_SHARED_READ \ - && (a)<=Enums::OT_SHARED_LDAS) -#define IS_OT_PRIVATE(a) ((a)>=Enums::OT_PRIVATE_READ \ - && (a)<=Enums::OT_PRIVATE_LDAS) -#define IS_OT_SPILL(a) ((a)>=Enums::OT_SPILL_READ \ - && (a)<=Enums::OT_SPILL_LDAS) -#define IS_OT_READONLY(a) ((a)>=Enums::OT_READONLY_READ \ - && (a)<=Enums::OT_READONLY_LDAS) -#define IS_OT_FLAT(a) ((a)>=Enums::OT_FLAT_READ && (a)<=Enums::OT_FLAT_LDAS) - -#define IS_OT_LDAS(a) ((a)==Enums::OT_GLOBAL_LDAS||(a)==Enums::OT_SHARED_LDAS \ - ||(a)==Enums::OT_PRIVATE_LDAS||(a)==Enums::OT_SPILL_LDAS \ - ||(a)==Enums::OT_READONLY_LDAS||(a)==Enums::OT_FLAT_LDAS) - -#define IS_OT_READ(a) ((a)==Enums::OT_GLOBAL_READ||(a)==Enums::OT_SHARED_READ \ - ||(a)==Enums::OT_PRIVATE_READ||(a)==Enums::OT_SPILL_READ \ - ||(a)==Enums::OT_READONLY_READ||(a)==Enums::OT_FLAT_READ) - -#define IS_OT_READ_GM(a) \ - ((a)==Enums::OT_GLOBAL_READ||(a)==Enums::OT_SPILL_READ \ - ||(a)==Enums::OT_READONLY_READ) - -#define IS_OT_READ_LM(a) ((a)==Enums::OT_SHARED_READ) - -#define IS_OT_READ_RM(a) ((a)==Enums::OT_READONLY_READ) - -#define IS_OT_READ_PM(a) ((a)==Enums::OT_PRIVATE_READ) - -#define IS_OT_WRITE(a) \ - ((a)==Enums::OT_GLOBAL_WRITE||(a)==Enums::OT_SHARED_WRITE \ - 
||(a)==Enums::OT_PRIVATE_WRITE||(a)==Enums::OT_SPILL_WRITE \ - ||(a)==Enums::OT_READONLY_WRITE||(a)==Enums::OT_FLAT_WRITE) - -#define IS_OT_WRITE_GM(a) \ - ((a)==Enums::OT_GLOBAL_WRITE||(a)==Enums::OT_SPILL_WRITE \ - ||(a)==Enums::OT_READONLY_WRITE) - -#define IS_OT_WRITE_LM(a) ((a)==Enums::OT_SHARED_WRITE) - -#define IS_OT_WRITE_PM(a) ((a)==Enums::OT_PRIVATE_WRITE) - -#define IS_OT_ATOMIC(a) ((a)==Enums::OT_GLOBAL_ATOMIC \ - ||(a)==Enums::OT_SHARED_ATOMIC \ - ||(a)==Enums::OT_PRIVATE_ATOMIC \ - ||(a)==Enums::OT_SPILL_ATOMIC \ - ||(a)==Enums::OT_READONLY_ATOMIC \ - ||(a)==Enums::OT_BOTH_MEMFENCE \ - ||(a)==Enums::OT_FLAT_ATOMIC) - -#define IS_OT_ATOMIC_GM(a) ((a)==Enums::OT_GLOBAL_ATOMIC \ - ||(a)==Enums::OT_SPILL_ATOMIC \ - ||(a)==Enums::OT_READONLY_ATOMIC \ - ||(a)==Enums::OT_GLOBAL_MEMFENCE \ - ||(a)==Enums::OT_BOTH_MEMFENCE) - -#define IS_OT_ATOMIC_LM(a) ((a)==Enums::OT_SHARED_ATOMIC \ - ||(a)==Enums::OT_SHARED_MEMFENCE) - -#define IS_OT_ATOMIC_PM(a) ((a)==Enums::OT_PRIVATE_ATOMIC) - -#define IS_OT_HIST(a) ((a)==Enums::OT_GLOBAL_HIST \ - ||(a)==Enums::OT_SHARED_HIST \ - ||(a)==Enums::OT_PRIVATE_HIST \ - ||(a)==Enums::OT_SPILL_HIST \ - ||(a)==Enums::OT_READONLY_HIST \ - ||(a)==Enums::OT_FLAT_HIST) - -#define IS_OT_HIST_GM(a) ((a)==Enums::OT_GLOBAL_HIST \ - ||(a)==Enums::OT_SPILL_HIST \ - ||(a)==Enums::OT_READONLY_HIST) - -#define IS_OT_HIST_LM(a) ((a)==Enums::OT_SHARED_HIST) - -#define IS_OT_HIST_PM(a) ((a)==Enums::OT_PRIVATE_HIST) - -#endif // __CODE_ENUMS_HH__ diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc index 97e018713..abf8ff2c5 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -75,7 +75,8 @@ ComputeUnit::ComputeUnit(const Params *p) : MemObject(p), fetchStage(p), req_tick_latency(p->mem_req_latency * p->clk_domain->clockPeriod()), resp_tick_latency(p->mem_resp_latency * p->clk_domain->clockPeriod()), _masterId(p->system->getMasterId(name() + ".ComputeUnit")), - lds(*p->localDataStore), 
globalSeqNum(0), wavefrontSize(p->wfSize) + lds(*p->localDataStore), globalSeqNum(0), wavefrontSize(p->wfSize), + kernelLaunchInst(new KernelLaunchStaticInst()) { /** * This check is necessary because std::bitset only provides conversion @@ -316,13 +317,11 @@ ComputeUnit::StartWorkgroup(NDRange *ndr) // Send L1 cache acquire // isKernel + isAcquire = Kernel Begin if (shader->impl_kern_boundary_sync) { - GPUDynInstPtr gpuDynInst = std::make_shared<GPUDynInst>(this, - nullptr, - nullptr, 0); + GPUDynInstPtr gpuDynInst = + std::make_shared<GPUDynInst>(this, nullptr, kernelLaunchInst, + getAndIncSeqNum()); gpuDynInst->useContinuation = false; - gpuDynInst->memoryOrder = Enums::MEMORY_ORDER_SC_ACQUIRE; - gpuDynInst->scope = Enums::MEMORY_SCOPE_SYSTEM; injectGlobalMemFence(gpuDynInst, true); } @@ -647,7 +646,7 @@ ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt) gpuDynInst->wfSlotId, w->barrierCnt); if (gpuDynInst->useContinuation) { - assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE); + assert(!gpuDynInst->isNoScope()); gpuDynInst->execContinuation(gpuDynInst->staticInstruction(), gpuDynInst); } @@ -658,7 +657,7 @@ ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt) return true; } else if (pkt->req->isKernel() && pkt->req->isAcquire()) { if (gpuDynInst->useContinuation) { - assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE); + assert(!gpuDynInst->isNoScope()); gpuDynInst->execContinuation(gpuDynInst->staticInstruction(), gpuDynInst); } @@ -942,6 +941,8 @@ void ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst, bool kernelLaunch, Request* req) { + assert(gpuDynInst->isGlobalSeg()); + if (!req) { req = new Request(0, 0, 0, 0, masterId(), 0, gpuDynInst->wfDynId); } @@ -950,8 +951,6 @@ ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst, bool kernelLaunch, req->setFlags(Request::KERNEL); } - gpuDynInst->s_type = SEG_GLOBAL; - // for non-kernel MemFence operations, memorder flags are set depending // on which type of request is currently being 
sent, so this // should be set by the caller (e.g. if an inst has acq-rel @@ -1033,8 +1032,7 @@ ComputeUnit::DataPort::MemRespEvent::process() if (gpuDynInst->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST) gpuDynInst->statusVector.clear(); - if (gpuDynInst->m_op == Enums::MO_LD || MO_A(gpuDynInst->m_op) - || MO_ANR(gpuDynInst->m_op)) { + if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) { assert(compute_unit->globalMemoryPipe.isGMLdRespFIFOWrRdy()); compute_unit->globalMemoryPipe.getGMLdRespFIFO() @@ -1055,7 +1053,7 @@ ComputeUnit::DataPort::MemRespEvent::process() // the continuation may generate more work for // this memory request if (gpuDynInst->useContinuation) { - assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE); + assert(!gpuDynInst->isNoScope()); gpuDynInst->execContinuation(gpuDynInst->staticInstruction(), gpuDynInst); } @@ -1065,7 +1063,7 @@ ComputeUnit::DataPort::MemRespEvent::process() gpuDynInst->statusBitVector = VectorMask(0); if (gpuDynInst->useContinuation) { - assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE); + assert(!gpuDynInst->isNoScope()); gpuDynInst->execContinuation(gpuDynInst->staticInstruction(), gpuDynInst); } diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh index a3547402a..938658fd1 100644 --- a/src/gpu-compute/compute_unit.hh +++ b/src/gpu-compute/compute_unit.hh @@ -744,6 +744,7 @@ class ComputeUnit : public MemObject private: uint64_t globalSeqNum; int wavefrontSize; + GPUStaticInst *kernelLaunchInst; }; #endif // __COMPUTE_UNIT_HH__ diff --git a/src/gpu-compute/global_memory_pipeline.cc b/src/gpu-compute/global_memory_pipeline.cc index 102905ec8..ab3e8c47e 100644 --- a/src/gpu-compute/global_memory_pipeline.cc +++ b/src/gpu-compute/global_memory_pipeline.cc @@ -67,7 +67,7 @@ GlobalMemPipeline::exec() bool accessVrf = true; // check the VRF to see if the operands of a load (or load component // of an atomic) are accessible - if ((m) && (m->m_op==Enums::MO_LD || MO_A(m->m_op))) { + if ((m) && 
(m->isLoad() || m->isAtomicRet())) { Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId]; accessVrf = @@ -127,10 +127,7 @@ GlobalMemPipeline::exec() // memory packets to DTLB if (!gmIssuedRequests.empty()) { GPUDynInstPtr mp = gmIssuedRequests.front(); - if (mp->m_op == Enums::MO_LD || - (mp->m_op >= Enums::MO_AAND && mp->m_op <= Enums::MO_AMIN) || - (mp->m_op >= Enums::MO_ANRAND && mp->m_op <= Enums::MO_ANRMIN)) { - + if (mp->isLoad() || mp->isAtomic()) { if (inflightLoads >= gmQueueSize) { return; } else { @@ -139,7 +136,7 @@ GlobalMemPipeline::exec() } else { if (inflightStores >= gmQueueSize) { return; - } else if (mp->m_op == Enums::MO_ST) { + } else if (mp->isStore()) { ++inflightStores; } } @@ -147,9 +144,8 @@ GlobalMemPipeline::exec() mp->initiateAcc(mp); gmIssuedRequests.pop(); - DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = %s\n", - computeUnit->cu_id, mp->simdId, mp->wfSlotId, - Enums::MemOpTypeStrings[mp->m_op]); + DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = \n", + computeUnit->cu_id, mp->simdId, mp->wfSlotId); } } @@ -160,12 +156,12 @@ GlobalMemPipeline::doGmReturn(GPUDynInstPtr m) Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId]; // Return data to registers - if (m->m_op == Enums::MO_LD || MO_A(m->m_op) || MO_ANR(m->m_op)) { + if (m->isLoad() || m->isAtomic()) { gmReturnedLoads.pop(); assert(inflightLoads > 0); --inflightLoads; - if (m->m_op == Enums::MO_LD || MO_A(m->m_op)) { + if (m->isLoad() || m->isAtomicRet()) { std::vector<uint32_t> regVec; // iterate over number of destination register operands since // this is a load or atomic operation @@ -214,13 +210,12 @@ GlobalMemPipeline::doGmReturn(GPUDynInstPtr m) // Decrement outstanding register count computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1); - if (m->m_op == Enums::MO_ST || MO_A(m->m_op) || MO_ANR(m->m_op) || - MO_H(m->m_op)) { + if (m->isStore() || m->isAtomic()) { computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrGm, m->time, -1); } - 
if (m->m_op == Enums::MO_LD || MO_A(m->m_op) || MO_ANR(m->m_op)) { + if (m->isLoad() || m->isAtomic()) { computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdGm, m->time, -1); } diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc index 1806e79e4..ec6340360 100644 --- a/src/gpu-compute/gpu_dyn_inst.cc +++ b/src/gpu-compute/gpu_dyn_inst.cc @@ -41,11 +41,10 @@ #include "gpu-compute/wavefront.hh" GPUDynInst::GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, - GPUStaticInst *_staticInst, uint64_t instSeqNum) + GPUStaticInst *static_inst, uint64_t instSeqNum) : GPUExecContext(_cu, _wf), addr(computeUnit()->wfSize(), (Addr)0), - m_op(Enums::MO_UNDEF), - memoryOrder(Enums::MEMORY_ORDER_NONE), n_reg(0), useContinuation(false), - statusBitVector(0), staticInst(_staticInst), _seqNum(instSeqNum) + n_reg(0), useContinuation(false), + statusBitVector(0), _staticInst(static_inst), _seqNum(instSeqNum) { tlbHitLevel.assign(computeUnit()->wfSize(), -1); d_data = new uint8_t[computeUnit()->wfSize() * 16]; @@ -68,77 +67,69 @@ GPUDynInst::~GPUDynInst() } void -GPUDynInst::execute() +GPUDynInst::execute(GPUDynInstPtr gpuDynInst) { - GPUDynInstPtr gpuDynInst = std::make_shared<GPUDynInst>(cu, wf, staticInst, - _seqNum); - staticInst->execute(gpuDynInst); + _staticInst->execute(gpuDynInst); } int GPUDynInst::numSrcRegOperands() { - return staticInst->numSrcRegOperands(); + return _staticInst->numSrcRegOperands(); } int GPUDynInst::numDstRegOperands() { - return staticInst->numDstRegOperands(); + return _staticInst->numDstRegOperands(); } int GPUDynInst::getNumOperands() { - return staticInst->getNumOperands(); + return _staticInst->getNumOperands(); } bool GPUDynInst::isVectorRegister(int operandIdx) { - return staticInst->isVectorRegister(operandIdx); + return _staticInst->isVectorRegister(operandIdx); } bool GPUDynInst::isScalarRegister(int operandIdx) { - return staticInst->isScalarRegister(operandIdx); + return _staticInst->isScalarRegister(operandIdx); } int 
GPUDynInst::getRegisterIndex(int operandIdx) { - return staticInst->getRegisterIndex(operandIdx); + return _staticInst->getRegisterIndex(operandIdx); } int GPUDynInst::getOperandSize(int operandIdx) { - return staticInst->getOperandSize(operandIdx); + return _staticInst->getOperandSize(operandIdx); } bool GPUDynInst::isDstOperand(int operandIdx) { - return staticInst->isDstOperand(operandIdx); + return _staticInst->isDstOperand(operandIdx); } bool GPUDynInst::isSrcOperand(int operandIdx) { - return staticInst->isSrcOperand(operandIdx); -} - -bool -GPUDynInst::isArgLoad() -{ - return staticInst->isArgLoad(); + return _staticInst->isSrcOperand(operandIdx); } const std::string& GPUDynInst::disassemble() const { - return staticInst->disassemble(); + return _staticInst->disassemble(); } uint64_t @@ -147,16 +138,10 @@ GPUDynInst::seqNum() const return _seqNum; } -Enums::OpType -GPUDynInst::opType() -{ - return staticInst->o_type; -} - Enums::StorageClassType GPUDynInst::executedAs() { - return staticInst->executed_as; + return _staticInst->executed_as; } // Process a memory instruction and (if necessary) submit timing request @@ -166,20 +151,347 @@ GPUDynInst::initiateAcc(GPUDynInstPtr gpuDynInst) DPRINTF(GPUMem, "CU%d: WF[%d][%d]: mempacket status bitvector=%#x\n", cu->cu_id, simdId, wfSlotId, exec_mask); - staticInst->initiateAcc(gpuDynInst); + _staticInst->initiateAcc(gpuDynInst); time = 0; } +/** + * accessor methods for the attributes of + * the underlying GPU static instruction + */ +bool +GPUDynInst::isALU() const +{ + return _staticInst->isALU(); +} + +bool +GPUDynInst::isBranch() const +{ + return _staticInst->isBranch(); +} + +bool +GPUDynInst::isNop() const +{ + return _staticInst->isNop(); +} + +bool +GPUDynInst::isReturn() const +{ + return _staticInst->isReturn(); +} + +bool +GPUDynInst::isUnconditionalJump() const +{ + return _staticInst->isUnconditionalJump(); +} + +bool +GPUDynInst::isSpecialOp() const +{ + return _staticInst->isSpecialOp(); +} + +bool 
+GPUDynInst::isWaitcnt() const +{ + return _staticInst->isWaitcnt(); +} + +bool +GPUDynInst::isBarrier() const +{ + return _staticInst->isBarrier(); +} + +bool +GPUDynInst::isMemFence() const +{ + return _staticInst->isMemFence(); +} + +bool +GPUDynInst::isMemRef() const +{ + return _staticInst->isMemRef(); +} + +bool +GPUDynInst::isFlat() const +{ + return _staticInst->isFlat(); +} + +bool +GPUDynInst::isLoad() const +{ + return _staticInst->isLoad(); +} + +bool +GPUDynInst::isStore() const +{ + return _staticInst->isStore(); +} + +bool +GPUDynInst::isAtomic() const +{ + return _staticInst->isAtomic(); +} + +bool +GPUDynInst::isAtomicNoRet() const +{ + return _staticInst->isAtomicNoRet(); +} + +bool +GPUDynInst::isAtomicRet() const +{ + return _staticInst->isAtomicRet(); +} + +bool +GPUDynInst::isScalar() const +{ + return _staticInst->isScalar(); +} + +bool +GPUDynInst::readsSCC() const +{ + return _staticInst->readsSCC(); +} + +bool +GPUDynInst::writesSCC() const +{ + return _staticInst->writesSCC(); +} + +bool +GPUDynInst::readsVCC() const +{ + return _staticInst->readsVCC(); +} + +bool +GPUDynInst::writesVCC() const +{ + return _staticInst->writesVCC(); +} + +bool +GPUDynInst::isAtomicAnd() const +{ + return _staticInst->isAtomicAnd(); +} + +bool +GPUDynInst::isAtomicOr() const +{ + return _staticInst->isAtomicOr(); +} + +bool +GPUDynInst::isAtomicXor() const +{ + return _staticInst->isAtomicXor(); +} + +bool +GPUDynInst::isAtomicCAS() const +{ + return _staticInst->isAtomicCAS(); +} + +bool GPUDynInst::isAtomicExch() const +{ + return _staticInst->isAtomicExch(); +} + +bool +GPUDynInst::isAtomicAdd() const +{ + return _staticInst->isAtomicAdd(); +} + +bool +GPUDynInst::isAtomicSub() const +{ + return _staticInst->isAtomicSub(); +} + +bool +GPUDynInst::isAtomicInc() const +{ + return _staticInst->isAtomicInc(); +} + +bool +GPUDynInst::isAtomicDec() const +{ + return _staticInst->isAtomicDec(); +} + +bool +GPUDynInst::isAtomicMax() const +{ + return 
_staticInst->isAtomicMax(); +} + +bool +GPUDynInst::isAtomicMin() const +{ + return _staticInst->isAtomicMin(); +} + +bool +GPUDynInst::isArgLoad() const +{ + return _staticInst->isArgLoad(); +} + +bool +GPUDynInst::isGlobalMem() const +{ + return _staticInst->isGlobalMem(); +} + +bool +GPUDynInst::isLocalMem() const +{ + return _staticInst->isLocalMem(); +} + +bool +GPUDynInst::isArgSeg() const +{ + return _staticInst->isArgSeg(); +} + +bool +GPUDynInst::isGlobalSeg() const +{ + return _staticInst->isGlobalSeg(); +} + +bool +GPUDynInst::isGroupSeg() const +{ + return _staticInst->isGroupSeg(); +} + +bool +GPUDynInst::isKernArgSeg() const +{ + return _staticInst->isKernArgSeg(); +} + +bool +GPUDynInst::isPrivateSeg() const +{ + return _staticInst->isPrivateSeg(); +} + +bool +GPUDynInst::isReadOnlySeg() const +{ + return _staticInst->isReadOnlySeg(); +} + +bool +GPUDynInst::isSpillSeg() const +{ + return _staticInst->isSpillSeg(); +} + +bool +GPUDynInst::isWorkitemScope() const +{ + return _staticInst->isWorkitemScope(); +} + +bool +GPUDynInst::isWavefrontScope() const +{ + return _staticInst->isWavefrontScope(); +} + +bool +GPUDynInst::isWorkgroupScope() const +{ + return _staticInst->isWorkgroupScope(); +} + +bool +GPUDynInst::isDeviceScope() const +{ + return _staticInst->isDeviceScope(); +} + +bool +GPUDynInst::isSystemScope() const +{ + return _staticInst->isSystemScope(); +} + +bool +GPUDynInst::isNoScope() const +{ + return _staticInst->isNoScope(); +} + +bool +GPUDynInst::isRelaxedOrder() const +{ + return _staticInst->isRelaxedOrder(); +} + +bool +GPUDynInst::isAcquire() const +{ + return _staticInst->isAcquire(); +} + +bool +GPUDynInst::isRelease() const +{ + return _staticInst->isRelease(); +} + +bool +GPUDynInst::isAcquireRelease() const +{ + return _staticInst->isAcquireRelease(); +} + +bool +GPUDynInst::isNoOrder() const +{ + return _staticInst->isNoOrder(); +} + +bool +GPUDynInst::isGloballyCoherent() const +{ + return 
_staticInst->isGloballyCoherent(); +} + bool -GPUDynInst::scalarOp() const +GPUDynInst::isSystemCoherent() const { - return staticInst->scalarOp(); + return _staticInst->isSystemCoherent(); } void GPUDynInst::updateStats() { - if (staticInst->isLocalMem()) { + if (_staticInst->isLocalMem()) { // access to LDS (shared) memory cu->dynamicLMemInstrCnt++; } else { diff --git a/src/gpu-compute/gpu_dyn_inst.hh b/src/gpu-compute/gpu_dyn_inst.hh index 46774d867..c07d85d78 100644 --- a/src/gpu-compute/gpu_dyn_inst.hh +++ b/src/gpu-compute/gpu_dyn_inst.hh @@ -39,11 +39,7 @@ #include <cstdint> #include <string> -#include "enums/GenericMemoryOrder.hh" -#include "enums/GenericMemoryScope.hh" -#include "enums/MemOpType.hh" #include "enums/MemType.hh" -#include "enums/OpType.hh" #include "enums/StorageClassType.hh" #include "gpu-compute/compute_unit.hh" #include "gpu-compute/gpu_exec_context.hh" @@ -180,33 +176,19 @@ class AtomicOpMin : public TypedAtomicOpFunctor<T> } }; -#define MO_A(a) ((a)>=Enums::MO_AAND && (a)<=Enums::MO_AMIN) -#define MO_ANR(a) ((a)>=Enums::MO_ANRAND && (a)<=Enums::MO_ANRMIN) -#define MO_H(a) ((a)>=Enums::MO_HAND && (a)<=Enums::MO_HMIN) - typedef enum { VT_32, VT_64, } vgpr_type; -typedef enum -{ - SEG_PRIVATE, - SEG_SPILL, - SEG_GLOBAL, - SEG_SHARED, - SEG_READONLY, - SEG_FLAT -} seg_type; - class GPUDynInst : public GPUExecContext { public: - GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *_staticInst, + GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *static_inst, uint64_t instSeqNum); ~GPUDynInst(); - void execute(); + void execute(GPUDynInstPtr gpuDynInst); int numSrcRegOperands(); int numDstRegOperands(); int getNumOperands(); @@ -216,13 +198,11 @@ class GPUDynInst : public GPUExecContext int getOperandSize(int operandIdx); bool isDstOperand(int operandIdx); bool isSrcOperand(int operandIdx); - bool isArgLoad(); const std::string &disassemble() const; uint64_t seqNum() const; - Enums::OpType opType(); Enums::StorageClassType 
executedAs(); // The address of the memory operation @@ -240,14 +220,7 @@ class GPUDynInst : public GPUExecContext // The memory type (M_U32, M_S32, ...) Enums::MemType m_type; - // The memory operation (MO_LD, MO_ST, ...) - Enums::MemOpType m_op; - Enums::GenericMemoryOrder memoryOrder; - - // Scope of the request - Enums::GenericMemoryScope scope; - // The memory segment (SEG_SHARED, SEG_GLOBAL, ...) - seg_type s_type; + // The equivalency class int equiv; // The return VGPR type (VT_32 or VT_64) @@ -288,10 +261,72 @@ class GPUDynInst : public GPUExecContext void updateStats(); - GPUStaticInst* staticInstruction() { return staticInst; } - - // Is the instruction a scalar or vector op? - bool scalarOp() const; + GPUStaticInst* staticInstruction() { return _staticInst; } + + bool isALU() const; + bool isBranch() const; + bool isNop() const; + bool isReturn() const; + bool isUnconditionalJump() const; + bool isSpecialOp() const; + bool isWaitcnt() const; + + bool isBarrier() const; + bool isMemFence() const; + bool isMemRef() const; + bool isFlat() const; + bool isLoad() const; + bool isStore() const; + + bool isAtomic() const; + bool isAtomicNoRet() const; + bool isAtomicRet() const; + + bool isScalar() const; + bool readsSCC() const; + bool writesSCC() const; + bool readsVCC() const; + bool writesVCC() const; + + bool isAtomicAnd() const; + bool isAtomicOr() const; + bool isAtomicXor() const; + bool isAtomicCAS() const; + bool isAtomicExch() const; + bool isAtomicAdd() const; + bool isAtomicSub() const; + bool isAtomicInc() const; + bool isAtomicDec() const; + bool isAtomicMax() const; + bool isAtomicMin() const; + + bool isArgLoad() const; + bool isGlobalMem() const; + bool isLocalMem() const; + + bool isArgSeg() const; + bool isGlobalSeg() const; + bool isGroupSeg() const; + bool isKernArgSeg() const; + bool isPrivateSeg() const; + bool isReadOnlySeg() const; + bool isSpillSeg() const; + + bool isWorkitemScope() const; + bool isWavefrontScope() const; + bool 
isWorkgroupScope() const; + bool isDeviceScope() const; + bool isSystemScope() const; + bool isNoScope() const; + + bool isRelaxedOrder() const; + bool isAcquire() const; + bool isRelease() const; + bool isAcquireRelease() const; + bool isNoOrder() const; + + bool isGloballyCoherent() const; + bool isSystemCoherent() const; /* * Loads/stores/atomics may have acquire/release semantics associated @@ -312,46 +347,32 @@ class GPUDynInst : public GPUExecContext bool useContinuation; template<typename c0> AtomicOpFunctor* - makeAtomicOpFunctor(c0 *reg0, c0 *reg1, Enums::MemOpType op) + makeAtomicOpFunctor(c0 *reg0, c0 *reg1) { - using namespace Enums; - - switch(op) { - case MO_AAND: - case MO_ANRAND: + if (isAtomicAnd()) { return new AtomicOpAnd<c0>(*reg0); - case MO_AOR: - case MO_ANROR: + } else if (isAtomicOr()) { return new AtomicOpOr<c0>(*reg0); - case MO_AXOR: - case MO_ANRXOR: + } else if (isAtomicXor()) { return new AtomicOpXor<c0>(*reg0); - case MO_ACAS: - case MO_ANRCAS: + } else if (isAtomicCAS()) { return new AtomicOpCAS<c0>(*reg0, *reg1, cu); - case MO_AEXCH: - case MO_ANREXCH: + } else if (isAtomicExch()) { return new AtomicOpExch<c0>(*reg0); - case MO_AADD: - case MO_ANRADD: + } else if (isAtomicAdd()) { return new AtomicOpAdd<c0>(*reg0); - case MO_ASUB: - case MO_ANRSUB: + } else if (isAtomicSub()) { return new AtomicOpSub<c0>(*reg0); - case MO_AINC: - case MO_ANRINC: + } else if (isAtomicInc()) { return new AtomicOpInc<c0>(); - case MO_ADEC: - case MO_ANRDEC: + } else if (isAtomicDec()) { return new AtomicOpDec<c0>(); - case MO_AMAX: - case MO_ANRMAX: + } else if (isAtomicMax()) { return new AtomicOpMax<c0>(*reg0); - case MO_AMIN: - case MO_ANRMIN: + } else if (isAtomicMin()) { return new AtomicOpMin<c0>(*reg0); - default: - panic("Unrecognized atomic operation"); + } else { + fatal("Unrecognized atomic operation"); } } @@ -359,88 +380,58 @@ class GPUDynInst : public GPUExecContext setRequestFlags(Request *req, bool setMemOrder=true) { // currently 
these are the easy scopes to deduce - switch (s_type) { - case SEG_PRIVATE: + if (isPrivateSeg()) { req->setMemSpaceConfigFlags(Request::PRIVATE_SEGMENT); - break; - case SEG_SPILL: + } else if (isSpillSeg()) { req->setMemSpaceConfigFlags(Request::SPILL_SEGMENT); - break; - case SEG_GLOBAL: + } else if (isGlobalSeg()) { req->setMemSpaceConfigFlags(Request::GLOBAL_SEGMENT); - break; - case SEG_READONLY: + } else if (isReadOnlySeg()) { req->setMemSpaceConfigFlags(Request::READONLY_SEGMENT); - break; - case SEG_SHARED: + } else if (isGroupSeg()) { req->setMemSpaceConfigFlags(Request::GROUP_SEGMENT); - break; - case SEG_FLAT: + } else if (isFlat()) { // TODO: translate to correct scope assert(false); - default: - panic("Bad segment type"); - break; + } else { + fatal("%s has bad segment type\n", disassemble()); } - switch (scope) { - case Enums::MEMORY_SCOPE_NONE: - case Enums::MEMORY_SCOPE_WORKITEM: - break; - case Enums::MEMORY_SCOPE_WAVEFRONT: + if (isWavefrontScope()) { req->setMemSpaceConfigFlags(Request::SCOPE_VALID | Request::WAVEFRONT_SCOPE); - break; - case Enums::MEMORY_SCOPE_WORKGROUP: + } else if (isWorkgroupScope()) { req->setMemSpaceConfigFlags(Request::SCOPE_VALID | Request::WORKGROUP_SCOPE); - break; - case Enums::MEMORY_SCOPE_DEVICE: + } else if (isDeviceScope()) { req->setMemSpaceConfigFlags(Request::SCOPE_VALID | Request::DEVICE_SCOPE); - break; - case Enums::MEMORY_SCOPE_SYSTEM: + } else if (isSystemScope()) { req->setMemSpaceConfigFlags(Request::SCOPE_VALID | Request::SYSTEM_SCOPE); - break; - default: - panic("Bad scope type"); - break; + } else if (!isNoScope() && !isWorkitemScope()) { + fatal("%s has bad scope type\n", disassemble()); } if (setMemOrder) { // set acquire and release flags - switch (memoryOrder){ - case Enums::MEMORY_ORDER_SC_ACQUIRE: + if (isAcquire()) { req->setFlags(Request::ACQUIRE); - break; - case Enums::MEMORY_ORDER_SC_RELEASE: + } else if (isRelease()) { req->setFlags(Request::RELEASE); - break; - case 
Enums::MEMORY_ORDER_SC_ACQUIRE_RELEASE: + } else if (isAcquireRelease()) { req->setFlags(Request::ACQUIRE | Request::RELEASE); - break; - default: - break; + } else if (!isNoOrder()) { + fatal("%s has bad memory order\n", disassemble()); } } // set atomic type // currently, the instruction genenerator only produces atomic return // but a magic instruction can produce atomic no return - if (m_op == Enums::MO_AADD || m_op == Enums::MO_ASUB || - m_op == Enums::MO_AAND || m_op == Enums::MO_AOR || - m_op == Enums::MO_AXOR || m_op == Enums::MO_AMAX || - m_op == Enums::MO_AMIN || m_op == Enums::MO_AINC || - m_op == Enums::MO_ADEC || m_op == Enums::MO_AEXCH || - m_op == Enums::MO_ACAS) { + if (isAtomicRet()) { req->setFlags(Request::ATOMIC_RETURN_OP); - } else if (m_op == Enums::MO_ANRADD || m_op == Enums::MO_ANRSUB || - m_op == Enums::MO_ANRAND || m_op == Enums::MO_ANROR || - m_op == Enums::MO_ANRXOR || m_op == Enums::MO_ANRMAX || - m_op == Enums::MO_ANRMIN || m_op == Enums::MO_ANRINC || - m_op == Enums::MO_ANRDEC || m_op == Enums::MO_ANREXCH || - m_op == Enums::MO_ANRCAS) { + } else if (isAtomicNoRet()) { req->setFlags(Request::ATOMIC_NO_RETURN_OP); } } @@ -457,7 +448,7 @@ class GPUDynInst : public GPUExecContext std::vector<int> tlbHitLevel; private: - GPUStaticInst *staticInst; + GPUStaticInst *_staticInst; uint64_t _seqNum; }; diff --git a/src/gpu-compute/gpu_static_inst.cc b/src/gpu-compute/gpu_static_inst.cc index 83b429e62..0f74bd532 100644 --- a/src/gpu-compute/gpu_static_inst.cc +++ b/src/gpu-compute/gpu_static_inst.cc @@ -36,10 +36,12 @@ #include "gpu-compute/gpu_static_inst.hh" GPUStaticInst::GPUStaticInst(const std::string &opcode) - : o_type(Enums::OT_ALU), executed_as(Enums::SC_NONE), opcode(opcode), - _instNum(0), _scalarOp(false) + : executed_as(Enums::SC_NONE), opcode(opcode), + _instNum(0) { + setFlag(NoOrder); } + const std::string& GPUStaticInst::disassemble() { diff --git a/src/gpu-compute/gpu_static_inst.hh b/src/gpu-compute/gpu_static_inst.hh index 
911e4f308..a73ec12e3 100644 --- a/src/gpu-compute/gpu_static_inst.hh +++ b/src/gpu-compute/gpu_static_inst.hh @@ -48,7 +48,7 @@ #include <cstdint> #include <string> -#include "enums/OpType.hh" +#include "enums/GPUStaticInstFlags.hh" #include "enums/StorageClassType.hh" #include "gpu-compute/gpu_dyn_inst.hh" #include "gpu-compute/misc.hh" @@ -57,7 +57,7 @@ class BaseOperand; class BaseRegOperand; class Wavefront; -class GPUStaticInst +class GPUStaticInst : public GPUStaticInstFlags { public: GPUStaticInst(const std::string &opcode); @@ -86,22 +86,110 @@ class GPUStaticInst virtual bool isValid() const = 0; - /* - * Most instructions (including all HSAIL instructions) - * are vector ops, so _scalarOp will be false by default. - * Derived instruction objects that are scalar ops must - * set _scalarOp to true in their constructors. - */ - bool scalarOp() const { return _scalarOp; } + bool isALU() const { return _flags[ALU]; } + bool isBranch() const { return _flags[Branch]; } + bool isNop() const { return _flags[Nop]; } + bool isReturn() const { return _flags[Return]; } + + bool + isUnconditionalJump() const + { + return _flags[UnconditionalJump]; + } + + bool isSpecialOp() const { return _flags[SpecialOp]; } + bool isWaitcnt() const { return _flags[Waitcnt]; } + + bool isBarrier() const { return _flags[MemBarrier]; } + bool isMemFence() const { return _flags[MemFence]; } + bool isMemRef() const { return _flags[MemoryRef]; } + bool isFlat() const { return _flags[Flat]; } + bool isLoad() const { return _flags[Load]; } + bool isStore() const { return _flags[Store]; } + + bool + isAtomic() const + { + return _flags[AtomicReturn] || _flags[AtomicNoReturn]; + } + + bool isAtomicNoRet() const { return _flags[AtomicNoReturn]; } + bool isAtomicRet() const { return _flags[AtomicReturn]; } + + bool isScalar() const { return _flags[Scalar]; } + bool readsSCC() const { return _flags[ReadsSCC]; } + bool writesSCC() const { return _flags[WritesSCC]; } + bool readsVCC() const { 
return _flags[ReadsVCC]; } + bool writesVCC() const { return _flags[WritesVCC]; } - virtual bool isLocalMem() const + bool isAtomicAnd() const { return _flags[AtomicAnd]; } + bool isAtomicOr() const { return _flags[AtomicOr]; } + bool isAtomicXor() const { return _flags[AtomicXor]; } + bool isAtomicCAS() const { return _flags[AtomicCAS]; } + bool isAtomicExch() const { return _flags[AtomicExch]; } + bool isAtomicAdd() const { return _flags[AtomicAdd]; } + bool isAtomicSub() const { return _flags[AtomicSub]; } + bool isAtomicInc() const { return _flags[AtomicInc]; } + bool isAtomicDec() const { return _flags[AtomicDec]; } + bool isAtomicMax() const { return _flags[AtomicMax]; } + bool isAtomicMin() const { return _flags[AtomicMin]; } + + bool + isArgLoad() const + { + return (_flags[KernArgSegment] || _flags[ArgSegment]) && _flags[Load]; + } + + bool + isGlobalMem() const { - fatal("calling isLocalMem() on non-memory instruction.\n"); + return _flags[MemoryRef] && (_flags[GlobalSegment] || + _flags[PrivateSegment] || _flags[ReadOnlySegment] || + _flags[SpillSegment]); + } - return false; + bool + isLocalMem() const + { + return _flags[MemoryRef] && _flags[GroupSegment]; } - bool isArgLoad() { return false; } + bool isArgSeg() const { return _flags[ArgSegment]; } + bool isGlobalSeg() const { return _flags[GlobalSegment]; } + bool isGroupSeg() const { return _flags[GroupSegment]; } + bool isKernArgSeg() const { return _flags[KernArgSegment]; } + bool isPrivateSeg() const { return _flags[PrivateSegment]; } + bool isReadOnlySeg() const { return _flags[ReadOnlySegment]; } + bool isSpillSeg() const { return _flags[SpillSegment]; } + + bool isWorkitemScope() const { return _flags[WorkitemScope]; } + bool isWavefrontScope() const { return _flags[WavefrontScope]; } + bool isWorkgroupScope() const { return _flags[WorkgroupScope]; } + bool isDeviceScope() const { return _flags[DeviceScope]; } + bool isSystemScope() const { return _flags[SystemScope]; } + bool isNoScope() const 
{ return _flags[NoScope]; } + + bool isRelaxedOrder() const { return _flags[RelaxedOrder]; } + bool isAcquire() const { return _flags[Acquire]; } + bool isRelease() const { return _flags[Release]; } + bool isAcquireRelease() const { return _flags[AcquireRelease]; } + bool isNoOrder() const { return _flags[NoOrder]; } + + /** + * Coherence domain of a memory instruction. Only valid for + * machine ISA. The coherence domain specifies where it is + * possible to perform memory synchronization, e.g., acquire + * or release, from the shader kernel. + * + * isGloballyCoherent(): returns true if kernel is sharing memory + * with other work-items on the same device (GPU) + * + * isSystemCoherent(): returns true if kernel is sharing memory + * with other work-items on a different device (GPU) or the host (CPU) + */ + bool isGloballyCoherent() const { return _flags[GloballyCoherent]; } + bool isSystemCoherent() const { return _flags[SystemCoherent]; } + virtual uint32_t instSize() = 0; // only used for memory instructions @@ -120,22 +208,13 @@ class GPUStaticInst virtual uint32_t getTargetPc() { return 0; } - /** - * Query whether the instruction is an unconditional jump i.e., the jump - * is always executed because there is no condition to be evaluated. - * - * If the instruction is not of branch type, the result is always false. - * - * @return True if the instruction is an unconditional jump. 
- */ - virtual bool unconditionalJumpInstruction() { return false; } - static uint64_t dynamic_id_count; - Enums::OpType o_type; // For flat memory accesses Enums::StorageClassType executed_as; + void setFlag(Flags flag) { _flags[flag] = true; } + protected: virtual void execLdAcq(GPUDynInstPtr gpuDynInst) @@ -169,7 +248,45 @@ class GPUStaticInst */ int _ipdInstNum; - bool _scalarOp; + std::bitset<Num_Flags> _flags; +}; + +class KernelLaunchStaticInst : public GPUStaticInst +{ + public: + KernelLaunchStaticInst() : GPUStaticInst("kernel_launch") + { + setFlag(Nop); + setFlag(Scalar); + setFlag(Acquire); + setFlag(SystemScope); + setFlag(GlobalSegment); + } + + void + execute(GPUDynInstPtr gpuDynInst) + { + fatal("kernel launch instruction should not be executed\n"); + } + + void + generateDisassembly() + { + disassembly = opcode; + } + + int getNumOperands() { return 0; } + bool isCondRegister(int operandIndex) { return false; } + bool isScalarRegister(int operandIndex) { return false; } + bool isVectorRegister(int operandIndex) { return false; } + bool isSrcOperand(int operandIndex) { return false; } + bool isDstOperand(int operandIndex) { return false; } + int getOperandSize(int operandIndex) { return 0; } + int getRegisterIndex(int operandIndex) { return 0; } + int numDstRegOperands() { return 0; } + int numSrcRegOperands() { return 0; } + bool isValid() const { return true; } + uint32_t instSize() { return 0; } }; #endif // __GPU_STATIC_INST_HH__ diff --git a/src/gpu-compute/kernel_cfg.cc b/src/gpu-compute/kernel_cfg.cc index 10ded11b7..ac6a81b16 100644 --- a/src/gpu-compute/kernel_cfg.cc +++ b/src/gpu-compute/kernel_cfg.cc @@ -104,7 +104,7 @@ ControlFlowInfo::createBasicBlocks() leaders.insert(0); for (int i = 1; i < instructions.size(); i++) { GPUStaticInst* instruction = instructions[i]; - if (instruction->o_type == Enums::OT_BRANCH) { + if (instruction->isBranch()) { const int target_pc = instruction->getTargetPc(); leaders.insert(target_pc); 
leaders.insert(i + 1); @@ -137,18 +137,18 @@ ControlFlowInfo::connectBasicBlocks() break; } GPUStaticInst* last = lastInstruction(bb.get()); - if (last->o_type == Enums::OT_RET) { + if (last->isReturn()) { bb->successorIds.insert(exit_bb->id); continue; } - if (last->o_type == Enums::OT_BRANCH) { + if (last->isBranch()) { const uint32_t target_pc = last->getTargetPc(); BasicBlock* target_bb = basicBlock(target_pc); bb->successorIds.insert(target_bb->id); } // Unconditional jump instructions have a unique successor - if (!last->unconditionalJumpInstruction()) { + if (!last->isUnconditionalJump()) { BasicBlock* next_bb = basicBlock(last->instNum() + 1); bb->successorIds.insert(next_bb->id); } @@ -274,7 +274,7 @@ ControlFlowInfo::printBasicBlocks() const int inst_num = inst->instNum(); std::cout << inst_num << " [" << basicBlock(inst_num)->id << "]: " << inst->disassemble(); - if (inst->o_type == Enums::OT_BRANCH) { + if (inst->isBranch()) { std::cout << ", PC = " << inst->getTargetPc(); } std::cout << std::endl; diff --git a/src/gpu-compute/lds_state.cc b/src/gpu-compute/lds_state.cc index d4a27318a..fad98c886 100644 --- a/src/gpu-compute/lds_state.cc +++ b/src/gpu-compute/lds_state.cc @@ -141,8 +141,7 @@ LdsState::countBankConflicts(GPUDynInstPtr gpuDynInst, } } - if (gpuDynInst->m_op == Enums::MO_LD || - gpuDynInst->m_op == Enums::MO_ST) { + if (gpuDynInst->isLoad() || gpuDynInst->isStore()) { // mask identical addresses for (int j = 0; j < numBanks; ++j) { for (int j0 = 0; j0 < j; j0++) { @@ -208,8 +207,8 @@ LdsState::processPacket(PacketPtr packet) GPUDynInstPtr dynInst = getDynInstr(packet); // account for the LDS bank conflict overhead - int busLength = (dynInst->m_op == Enums::MO_LD) ? parent->loadBusLength() : - (dynInst->m_op == Enums::MO_ST) ? parent->storeBusLength() : + int busLength = (dynInst->isLoad()) ? parent->loadBusLength() : + (dynInst->isStore()) ? 
parent->storeBusLength() : parent->loadBusLength(); // delay for accessing the LDS Tick processingTime = diff --git a/src/gpu-compute/lds_state.hh b/src/gpu-compute/lds_state.hh index 58d109493..5fcbe82c0 100644 --- a/src/gpu-compute/lds_state.hh +++ b/src/gpu-compute/lds_state.hh @@ -43,7 +43,6 @@ #include <utility> #include <vector> -#include "enums/MemOpType.hh" #include "enums/MemType.hh" #include "gpu-compute/misc.hh" #include "mem/mem_object.hh" diff --git a/src/gpu-compute/local_memory_pipeline.cc b/src/gpu-compute/local_memory_pipeline.cc index e2238bf45..80dad6fcd 100644 --- a/src/gpu-compute/local_memory_pipeline.cc +++ b/src/gpu-compute/local_memory_pipeline.cc @@ -62,7 +62,7 @@ LocalMemPipeline::exec() lmReturnedRequests.front() : nullptr; bool accessVrf = true; - if ((m) && (m->m_op==Enums::MO_LD || MO_A(m->m_op))) { + if ((m) && (m->isLoad() || m->isAtomicRet())) { Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId]; accessVrf = @@ -137,7 +137,7 @@ LocalMemPipeline::doSmReturn(GPUDynInstPtr m) Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId]; // Return data to registers - if (m->m_op == Enums::MO_LD || MO_A(m->m_op)) { + if (m->isLoad() || m->isAtomicRet()) { std::vector<uint32_t> regVec; for (int k = 0; k < m->n_reg; ++k) { int dst = m->dst_reg+k; @@ -172,13 +172,12 @@ LocalMemPipeline::doSmReturn(GPUDynInstPtr m) // Decrement outstanding request count computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1); - if (m->m_op == Enums::MO_ST || MO_A(m->m_op) || MO_ANR(m->m_op) - || MO_H(m->m_op)) { + if (m->isStore() || m->isAtomic()) { computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrLm, m->time, -1); } - if (m->m_op == Enums::MO_LD || MO_A(m->m_op) || MO_ANR(m->m_op)) { + if (m->isLoad() || m->isAtomic()) { computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdLm, m->time, -1); } diff --git a/src/gpu-compute/shader.hh b/src/gpu-compute/shader.hh index c1f741d6a..13afab977 100644 --- a/src/gpu-compute/shader.hh +++ 
b/src/gpu-compute/shader.hh @@ -47,7 +47,6 @@ #include "cpu/simple_thread.hh" #include "cpu/thread_context.hh" #include "cpu/thread_state.hh" -#include "enums/MemOpType.hh" #include "enums/MemType.hh" #include "gpu-compute/compute_unit.hh" #include "gpu-compute/gpu_tlb.hh" diff --git a/src/gpu-compute/vector_register_file.cc b/src/gpu-compute/vector_register_file.cc index c43d765af..c50c06cc6 100644 --- a/src/gpu-compute/vector_register_file.cc +++ b/src/gpu-compute/vector_register_file.cc @@ -38,7 +38,6 @@ #include <string> #include "base/misc.hh" -#include "gpu-compute/code_enums.hh" #include "gpu-compute/compute_unit.hh" #include "gpu-compute/gpu_dyn_inst.hh" #include "gpu-compute/shader.hh" @@ -153,8 +152,8 @@ VectorRegisterFile::operandsReady(Wavefront *w, GPUDynInstPtr ii) const void VectorRegisterFile::exec(GPUDynInstPtr ii, Wavefront *w) { - bool loadInstr = IS_OT_READ(ii->opType()); - bool atomicInstr = IS_OT_ATOMIC(ii->opType()); + bool loadInstr = ii->isLoad(); + bool atomicInstr = ii->isAtomic() || ii->isMemFence(); bool loadNoArgInstr = loadInstr && !ii->isArgLoad(); diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc index c677cbe41..caeed85a7 100644 --- a/src/gpu-compute/wavefront.cc +++ b/src/gpu-compute/wavefront.cc @@ -37,7 +37,6 @@ #include "debug/GPUExec.hh" #include "debug/WavefrontStack.hh" -#include "gpu-compute/code_enums.hh" #include "gpu-compute/compute_unit.hh" #include "gpu-compute/gpu_dyn_inst.hh" #include "gpu-compute/shader.hh" @@ -165,19 +164,8 @@ Wavefront::start(uint64_t _wf_dyn_id,uint64_t _base_ptr) bool Wavefront::isGmInstruction(GPUDynInstPtr ii) { - if (IS_OT_READ_PM(ii->opType()) || IS_OT_WRITE_PM(ii->opType()) || - IS_OT_ATOMIC_PM(ii->opType())) { + if (ii->isGlobalMem() || ii->isFlat()) return true; - } - - if (IS_OT_READ_GM(ii->opType()) || IS_OT_WRITE_GM(ii->opType()) || - IS_OT_ATOMIC_GM(ii->opType())) { - return true; - } - - if (IS_OT_FLAT(ii->opType())) { - return true; - } return false; } @@ 
-185,8 +173,7 @@ Wavefront::isGmInstruction(GPUDynInstPtr ii) bool Wavefront::isLmInstruction(GPUDynInstPtr ii) { - if (IS_OT_READ_LM(ii->opType()) || IS_OT_WRITE_LM(ii->opType()) || - IS_OT_ATOMIC_LM(ii->opType())) { + if (ii->isLocalMem()) { return true; } @@ -199,10 +186,9 @@ Wavefront::isOldestInstALU() assert(!instructionBuffer.empty()); GPUDynInstPtr ii = instructionBuffer.front(); - if (status != S_STOPPED && (ii->opType() == Enums::OT_NOP || - ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH || - ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) || - ii->opType() == Enums::OT_KERN_READ)) { + if (status != S_STOPPED && (ii->isNop() || + ii->isReturn() || ii->isBranch() || + ii->isALU() || (ii->isKernArgSeg() && ii->isLoad()))) { return true; } @@ -215,7 +201,7 @@ Wavefront::isOldestInstBarrier() assert(!instructionBuffer.empty()); GPUDynInstPtr ii = instructionBuffer.front(); - if (status != S_STOPPED && ii->opType() == Enums::OT_BARRIER) { + if (status != S_STOPPED && ii->isBarrier()) { return true; } @@ -228,9 +214,7 @@ Wavefront::isOldestInstGMem() assert(!instructionBuffer.empty()); GPUDynInstPtr ii = instructionBuffer.front(); - if (status != S_STOPPED && (IS_OT_READ_GM(ii->opType()) || - IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()))) { - + if (status != S_STOPPED && ii->isGlobalMem()) { return true; } @@ -243,9 +227,7 @@ Wavefront::isOldestInstLMem() assert(!instructionBuffer.empty()); GPUDynInstPtr ii = instructionBuffer.front(); - if (status != S_STOPPED && (IS_OT_READ_LM(ii->opType()) || - IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()))) { - + if (status != S_STOPPED && ii->isLocalMem()) { return true; } @@ -258,9 +240,7 @@ Wavefront::isOldestInstPrivMem() assert(!instructionBuffer.empty()); GPUDynInstPtr ii = instructionBuffer.front(); - if (status != S_STOPPED && (IS_OT_READ_PM(ii->opType()) || - IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()))) { - + if (status != S_STOPPED && 
ii->isPrivateSeg()) { return true; } @@ -273,8 +253,7 @@ Wavefront::isOldestInstFlatMem() assert(!instructionBuffer.empty()); GPUDynInstPtr ii = instructionBuffer.front(); - if (status != S_STOPPED && IS_OT_FLAT(ii->opType())) { - + if (status != S_STOPPED && ii->isFlat()) { return true; } @@ -289,7 +268,7 @@ Wavefront::instructionBufferHasBranch() for (auto it : instructionBuffer) { GPUDynInstPtr ii = it; - if (ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH) { + if (ii->isReturn() || ii->isBranch()) { return true; } } @@ -371,23 +350,16 @@ Wavefront::ready(itype_e type) // checking readiness will be fixed eventually. In the meantime, let's // make sure that we do not silently let an instruction type slip // through this logic and always return not ready. - if (!(ii->opType() == Enums::OT_BARRIER || ii->opType() == Enums::OT_NOP || - ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH || - ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) || - ii->opType() == Enums::OT_KERN_READ || - ii->opType() == Enums::OT_ARG || - IS_OT_READ_GM(ii->opType()) || IS_OT_WRITE_GM(ii->opType()) || - IS_OT_ATOMIC_GM(ii->opType()) || IS_OT_READ_LM(ii->opType()) || - IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()) || - IS_OT_READ_PM(ii->opType()) || IS_OT_WRITE_PM(ii->opType()) || - IS_OT_ATOMIC_PM(ii->opType()) || IS_OT_FLAT(ii->opType()))) { + if (!(ii->isBarrier() || ii->isNop() || ii->isReturn() || ii->isBranch() || + ii->isALU() || ii->isLoad() || ii->isStore() || ii->isAtomic() || + ii->isMemFence() || ii->isFlat())) { panic("next instruction: %s is of unknown type\n", ii->disassemble()); } DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Checking Read for Inst : %s\n", computeUnit->cu_id, simdId, wfSlotId, ii->disassemble()); - if (type == I_ALU && ii->opType() == Enums::OT_BARRIER) { + if (type == I_ALU && ii->isBarrier()) { // Here for ALU instruction (barrier) if (!computeUnit->wfWait[simdId].prerdy()) { // Is wave slot free? 
@@ -400,7 +372,7 @@ Wavefront::ready(itype_e type) } ready_inst = true; - } else if (type == I_ALU && ii->opType() == Enums::OT_NOP) { + } else if (type == I_ALU && ii->isNop()) { // Here for ALU instruction (nop) if (!computeUnit->wfWait[simdId].prerdy()) { // Is wave slot free? @@ -408,7 +380,7 @@ Wavefront::ready(itype_e type) } ready_inst = true; - } else if (type == I_ALU && ii->opType() == Enums::OT_RET) { + } else if (type == I_ALU && ii->isReturn()) { // Here for ALU instruction (return) if (!computeUnit->wfWait[simdId].prerdy()) { // Is wave slot free? @@ -421,10 +393,10 @@ Wavefront::ready(itype_e type) } ready_inst = true; - } else if (type == I_ALU && (ii->opType() == Enums::OT_BRANCH || - ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) || - ii->opType() == Enums::OT_KERN_READ || - ii->opType() == Enums::OT_ARG)) { + } else if (type == I_ALU && (ii->isBranch() || + ii->isALU() || + (ii->isKernArgSeg() && ii->isLoad()) || + ii->isArgSeg())) { // Here for ALU instruction (all others) if (!computeUnit->wfWait[simdId].prerdy()) { // Is alu slot free? @@ -439,18 +411,16 @@ Wavefront::ready(itype_e type) return 0; } ready_inst = true; - } else if (type == I_GLOBAL && (IS_OT_READ_GM(ii->opType()) || - IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()))) { + } else if (type == I_GLOBAL && ii->isGlobalMem()) { // Here Global memory instruction - if (IS_OT_READ_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType())) { + if (ii->isLoad() || ii->isAtomic() || ii->isMemFence()) { // Are there in pipe or outstanding global memory write requests? if ((outstandingReqsWrGm + wrGmReqsInPipe) > 0) { return 0; } } - if (IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()) || - IS_OT_HIST_GM(ii->opType())) { + if (ii->isStore() || ii->isAtomic() || ii->isMemFence()) { // Are there in pipe or outstanding global memory read requests? 
if ((outstandingReqsRdGm + rdGmReqsInPipe) > 0) return 0; @@ -480,17 +450,15 @@ Wavefront::ready(itype_e type) return 0; } ready_inst = true; - } else if (type == I_SHARED && (IS_OT_READ_LM(ii->opType()) || - IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()))) { + } else if (type == I_SHARED && ii->isLocalMem()) { // Here for Shared memory instruction - if (IS_OT_READ_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType())) { + if (ii->isLoad() || ii->isAtomic() || ii->isMemFence()) { if ((outstandingReqsWrLm + wrLmReqsInPipe) > 0) { return 0; } } - if (IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()) || - IS_OT_HIST_LM(ii->opType())) { + if (ii->isStore() || ii->isAtomic() || ii->isMemFence()) { if ((outstandingReqsRdLm + rdLmReqsInPipe) > 0) { return 0; } @@ -519,47 +487,7 @@ Wavefront::ready(itype_e type) return 0; } ready_inst = true; - } else if (type == I_PRIVATE && (IS_OT_READ_PM(ii->opType()) || - IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()))) { - // Here for Private memory instruction ------------------------ // - if (IS_OT_READ_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType())) { - if ((outstandingReqsWrGm + wrGmReqsInPipe) > 0) { - return 0; - } - } - - if (IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()) || - IS_OT_HIST_PM(ii->opType())) { - if ((outstandingReqsRdGm + rdGmReqsInPipe) > 0) { - return 0; - } - } - - if (!glbMemBusRdy) { - // Is there an available VRF->Global memory read bus? - return 0; - } - - if (!glbMemIssueRdy) { - // Is wave slot free? - return 0; - } - - if (!computeUnit->globalMemoryPipe. - isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) { - // Can we insert a new request to the Global Mem Request FIFO? - return 0; - } - // can we schedule source & destination operands on the VRF? 
- if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii, - VrfAccessType::RD_WR)) { - return 0; - } - if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) { - return 0; - } - ready_inst = true; - } else if (type == I_FLAT && IS_OT_FLAT(ii->opType())) { + } else if (type == I_FLAT && ii->isFlat()) { if (!glbMemBusRdy) { // Is there an available VRF->Global memory read bus? return 0; @@ -618,23 +546,22 @@ Wavefront::updateResources() assert(ii); computeUnit->vrf[simdId]->updateResources(this, ii); // Single precision ALU or Branch or Return or Special instruction - if (ii->opType() == Enums::OT_ALU || ii->opType() == Enums::OT_SPECIAL || - ii->opType() == Enums::OT_BRANCH || IS_OT_LDAS(ii->opType()) || + if (ii->isALU() || ii->isSpecialOp() || + ii->isBranch() || // FIXME: Kernel argument loads are currently treated as ALU operations // since we don't send memory packets at execution. If we fix that then // we should map them to one of the memory pipelines - ii->opType()==Enums::OT_KERN_READ || - ii->opType()==Enums::OT_ARG || - ii->opType()==Enums::OT_RET) { + (ii->isKernArgSeg() && ii->isLoad()) || ii->isArgSeg() || + ii->isReturn()) { computeUnit->aluPipe[simdId].preset(computeUnit->shader-> ticks(computeUnit->spBypassLength())); // this is to enforce a fixed number of cycles per issue slot per SIMD computeUnit->wfWait[simdId].preset(computeUnit->shader-> ticks(computeUnit->issuePeriod)); - } else if (ii->opType() == Enums::OT_BARRIER) { + } else if (ii->isBarrier()) { computeUnit->wfWait[simdId].preset(computeUnit->shader-> ticks(computeUnit->issuePeriod)); - } else if (ii->opType() == Enums::OT_FLAT_READ) { + } else if (ii->isLoad() && ii->isFlat()) { assert(Enums::SC_NONE != ii->executedAs()); memReqsInPipe++; rdGmReqsInPipe++; @@ -649,7 +576,7 @@ Wavefront::updateResources() computeUnit->wfWait[computeUnit->GlbMemUnitId()]. 
preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); } - } else if (ii->opType() == Enums::OT_FLAT_WRITE) { + } else if (ii->isStore() && ii->isFlat()) { assert(Enums::SC_NONE != ii->executedAs()); memReqsInPipe++; wrGmReqsInPipe++; @@ -664,21 +591,21 @@ Wavefront::updateResources() computeUnit->wfWait[computeUnit->GlbMemUnitId()]. preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); } - } else if (IS_OT_READ_GM(ii->opType())) { + } else if (ii->isLoad() && ii->isGlobalMem()) { memReqsInPipe++; rdGmReqsInPipe++; computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. preset(computeUnit->shader->ticks(4)); computeUnit->wfWait[computeUnit->GlbMemUnitId()]. preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (IS_OT_WRITE_GM(ii->opType())) { + } else if (ii->isStore() && ii->isGlobalMem()) { memReqsInPipe++; wrGmReqsInPipe++; computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. preset(computeUnit->shader->ticks(8)); computeUnit->wfWait[computeUnit->GlbMemUnitId()]. preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (IS_OT_ATOMIC_GM(ii->opType())) { + } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isGlobalMem()) { memReqsInPipe++; wrGmReqsInPipe++; rdGmReqsInPipe++; @@ -686,21 +613,21 @@ Wavefront::updateResources() preset(computeUnit->shader->ticks(8)); computeUnit->wfWait[computeUnit->GlbMemUnitId()]. preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (IS_OT_READ_LM(ii->opType())) { + } else if (ii->isLoad() && ii->isLocalMem()) { memReqsInPipe++; rdLmReqsInPipe++; computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. preset(computeUnit->shader->ticks(4)); computeUnit->wfWait[computeUnit->ShrMemUnitId()]. 
preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (IS_OT_WRITE_LM(ii->opType())) { + } else if (ii->isStore() && ii->isLocalMem()) { memReqsInPipe++; wrLmReqsInPipe++; computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. preset(computeUnit->shader->ticks(8)); computeUnit->wfWait[computeUnit->ShrMemUnitId()]. preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (IS_OT_ATOMIC_LM(ii->opType())) { + } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isLocalMem()) { memReqsInPipe++; wrLmReqsInPipe++; rdLmReqsInPipe++; @@ -708,28 +635,6 @@ Wavefront::updateResources() preset(computeUnit->shader->ticks(8)); computeUnit->wfWait[computeUnit->ShrMemUnitId()]. preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (IS_OT_READ_PM(ii->opType())) { - memReqsInPipe++; - rdGmReqsInPipe++; - computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. - preset(computeUnit->shader->ticks(4)); - computeUnit->wfWait[computeUnit->GlbMemUnitId()]. - preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (IS_OT_WRITE_PM(ii->opType())) { - memReqsInPipe++; - wrGmReqsInPipe++; - computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. - preset(computeUnit->shader->ticks(8)); - computeUnit->wfWait[computeUnit->GlbMemUnitId()]. - preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (IS_OT_ATOMIC_PM(ii->opType())) { - memReqsInPipe++; - wrGmReqsInPipe++; - rdGmReqsInPipe++; - computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. - preset(computeUnit->shader->ticks(8)); - computeUnit->wfWait[computeUnit->GlbMemUnitId()]. 
- preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); } } @@ -751,7 +656,7 @@ Wavefront::exec() DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s " "(pc: %i)\n", computeUnit->cu_id, simdId, wfSlotId, wfDynId, ii->disassemble(), old_pc); - ii->execute(); + ii->execute(ii); // access the VRF computeUnit->vrf[simdId]->exec(ii, this); srcRegOpDist.sample(ii->numSrcRegOperands()); @@ -785,24 +690,24 @@ Wavefront::exec() // ---- Update Vector ALU pipeline and other resources ------------------ // // Single precision ALU or Branch or Return or Special instruction - if (ii->opType() == Enums::OT_ALU || ii->opType() == Enums::OT_SPECIAL || - ii->opType() == Enums::OT_BRANCH || IS_OT_LDAS(ii->opType()) || + if (ii->isALU() || ii->isSpecialOp() || + ii->isBranch() || // FIXME: Kernel argument loads are currently treated as ALU operations // since we don't send memory packets at execution. If we fix that then // we should map them to one of the memory pipelines - ii->opType() == Enums::OT_KERN_READ || - ii->opType() == Enums::OT_ARG || - ii->opType() == Enums::OT_RET) { + (ii->isKernArgSeg() && ii->isLoad()) || + ii->isArgSeg() || + ii->isReturn()) { computeUnit->aluPipe[simdId].set(computeUnit->shader-> ticks(computeUnit->spBypassLength())); // this is to enforce a fixed number of cycles per issue slot per SIMD computeUnit->wfWait[simdId].set(computeUnit->shader-> ticks(computeUnit->issuePeriod)); - } else if (ii->opType() == Enums::OT_BARRIER) { + } else if (ii->isBarrier()) { computeUnit->wfWait[simdId].set(computeUnit->shader-> ticks(computeUnit->issuePeriod)); - } else if (ii->opType() == Enums::OT_FLAT_READ) { + } else if (ii->isLoad() && ii->isFlat()) { assert(Enums::SC_NONE != ii->executedAs()); if (Enums::SC_SHARED == ii->executedAs()) { @@ -816,7 +721,7 @@ Wavefront::exec() computeUnit->wfWait[computeUnit->GlbMemUnitId()]. 
set(computeUnit->shader->ticks(computeUnit->issuePeriod)); } - } else if (ii->opType() == Enums::OT_FLAT_WRITE) { + } else if (ii->isStore() && ii->isFlat()) { assert(Enums::SC_NONE != ii->executedAs()); if (Enums::SC_SHARED == ii->executedAs()) { computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. @@ -829,32 +734,32 @@ Wavefront::exec() computeUnit->wfWait[computeUnit->GlbMemUnitId()]. set(computeUnit->shader->ticks(computeUnit->issuePeriod)); } - } else if (IS_OT_READ_GM(ii->opType())) { + } else if (ii->isLoad() && ii->isGlobalMem()) { computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. set(computeUnit->shader->ticks(4)); computeUnit->wfWait[computeUnit->GlbMemUnitId()]. set(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (IS_OT_WRITE_GM(ii->opType())) { + } else if (ii->isStore() && ii->isGlobalMem()) { computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. set(computeUnit->shader->ticks(8)); computeUnit->wfWait[computeUnit->GlbMemUnitId()]. set(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (IS_OT_ATOMIC_GM(ii->opType())) { + } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isGlobalMem()) { computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. set(computeUnit->shader->ticks(8)); computeUnit->wfWait[computeUnit->GlbMemUnitId()]. set(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (IS_OT_READ_LM(ii->opType())) { + } else if (ii->isLoad() && ii->isLocalMem()) { computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. set(computeUnit->shader->ticks(4)); computeUnit->wfWait[computeUnit->ShrMemUnitId()]. set(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (IS_OT_WRITE_LM(ii->opType())) { + } else if (ii->isStore() && ii->isLocalMem()) { computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. set(computeUnit->shader->ticks(8)); computeUnit->wfWait[computeUnit->ShrMemUnitId()]. 
set(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (IS_OT_ATOMIC_LM(ii->opType())) { + } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isLocalMem()) { computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. set(computeUnit->shader->ticks(8)); computeUnit->wfWait[computeUnit->ShrMemUnitId()]. |