Diffstat (limited to 'src/gpu-compute')
-rw-r--r-- | src/gpu-compute/GPU.py | 108
-rw-r--r-- | src/gpu-compute/GPUStaticInstFlags.py | 111
-rw-r--r-- | src/gpu-compute/SConscript | 1
-rw-r--r-- | src/gpu-compute/code_enums.hh | 116
-rw-r--r-- | src/gpu-compute/compute_unit.cc | 26
-rw-r--r-- | src/gpu-compute/compute_unit.hh | 1
-rw-r--r-- | src/gpu-compute/global_memory_pipeline.cc | 23
-rw-r--r-- | src/gpu-compute/gpu_dyn_inst.cc | 382
-rw-r--r-- | src/gpu-compute/gpu_dyn_inst.hh | 219
-rw-r--r-- | src/gpu-compute/gpu_static_inst.cc | 6
-rw-r--r-- | src/gpu-compute/gpu_static_inst.hh | 167
-rw-r--r-- | src/gpu-compute/kernel_cfg.cc | 10
-rw-r--r-- | src/gpu-compute/lds_state.cc | 7
-rw-r--r-- | src/gpu-compute/lds_state.hh | 1
-rw-r--r-- | src/gpu-compute/local_memory_pipeline.cc | 9
-rw-r--r-- | src/gpu-compute/shader.hh | 1
-rw-r--r-- | src/gpu-compute/vector_register_file.cc | 5
-rw-r--r-- | src/gpu-compute/wavefront.cc | 207
18 files changed, 802 insertions, 598 deletions
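
For orientation, a standalone C++ sketch of the pattern this change introduces: per-instruction flag bits set once in each instruction's constructor and queried through is*() accessors, replacing the old OT_*/MO_* enum ranges and the IS_OT_* macros from code_enums.hh. The flag and accessor names mirror the diff below; the GPUStaticInstSketch/GlobalLoadSketch classes and main() are invented here for illustration and are not part of the commit or of gem5.

// Standalone model of the flag-based classification (not gem5 code).
#include <bitset>
#include <cassert>
#include <string>

enum Flags { MemoryRef, Load, Store, AtomicReturn, AtomicNoReturn,
             GlobalSegment, GroupSegment, NoOrder, Num_Flags };

class GPUStaticInstSketch
{
  public:
    explicit GPUStaticInstSketch(const std::string &opcode)
        : opcode(opcode)
    {
        setFlag(NoOrder); // default memory order, as in GPUStaticInst's ctor
    }

    void setFlag(Flags flag) { _flags[flag] = true; }

    // These accessors replace range checks such as IS_OT_READ(a) or
    // IS_OT_ATOMIC_GM(a) at the call sites in the pipelines.
    bool isLoad() const { return _flags[Load]; }
    bool isStore() const { return _flags[Store]; }
    bool isAtomic() const { return _flags[AtomicReturn] || _flags[AtomicNoReturn]; }
    bool isGlobalMem() const { return _flags[MemoryRef] && _flags[GlobalSegment]; }
    bool isLocalMem() const { return _flags[MemoryRef] && _flags[GroupSegment]; }

    const std::string opcode;

  private:
    std::bitset<Num_Flags> _flags;
};

// Hypothetical global-memory load, declared the same way KernelLaunchStaticInst
// is in the diff: the constructor states what the instruction is, and the
// memory pipelines only ever ask through the accessors.
struct GlobalLoadSketch : GPUStaticInstSketch
{
    GlobalLoadSketch() : GPUStaticInstSketch("global_load")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        setFlag(GlobalSegment);
    }
};

int main()
{
    GlobalLoadSketch ld;
    assert(ld.isLoad() && ld.isGlobalMem() && !ld.isAtomic() && !ld.isLocalMem());
    return 0;
}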
diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py index f580a09f7..b672f616c 100644 --- a/src/gpu-compute/GPU.py +++ b/src/gpu-compute/GPU.py @@ -171,56 +171,6 @@ class GpuDispatcher(DmaDevice): cl_driver = Param.ClDriver('pointer to driver') -class OpType(Enum): vals = [ - 'OT_NULL', - 'OT_ALU', - 'OT_SPECIAL', - 'OT_GLOBAL_READ', - 'OT_GLOBAL_WRITE', - 'OT_GLOBAL_ATOMIC', - 'OT_GLOBAL_HIST', - 'OT_GLOBAL_LDAS', - 'OT_SHARED_READ', - 'OT_SHARED_WRITE', - 'OT_SHARED_ATOMIC', - 'OT_SHARED_HIST', - 'OT_SHARED_LDAS', - 'OT_PRIVATE_READ', - 'OT_PRIVATE_WRITE', - 'OT_PRIVATE_ATOMIC', - 'OT_PRIVATE_HIST', - 'OT_PRIVATE_LDAS', - 'OT_SPILL_READ', - 'OT_SPILL_WRITE', - 'OT_SPILL_ATOMIC', - 'OT_SPILL_HIST', - 'OT_SPILL_LDAS', - 'OT_READONLY_READ', - 'OT_READONLY_WRITE', - 'OT_READONLY_ATOMIC', - 'OT_READONLY_HIST', - 'OT_READONLY_LDAS', - 'OT_FLAT_READ', - 'OT_FLAT_WRITE', - 'OT_FLAT_ATOMIC', - 'OT_FLAT_HIST', - 'OT_FLAT_LDAS', - 'OT_KERN_READ', - 'OT_BRANCH', - - # note: Only the OT_BOTH_MEMFENCE seems to be supported in the 1.0F version - # of the compiler. - 'OT_SHARED_MEMFENCE', - 'OT_GLOBAL_MEMFENCE', - 'OT_BOTH_MEMFENCE', - - 'OT_BARRIER', - 'OT_PRINT', - 'OT_RET', - 'OT_NOP', - 'OT_ARG' - ] - class MemType(Enum): vals = [ 'M_U8', 'M_U16', @@ -235,47 +185,6 @@ class MemType(Enum): vals = [ 'M_F64', ] -class MemOpType(Enum): vals = [ - 'MO_LD', - 'MO_ST', - 'MO_LDAS', - 'MO_LDA', - 'MO_AAND', - 'MO_AOR', - 'MO_AXOR', - 'MO_ACAS', - 'MO_AEXCH', - 'MO_AADD', - 'MO_ASUB', - 'MO_AINC', - 'MO_ADEC', - 'MO_AMAX', - 'MO_AMIN', - 'MO_ANRAND', - 'MO_ANROR', - 'MO_ANRXOR', - 'MO_ANRCAS', - 'MO_ANREXCH', - 'MO_ANRADD', - 'MO_ANRSUB', - 'MO_ANRINC', - 'MO_ANRDEC', - 'MO_ANRMAX', - 'MO_ANRMIN', - 'MO_HAND', - 'MO_HOR', - 'MO_HXOR', - 'MO_HCAS', - 'MO_HEXCH', - 'MO_HADD', - 'MO_HSUB', - 'MO_HINC', - 'MO_HDEC', - 'MO_HMAX', - 'MO_HMIN', - 'MO_UNDEF' - ] - class StorageClassType(Enum): vals = [ 'SC_SPILL', 'SC_GLOBAL', @@ -293,20 +202,3 @@ class RegisterType(Enum): vals = [ 'RT_HARDWARE', 'RT_NONE', ] - -class GenericMemoryOrder(Enum): vals = [ - 'MEMORY_ORDER_NONE', - 'MEMORY_ORDER_RELAXED', - 'MEMORY_ORDER_SC_ACQUIRE', - 'MEMORY_ORDER_SC_RELEASE', - 'MEMORY_ORDER_SC_ACQUIRE_RELEASE', - ] - -class GenericMemoryScope(Enum): vals = [ - 'MEMORY_SCOPE_NONE', - 'MEMORY_SCOPE_WORKITEM', - 'MEMORY_SCOPE_WAVEFRONT', - 'MEMORY_SCOPE_WORKGROUP', - 'MEMORY_SCOPE_DEVICE', - 'MEMORY_SCOPE_SYSTEM', - ] diff --git a/src/gpu-compute/GPUStaticInstFlags.py b/src/gpu-compute/GPUStaticInstFlags.py new file mode 100644 index 000000000..453fdced2 --- /dev/null +++ b/src/gpu-compute/GPUStaticInstFlags.py @@ -0,0 +1,111 @@ +# Copyright (c) 2016 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +# Authors: Anthony Gutierrez + +from m5.params import * + +class GPUStaticInstFlags(Enum): + wrapper_name = 'GPUStaticInstFlags' + wrapper_is_struct = True + enum_name = 'Flags' + + vals = [ + # Op types + 'ALU', # ALU op + 'Branch', # Branch instruction + 'Nop', # No-op (no effect at all) + 'Return', # Return instruction + 'UnconditionalJump', # + 'SpecialOp', # Special op + 'Waitcnt', # Is a waitcnt instruction + + # Memory ops + 'MemBarrier', # Barrier instruction + 'MemFence', # Memory fence instruction + 'MemoryRef', # References memory (load, store, or atomic) + 'Flat', # Flat memory op + 'Load', # Reads from memory + 'Store', # Writes to memory + + # Atomic ops + 'AtomicReturn', # Atomic instruction that returns data + 'AtomicNoReturn', # Atomic instruction that doesn't return data + + # Instruction attributes + 'Scalar', # A scalar (not vector) operation + 'ReadsSCC', # The instruction reads SCC + 'WritesSCC', # The instruction writes SCC + 'ReadsVCC', # The instruction reads VCC + 'WritesVCC', # The instruction writes VCC + + # Atomic OP types + 'AtomicAnd', + 'AtomicOr', + 'AtomicXor', + 'AtomicCAS', + 'AtomicExch', + 'AtomicAdd', + 'AtomicSub', + 'AtomicInc', + 'AtomicDec', + 'AtomicMax', + 'AtomicMin', + + # Memory order flags + 'RelaxedOrder', + 'Acquire', # Has acquire semantics + 'Release', # Has release semantics + 'AcquireRelease', # Has acquire and release semantics + 'NoOrder', # Has no ordering restrictions + + # Segment access flags + 'ArgSegment', # Accesses the arg segment + 'GlobalSegment', # Accesses global memory + 'GroupSegment', # Accesses local memory (LDS), aka shared memory + 'KernArgSegment', # Accesses the kernel argument segment + 'PrivateSegment', # Accesses the private segment + 'ReadOnlySegment', # Accesses read only memory + 'SpillSegment', # Accesses the spill segment + 'NoSegment', # Does not have an associated segment + + # Scope flags + 'WorkitemScope', + 'WavefrontScope', + 'WorkgroupScope', + 'DeviceScope', + 'SystemScope', + 'NoScope', # Does not have an associated scope + + # Coherence flags + 'GloballyCoherent', # Coherent with other workitems on same device + 'SystemCoherent' # Coherent with a different device, or the host + ] diff --git a/src/gpu-compute/SConscript b/src/gpu-compute/SConscript index 88c1cf036..8cf1ed8cf 100644 --- a/src/gpu-compute/SConscript +++ b/src/gpu-compute/SConscript @@ -41,6 +41,7 @@ if not env['BUILD_GPU']: Return() SimObject('GPU.py') +SimObject('GPUStaticInstFlags.py') SimObject('LdsState.py') SimObject('X86GPUTLB.py') diff --git a/src/gpu-compute/code_enums.hh b/src/gpu-compute/code_enums.hh deleted file mode 100644 index 6cd9bfe26..000000000 --- a/src/gpu-compute/code_enums.hh +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Copyright (c) 
2015 Advanced Micro Devices, Inc. - * All rights reserved. - * - * For use for simulation and test purposes only - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * - * Author: Anthony Gutierrez - */ - -#ifndef __CODE_ENUMS_HH__ -#define __CODE_ENUMS_HH__ - -#define IS_OT_GLOBAL(a) ((a)>=Enums::OT_GLOBAL_READ \ - && (a)<=Enums::OT_GLOBAL_LDAS) -#define IS_OT_SHARED(a) ((a)>=Enums::OT_SHARED_READ \ - && (a)<=Enums::OT_SHARED_LDAS) -#define IS_OT_PRIVATE(a) ((a)>=Enums::OT_PRIVATE_READ \ - && (a)<=Enums::OT_PRIVATE_LDAS) -#define IS_OT_SPILL(a) ((a)>=Enums::OT_SPILL_READ \ - && (a)<=Enums::OT_SPILL_LDAS) -#define IS_OT_READONLY(a) ((a)>=Enums::OT_READONLY_READ \ - && (a)<=Enums::OT_READONLY_LDAS) -#define IS_OT_FLAT(a) ((a)>=Enums::OT_FLAT_READ && (a)<=Enums::OT_FLAT_LDAS) - -#define IS_OT_LDAS(a) ((a)==Enums::OT_GLOBAL_LDAS||(a)==Enums::OT_SHARED_LDAS \ - ||(a)==Enums::OT_PRIVATE_LDAS||(a)==Enums::OT_SPILL_LDAS \ - ||(a)==Enums::OT_READONLY_LDAS||(a)==Enums::OT_FLAT_LDAS) - -#define IS_OT_READ(a) ((a)==Enums::OT_GLOBAL_READ||(a)==Enums::OT_SHARED_READ \ - ||(a)==Enums::OT_PRIVATE_READ||(a)==Enums::OT_SPILL_READ \ - ||(a)==Enums::OT_READONLY_READ||(a)==Enums::OT_FLAT_READ) - -#define IS_OT_READ_GM(a) \ - ((a)==Enums::OT_GLOBAL_READ||(a)==Enums::OT_SPILL_READ \ - ||(a)==Enums::OT_READONLY_READ) - -#define IS_OT_READ_LM(a) ((a)==Enums::OT_SHARED_READ) - -#define IS_OT_READ_RM(a) ((a)==Enums::OT_READONLY_READ) - -#define IS_OT_READ_PM(a) ((a)==Enums::OT_PRIVATE_READ) - -#define IS_OT_WRITE(a) \ - ((a)==Enums::OT_GLOBAL_WRITE||(a)==Enums::OT_SHARED_WRITE \ - ||(a)==Enums::OT_PRIVATE_WRITE||(a)==Enums::OT_SPILL_WRITE \ - ||(a)==Enums::OT_READONLY_WRITE||(a)==Enums::OT_FLAT_WRITE) - -#define IS_OT_WRITE_GM(a) \ - ((a)==Enums::OT_GLOBAL_WRITE||(a)==Enums::OT_SPILL_WRITE \ - ||(a)==Enums::OT_READONLY_WRITE) - -#define IS_OT_WRITE_LM(a) ((a)==Enums::OT_SHARED_WRITE) - -#define IS_OT_WRITE_PM(a) ((a)==Enums::OT_PRIVATE_WRITE) - -#define IS_OT_ATOMIC(a) ((a)==Enums::OT_GLOBAL_ATOMIC \ - ||(a)==Enums::OT_SHARED_ATOMIC \ - 
||(a)==Enums::OT_PRIVATE_ATOMIC \ - ||(a)==Enums::OT_SPILL_ATOMIC \ - ||(a)==Enums::OT_READONLY_ATOMIC \ - ||(a)==Enums::OT_BOTH_MEMFENCE \ - ||(a)==Enums::OT_FLAT_ATOMIC) - -#define IS_OT_ATOMIC_GM(a) ((a)==Enums::OT_GLOBAL_ATOMIC \ - ||(a)==Enums::OT_SPILL_ATOMIC \ - ||(a)==Enums::OT_READONLY_ATOMIC \ - ||(a)==Enums::OT_GLOBAL_MEMFENCE \ - ||(a)==Enums::OT_BOTH_MEMFENCE) - -#define IS_OT_ATOMIC_LM(a) ((a)==Enums::OT_SHARED_ATOMIC \ - ||(a)==Enums::OT_SHARED_MEMFENCE) - -#define IS_OT_ATOMIC_PM(a) ((a)==Enums::OT_PRIVATE_ATOMIC) - -#define IS_OT_HIST(a) ((a)==Enums::OT_GLOBAL_HIST \ - ||(a)==Enums::OT_SHARED_HIST \ - ||(a)==Enums::OT_PRIVATE_HIST \ - ||(a)==Enums::OT_SPILL_HIST \ - ||(a)==Enums::OT_READONLY_HIST \ - ||(a)==Enums::OT_FLAT_HIST) - -#define IS_OT_HIST_GM(a) ((a)==Enums::OT_GLOBAL_HIST \ - ||(a)==Enums::OT_SPILL_HIST \ - ||(a)==Enums::OT_READONLY_HIST) - -#define IS_OT_HIST_LM(a) ((a)==Enums::OT_SHARED_HIST) - -#define IS_OT_HIST_PM(a) ((a)==Enums::OT_PRIVATE_HIST) - -#endif // __CODE_ENUMS_HH__ diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc index 97e018713..abf8ff2c5 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -75,7 +75,8 @@ ComputeUnit::ComputeUnit(const Params *p) : MemObject(p), fetchStage(p), req_tick_latency(p->mem_req_latency * p->clk_domain->clockPeriod()), resp_tick_latency(p->mem_resp_latency * p->clk_domain->clockPeriod()), _masterId(p->system->getMasterId(name() + ".ComputeUnit")), - lds(*p->localDataStore), globalSeqNum(0), wavefrontSize(p->wfSize) + lds(*p->localDataStore), globalSeqNum(0), wavefrontSize(p->wfSize), + kernelLaunchInst(new KernelLaunchStaticInst()) { /** * This check is necessary because std::bitset only provides conversion @@ -316,13 +317,11 @@ ComputeUnit::StartWorkgroup(NDRange *ndr) // Send L1 cache acquire // isKernel + isAcquire = Kernel Begin if (shader->impl_kern_boundary_sync) { - GPUDynInstPtr gpuDynInst = std::make_shared<GPUDynInst>(this, - nullptr, - nullptr, 0); + GPUDynInstPtr gpuDynInst = + std::make_shared<GPUDynInst>(this, nullptr, kernelLaunchInst, + getAndIncSeqNum()); gpuDynInst->useContinuation = false; - gpuDynInst->memoryOrder = Enums::MEMORY_ORDER_SC_ACQUIRE; - gpuDynInst->scope = Enums::MEMORY_SCOPE_SYSTEM; injectGlobalMemFence(gpuDynInst, true); } @@ -647,7 +646,7 @@ ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt) gpuDynInst->wfSlotId, w->barrierCnt); if (gpuDynInst->useContinuation) { - assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE); + assert(!gpuDynInst->isNoScope()); gpuDynInst->execContinuation(gpuDynInst->staticInstruction(), gpuDynInst); } @@ -658,7 +657,7 @@ ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt) return true; } else if (pkt->req->isKernel() && pkt->req->isAcquire()) { if (gpuDynInst->useContinuation) { - assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE); + assert(!gpuDynInst->isNoScope()); gpuDynInst->execContinuation(gpuDynInst->staticInstruction(), gpuDynInst); } @@ -942,6 +941,8 @@ void ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst, bool kernelLaunch, Request* req) { + assert(gpuDynInst->isGlobalSeg()); + if (!req) { req = new Request(0, 0, 0, 0, masterId(), 0, gpuDynInst->wfDynId); } @@ -950,8 +951,6 @@ ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst, bool kernelLaunch, req->setFlags(Request::KERNEL); } - gpuDynInst->s_type = SEG_GLOBAL; - // for non-kernel MemFence operations, memorder flags are set depending // on which type of request is currently being sent, so this // should be 
set by the caller (e.g. if an inst has acq-rel @@ -1033,8 +1032,7 @@ ComputeUnit::DataPort::MemRespEvent::process() if (gpuDynInst->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST) gpuDynInst->statusVector.clear(); - if (gpuDynInst->m_op == Enums::MO_LD || MO_A(gpuDynInst->m_op) - || MO_ANR(gpuDynInst->m_op)) { + if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) { assert(compute_unit->globalMemoryPipe.isGMLdRespFIFOWrRdy()); compute_unit->globalMemoryPipe.getGMLdRespFIFO() @@ -1055,7 +1053,7 @@ ComputeUnit::DataPort::MemRespEvent::process() // the continuation may generate more work for // this memory request if (gpuDynInst->useContinuation) { - assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE); + assert(!gpuDynInst->isNoScope()); gpuDynInst->execContinuation(gpuDynInst->staticInstruction(), gpuDynInst); } @@ -1065,7 +1063,7 @@ ComputeUnit::DataPort::MemRespEvent::process() gpuDynInst->statusBitVector = VectorMask(0); if (gpuDynInst->useContinuation) { - assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE); + assert(!gpuDynInst->isNoScope()); gpuDynInst->execContinuation(gpuDynInst->staticInstruction(), gpuDynInst); } diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh index a3547402a..938658fd1 100644 --- a/src/gpu-compute/compute_unit.hh +++ b/src/gpu-compute/compute_unit.hh @@ -744,6 +744,7 @@ class ComputeUnit : public MemObject private: uint64_t globalSeqNum; int wavefrontSize; + GPUStaticInst *kernelLaunchInst; }; #endif // __COMPUTE_UNIT_HH__ diff --git a/src/gpu-compute/global_memory_pipeline.cc b/src/gpu-compute/global_memory_pipeline.cc index 102905ec8..ab3e8c47e 100644 --- a/src/gpu-compute/global_memory_pipeline.cc +++ b/src/gpu-compute/global_memory_pipeline.cc @@ -67,7 +67,7 @@ GlobalMemPipeline::exec() bool accessVrf = true; // check the VRF to see if the operands of a load (or load component // of an atomic) are accessible - if ((m) && (m->m_op==Enums::MO_LD || MO_A(m->m_op))) { + if ((m) && (m->isLoad() || m->isAtomicRet())) { Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId]; accessVrf = @@ -127,10 +127,7 @@ GlobalMemPipeline::exec() // memory packets to DTLB if (!gmIssuedRequests.empty()) { GPUDynInstPtr mp = gmIssuedRequests.front(); - if (mp->m_op == Enums::MO_LD || - (mp->m_op >= Enums::MO_AAND && mp->m_op <= Enums::MO_AMIN) || - (mp->m_op >= Enums::MO_ANRAND && mp->m_op <= Enums::MO_ANRMIN)) { - + if (mp->isLoad() || mp->isAtomic()) { if (inflightLoads >= gmQueueSize) { return; } else { @@ -139,7 +136,7 @@ GlobalMemPipeline::exec() } else { if (inflightStores >= gmQueueSize) { return; - } else if (mp->m_op == Enums::MO_ST) { + } else if (mp->isStore()) { ++inflightStores; } } @@ -147,9 +144,8 @@ GlobalMemPipeline::exec() mp->initiateAcc(mp); gmIssuedRequests.pop(); - DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = %s\n", - computeUnit->cu_id, mp->simdId, mp->wfSlotId, - Enums::MemOpTypeStrings[mp->m_op]); + DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = \n", + computeUnit->cu_id, mp->simdId, mp->wfSlotId); } } @@ -160,12 +156,12 @@ GlobalMemPipeline::doGmReturn(GPUDynInstPtr m) Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId]; // Return data to registers - if (m->m_op == Enums::MO_LD || MO_A(m->m_op) || MO_ANR(m->m_op)) { + if (m->isLoad() || m->isAtomic()) { gmReturnedLoads.pop(); assert(inflightLoads > 0); --inflightLoads; - if (m->m_op == Enums::MO_LD || MO_A(m->m_op)) { + if (m->isLoad() || m->isAtomicRet()) { std::vector<uint32_t> regVec; // iterate over number of destination register operands since // this 
is a load or atomic operation @@ -214,13 +210,12 @@ GlobalMemPipeline::doGmReturn(GPUDynInstPtr m) // Decrement outstanding register count computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1); - if (m->m_op == Enums::MO_ST || MO_A(m->m_op) || MO_ANR(m->m_op) || - MO_H(m->m_op)) { + if (m->isStore() || m->isAtomic()) { computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrGm, m->time, -1); } - if (m->m_op == Enums::MO_LD || MO_A(m->m_op) || MO_ANR(m->m_op)) { + if (m->isLoad() || m->isAtomic()) { computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdGm, m->time, -1); } diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc index 1806e79e4..ec6340360 100644 --- a/src/gpu-compute/gpu_dyn_inst.cc +++ b/src/gpu-compute/gpu_dyn_inst.cc @@ -41,11 +41,10 @@ #include "gpu-compute/wavefront.hh" GPUDynInst::GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, - GPUStaticInst *_staticInst, uint64_t instSeqNum) + GPUStaticInst *static_inst, uint64_t instSeqNum) : GPUExecContext(_cu, _wf), addr(computeUnit()->wfSize(), (Addr)0), - m_op(Enums::MO_UNDEF), - memoryOrder(Enums::MEMORY_ORDER_NONE), n_reg(0), useContinuation(false), - statusBitVector(0), staticInst(_staticInst), _seqNum(instSeqNum) + n_reg(0), useContinuation(false), + statusBitVector(0), _staticInst(static_inst), _seqNum(instSeqNum) { tlbHitLevel.assign(computeUnit()->wfSize(), -1); d_data = new uint8_t[computeUnit()->wfSize() * 16]; @@ -68,77 +67,69 @@ GPUDynInst::~GPUDynInst() } void -GPUDynInst::execute() +GPUDynInst::execute(GPUDynInstPtr gpuDynInst) { - GPUDynInstPtr gpuDynInst = std::make_shared<GPUDynInst>(cu, wf, staticInst, - _seqNum); - staticInst->execute(gpuDynInst); + _staticInst->execute(gpuDynInst); } int GPUDynInst::numSrcRegOperands() { - return staticInst->numSrcRegOperands(); + return _staticInst->numSrcRegOperands(); } int GPUDynInst::numDstRegOperands() { - return staticInst->numDstRegOperands(); + return _staticInst->numDstRegOperands(); } int GPUDynInst::getNumOperands() { - return staticInst->getNumOperands(); + return _staticInst->getNumOperands(); } bool GPUDynInst::isVectorRegister(int operandIdx) { - return staticInst->isVectorRegister(operandIdx); + return _staticInst->isVectorRegister(operandIdx); } bool GPUDynInst::isScalarRegister(int operandIdx) { - return staticInst->isScalarRegister(operandIdx); + return _staticInst->isScalarRegister(operandIdx); } int GPUDynInst::getRegisterIndex(int operandIdx) { - return staticInst->getRegisterIndex(operandIdx); + return _staticInst->getRegisterIndex(operandIdx); } int GPUDynInst::getOperandSize(int operandIdx) { - return staticInst->getOperandSize(operandIdx); + return _staticInst->getOperandSize(operandIdx); } bool GPUDynInst::isDstOperand(int operandIdx) { - return staticInst->isDstOperand(operandIdx); + return _staticInst->isDstOperand(operandIdx); } bool GPUDynInst::isSrcOperand(int operandIdx) { - return staticInst->isSrcOperand(operandIdx); -} - -bool -GPUDynInst::isArgLoad() -{ - return staticInst->isArgLoad(); + return _staticInst->isSrcOperand(operandIdx); } const std::string& GPUDynInst::disassemble() const { - return staticInst->disassemble(); + return _staticInst->disassemble(); } uint64_t @@ -147,16 +138,10 @@ GPUDynInst::seqNum() const return _seqNum; } -Enums::OpType -GPUDynInst::opType() -{ - return staticInst->o_type; -} - Enums::StorageClassType GPUDynInst::executedAs() { - return staticInst->executed_as; + return _staticInst->executed_as; } // Process a memory instruction and (if necessary) submit timing request @@ 
-166,20 +151,347 @@ GPUDynInst::initiateAcc(GPUDynInstPtr gpuDynInst) DPRINTF(GPUMem, "CU%d: WF[%d][%d]: mempacket status bitvector=%#x\n", cu->cu_id, simdId, wfSlotId, exec_mask); - staticInst->initiateAcc(gpuDynInst); + _staticInst->initiateAcc(gpuDynInst); time = 0; } +/** + * accessor methods for the attributes of + * the underlying GPU static instruction + */ +bool +GPUDynInst::isALU() const +{ + return _staticInst->isALU(); +} + +bool +GPUDynInst::isBranch() const +{ + return _staticInst->isBranch(); +} + +bool +GPUDynInst::isNop() const +{ + return _staticInst->isNop(); +} + +bool +GPUDynInst::isReturn() const +{ + return _staticInst->isReturn(); +} + +bool +GPUDynInst::isUnconditionalJump() const +{ + return _staticInst->isUnconditionalJump(); +} + +bool +GPUDynInst::isSpecialOp() const +{ + return _staticInst->isSpecialOp(); +} + +bool +GPUDynInst::isWaitcnt() const +{ + return _staticInst->isWaitcnt(); +} + +bool +GPUDynInst::isBarrier() const +{ + return _staticInst->isBarrier(); +} + +bool +GPUDynInst::isMemFence() const +{ + return _staticInst->isMemFence(); +} + +bool +GPUDynInst::isMemRef() const +{ + return _staticInst->isMemRef(); +} + +bool +GPUDynInst::isFlat() const +{ + return _staticInst->isFlat(); +} + +bool +GPUDynInst::isLoad() const +{ + return _staticInst->isLoad(); +} + +bool +GPUDynInst::isStore() const +{ + return _staticInst->isStore(); +} + +bool +GPUDynInst::isAtomic() const +{ + return _staticInst->isAtomic(); +} + +bool +GPUDynInst::isAtomicNoRet() const +{ + return _staticInst->isAtomicNoRet(); +} + +bool +GPUDynInst::isAtomicRet() const +{ + return _staticInst->isAtomicRet(); +} + +bool +GPUDynInst::isScalar() const +{ + return _staticInst->isScalar(); +} + +bool +GPUDynInst::readsSCC() const +{ + return _staticInst->readsSCC(); +} + +bool +GPUDynInst::writesSCC() const +{ + return _staticInst->writesSCC(); +} + +bool +GPUDynInst::readsVCC() const +{ + return _staticInst->readsVCC(); +} + +bool +GPUDynInst::writesVCC() const +{ + return _staticInst->writesVCC(); +} + +bool +GPUDynInst::isAtomicAnd() const +{ + return _staticInst->isAtomicAnd(); +} + +bool +GPUDynInst::isAtomicOr() const +{ + return _staticInst->isAtomicOr(); +} + +bool +GPUDynInst::isAtomicXor() const +{ + return _staticInst->isAtomicXor(); +} + +bool +GPUDynInst::isAtomicCAS() const +{ + return _staticInst->isAtomicCAS(); +} + +bool GPUDynInst::isAtomicExch() const +{ + return _staticInst->isAtomicExch(); +} + +bool +GPUDynInst::isAtomicAdd() const +{ + return _staticInst->isAtomicAdd(); +} + +bool +GPUDynInst::isAtomicSub() const +{ + return _staticInst->isAtomicSub(); +} + +bool +GPUDynInst::isAtomicInc() const +{ + return _staticInst->isAtomicInc(); +} + +bool +GPUDynInst::isAtomicDec() const +{ + return _staticInst->isAtomicDec(); +} + +bool +GPUDynInst::isAtomicMax() const +{ + return _staticInst->isAtomicMax(); +} + +bool +GPUDynInst::isAtomicMin() const +{ + return _staticInst->isAtomicMin(); +} + +bool +GPUDynInst::isArgLoad() const +{ + return _staticInst->isArgLoad(); +} + +bool +GPUDynInst::isGlobalMem() const +{ + return _staticInst->isGlobalMem(); +} + +bool +GPUDynInst::isLocalMem() const +{ + return _staticInst->isLocalMem(); +} + +bool +GPUDynInst::isArgSeg() const +{ + return _staticInst->isArgSeg(); +} + +bool +GPUDynInst::isGlobalSeg() const +{ + return _staticInst->isGlobalSeg(); +} + +bool +GPUDynInst::isGroupSeg() const +{ + return _staticInst->isGroupSeg(); +} + +bool +GPUDynInst::isKernArgSeg() const +{ + return _staticInst->isKernArgSeg(); +} + +bool 
+GPUDynInst::isPrivateSeg() const +{ + return _staticInst->isPrivateSeg(); +} + +bool +GPUDynInst::isReadOnlySeg() const +{ + return _staticInst->isReadOnlySeg(); +} + +bool +GPUDynInst::isSpillSeg() const +{ + return _staticInst->isSpillSeg(); +} + +bool +GPUDynInst::isWorkitemScope() const +{ + return _staticInst->isWorkitemScope(); +} + +bool +GPUDynInst::isWavefrontScope() const +{ + return _staticInst->isWavefrontScope(); +} + +bool +GPUDynInst::isWorkgroupScope() const +{ + return _staticInst->isWorkgroupScope(); +} + +bool +GPUDynInst::isDeviceScope() const +{ + return _staticInst->isDeviceScope(); +} + +bool +GPUDynInst::isSystemScope() const +{ + return _staticInst->isSystemScope(); +} + +bool +GPUDynInst::isNoScope() const +{ + return _staticInst->isNoScope(); +} + +bool +GPUDynInst::isRelaxedOrder() const +{ + return _staticInst->isRelaxedOrder(); +} + +bool +GPUDynInst::isAcquire() const +{ + return _staticInst->isAcquire(); +} + +bool +GPUDynInst::isRelease() const +{ + return _staticInst->isRelease(); +} + +bool +GPUDynInst::isAcquireRelease() const +{ + return _staticInst->isAcquireRelease(); +} + +bool +GPUDynInst::isNoOrder() const +{ + return _staticInst->isNoOrder(); +} + +bool +GPUDynInst::isGloballyCoherent() const +{ + return _staticInst->isGloballyCoherent(); +} + bool -GPUDynInst::scalarOp() const +GPUDynInst::isSystemCoherent() const { - return staticInst->scalarOp(); + return _staticInst->isSystemCoherent(); } void GPUDynInst::updateStats() { - if (staticInst->isLocalMem()) { + if (_staticInst->isLocalMem()) { // access to LDS (shared) memory cu->dynamicLMemInstrCnt++; } else { diff --git a/src/gpu-compute/gpu_dyn_inst.hh b/src/gpu-compute/gpu_dyn_inst.hh index 46774d867..c07d85d78 100644 --- a/src/gpu-compute/gpu_dyn_inst.hh +++ b/src/gpu-compute/gpu_dyn_inst.hh @@ -39,11 +39,7 @@ #include <cstdint> #include <string> -#include "enums/GenericMemoryOrder.hh" -#include "enums/GenericMemoryScope.hh" -#include "enums/MemOpType.hh" #include "enums/MemType.hh" -#include "enums/OpType.hh" #include "enums/StorageClassType.hh" #include "gpu-compute/compute_unit.hh" #include "gpu-compute/gpu_exec_context.hh" @@ -180,33 +176,19 @@ class AtomicOpMin : public TypedAtomicOpFunctor<T> } }; -#define MO_A(a) ((a)>=Enums::MO_AAND && (a)<=Enums::MO_AMIN) -#define MO_ANR(a) ((a)>=Enums::MO_ANRAND && (a)<=Enums::MO_ANRMIN) -#define MO_H(a) ((a)>=Enums::MO_HAND && (a)<=Enums::MO_HMIN) - typedef enum { VT_32, VT_64, } vgpr_type; -typedef enum -{ - SEG_PRIVATE, - SEG_SPILL, - SEG_GLOBAL, - SEG_SHARED, - SEG_READONLY, - SEG_FLAT -} seg_type; - class GPUDynInst : public GPUExecContext { public: - GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *_staticInst, + GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *static_inst, uint64_t instSeqNum); ~GPUDynInst(); - void execute(); + void execute(GPUDynInstPtr gpuDynInst); int numSrcRegOperands(); int numDstRegOperands(); int getNumOperands(); @@ -216,13 +198,11 @@ class GPUDynInst : public GPUExecContext int getOperandSize(int operandIdx); bool isDstOperand(int operandIdx); bool isSrcOperand(int operandIdx); - bool isArgLoad(); const std::string &disassemble() const; uint64_t seqNum() const; - Enums::OpType opType(); Enums::StorageClassType executedAs(); // The address of the memory operation @@ -240,14 +220,7 @@ class GPUDynInst : public GPUExecContext // The memory type (M_U32, M_S32, ...) Enums::MemType m_type; - // The memory operation (MO_LD, MO_ST, ...) 
- Enums::MemOpType m_op; - Enums::GenericMemoryOrder memoryOrder; - - // Scope of the request - Enums::GenericMemoryScope scope; - // The memory segment (SEG_SHARED, SEG_GLOBAL, ...) - seg_type s_type; + // The equivalency class int equiv; // The return VGPR type (VT_32 or VT_64) @@ -288,10 +261,72 @@ class GPUDynInst : public GPUExecContext void updateStats(); - GPUStaticInst* staticInstruction() { return staticInst; } - - // Is the instruction a scalar or vector op? - bool scalarOp() const; + GPUStaticInst* staticInstruction() { return _staticInst; } + + bool isALU() const; + bool isBranch() const; + bool isNop() const; + bool isReturn() const; + bool isUnconditionalJump() const; + bool isSpecialOp() const; + bool isWaitcnt() const; + + bool isBarrier() const; + bool isMemFence() const; + bool isMemRef() const; + bool isFlat() const; + bool isLoad() const; + bool isStore() const; + + bool isAtomic() const; + bool isAtomicNoRet() const; + bool isAtomicRet() const; + + bool isScalar() const; + bool readsSCC() const; + bool writesSCC() const; + bool readsVCC() const; + bool writesVCC() const; + + bool isAtomicAnd() const; + bool isAtomicOr() const; + bool isAtomicXor() const; + bool isAtomicCAS() const; + bool isAtomicExch() const; + bool isAtomicAdd() const; + bool isAtomicSub() const; + bool isAtomicInc() const; + bool isAtomicDec() const; + bool isAtomicMax() const; + bool isAtomicMin() const; + + bool isArgLoad() const; + bool isGlobalMem() const; + bool isLocalMem() const; + + bool isArgSeg() const; + bool isGlobalSeg() const; + bool isGroupSeg() const; + bool isKernArgSeg() const; + bool isPrivateSeg() const; + bool isReadOnlySeg() const; + bool isSpillSeg() const; + + bool isWorkitemScope() const; + bool isWavefrontScope() const; + bool isWorkgroupScope() const; + bool isDeviceScope() const; + bool isSystemScope() const; + bool isNoScope() const; + + bool isRelaxedOrder() const; + bool isAcquire() const; + bool isRelease() const; + bool isAcquireRelease() const; + bool isNoOrder() const; + + bool isGloballyCoherent() const; + bool isSystemCoherent() const; /* * Loads/stores/atomics may have acquire/release semantics associated @@ -312,46 +347,32 @@ class GPUDynInst : public GPUExecContext bool useContinuation; template<typename c0> AtomicOpFunctor* - makeAtomicOpFunctor(c0 *reg0, c0 *reg1, Enums::MemOpType op) + makeAtomicOpFunctor(c0 *reg0, c0 *reg1) { - using namespace Enums; - - switch(op) { - case MO_AAND: - case MO_ANRAND: + if (isAtomicAnd()) { return new AtomicOpAnd<c0>(*reg0); - case MO_AOR: - case MO_ANROR: + } else if (isAtomicOr()) { return new AtomicOpOr<c0>(*reg0); - case MO_AXOR: - case MO_ANRXOR: + } else if (isAtomicXor()) { return new AtomicOpXor<c0>(*reg0); - case MO_ACAS: - case MO_ANRCAS: + } else if (isAtomicCAS()) { return new AtomicOpCAS<c0>(*reg0, *reg1, cu); - case MO_AEXCH: - case MO_ANREXCH: + } else if (isAtomicExch()) { return new AtomicOpExch<c0>(*reg0); - case MO_AADD: - case MO_ANRADD: + } else if (isAtomicAdd()) { return new AtomicOpAdd<c0>(*reg0); - case MO_ASUB: - case MO_ANRSUB: + } else if (isAtomicSub()) { return new AtomicOpSub<c0>(*reg0); - case MO_AINC: - case MO_ANRINC: + } else if (isAtomicInc()) { return new AtomicOpInc<c0>(); - case MO_ADEC: - case MO_ANRDEC: + } else if (isAtomicDec()) { return new AtomicOpDec<c0>(); - case MO_AMAX: - case MO_ANRMAX: + } else if (isAtomicMax()) { return new AtomicOpMax<c0>(*reg0); - case MO_AMIN: - case MO_ANRMIN: + } else if (isAtomicMin()) { return new AtomicOpMin<c0>(*reg0); - default: - 
panic("Unrecognized atomic operation"); + } else { + fatal("Unrecognized atomic operation"); } } @@ -359,88 +380,58 @@ class GPUDynInst : public GPUExecContext setRequestFlags(Request *req, bool setMemOrder=true) { // currently these are the easy scopes to deduce - switch (s_type) { - case SEG_PRIVATE: + if (isPrivateSeg()) { req->setMemSpaceConfigFlags(Request::PRIVATE_SEGMENT); - break; - case SEG_SPILL: + } else if (isSpillSeg()) { req->setMemSpaceConfigFlags(Request::SPILL_SEGMENT); - break; - case SEG_GLOBAL: + } else if (isGlobalSeg()) { req->setMemSpaceConfigFlags(Request::GLOBAL_SEGMENT); - break; - case SEG_READONLY: + } else if (isReadOnlySeg()) { req->setMemSpaceConfigFlags(Request::READONLY_SEGMENT); - break; - case SEG_SHARED: + } else if (isGroupSeg()) { req->setMemSpaceConfigFlags(Request::GROUP_SEGMENT); - break; - case SEG_FLAT: + } else if (isFlat()) { // TODO: translate to correct scope assert(false); - default: - panic("Bad segment type"); - break; + } else { + fatal("%s has bad segment type\n", disassemble()); } - switch (scope) { - case Enums::MEMORY_SCOPE_NONE: - case Enums::MEMORY_SCOPE_WORKITEM: - break; - case Enums::MEMORY_SCOPE_WAVEFRONT: + if (isWavefrontScope()) { req->setMemSpaceConfigFlags(Request::SCOPE_VALID | Request::WAVEFRONT_SCOPE); - break; - case Enums::MEMORY_SCOPE_WORKGROUP: + } else if (isWorkgroupScope()) { req->setMemSpaceConfigFlags(Request::SCOPE_VALID | Request::WORKGROUP_SCOPE); - break; - case Enums::MEMORY_SCOPE_DEVICE: + } else if (isDeviceScope()) { req->setMemSpaceConfigFlags(Request::SCOPE_VALID | Request::DEVICE_SCOPE); - break; - case Enums::MEMORY_SCOPE_SYSTEM: + } else if (isSystemScope()) { req->setMemSpaceConfigFlags(Request::SCOPE_VALID | Request::SYSTEM_SCOPE); - break; - default: - panic("Bad scope type"); - break; + } else if (!isNoScope() && !isWorkitemScope()) { + fatal("%s has bad scope type\n", disassemble()); } if (setMemOrder) { // set acquire and release flags - switch (memoryOrder){ - case Enums::MEMORY_ORDER_SC_ACQUIRE: + if (isAcquire()) { req->setFlags(Request::ACQUIRE); - break; - case Enums::MEMORY_ORDER_SC_RELEASE: + } else if (isRelease()) { req->setFlags(Request::RELEASE); - break; - case Enums::MEMORY_ORDER_SC_ACQUIRE_RELEASE: + } else if (isAcquireRelease()) { req->setFlags(Request::ACQUIRE | Request::RELEASE); - break; - default: - break; + } else if (!isNoOrder()) { + fatal("%s has bad memory order\n", disassemble()); } } // set atomic type // currently, the instruction genenerator only produces atomic return // but a magic instruction can produce atomic no return - if (m_op == Enums::MO_AADD || m_op == Enums::MO_ASUB || - m_op == Enums::MO_AAND || m_op == Enums::MO_AOR || - m_op == Enums::MO_AXOR || m_op == Enums::MO_AMAX || - m_op == Enums::MO_AMIN || m_op == Enums::MO_AINC || - m_op == Enums::MO_ADEC || m_op == Enums::MO_AEXCH || - m_op == Enums::MO_ACAS) { + if (isAtomicRet()) { req->setFlags(Request::ATOMIC_RETURN_OP); - } else if (m_op == Enums::MO_ANRADD || m_op == Enums::MO_ANRSUB || - m_op == Enums::MO_ANRAND || m_op == Enums::MO_ANROR || - m_op == Enums::MO_ANRXOR || m_op == Enums::MO_ANRMAX || - m_op == Enums::MO_ANRMIN || m_op == Enums::MO_ANRINC || - m_op == Enums::MO_ANRDEC || m_op == Enums::MO_ANREXCH || - m_op == Enums::MO_ANRCAS) { + } else if (isAtomicNoRet()) { req->setFlags(Request::ATOMIC_NO_RETURN_OP); } } @@ -457,7 +448,7 @@ class GPUDynInst : public GPUExecContext std::vector<int> tlbHitLevel; private: - GPUStaticInst *staticInst; + GPUStaticInst *_staticInst; uint64_t _seqNum; }; 
diff --git a/src/gpu-compute/gpu_static_inst.cc b/src/gpu-compute/gpu_static_inst.cc index 83b429e62..0f74bd532 100644 --- a/src/gpu-compute/gpu_static_inst.cc +++ b/src/gpu-compute/gpu_static_inst.cc @@ -36,10 +36,12 @@ #include "gpu-compute/gpu_static_inst.hh" GPUStaticInst::GPUStaticInst(const std::string &opcode) - : o_type(Enums::OT_ALU), executed_as(Enums::SC_NONE), opcode(opcode), - _instNum(0), _scalarOp(false) + : executed_as(Enums::SC_NONE), opcode(opcode), + _instNum(0) { + setFlag(NoOrder); } + const std::string& GPUStaticInst::disassemble() { diff --git a/src/gpu-compute/gpu_static_inst.hh b/src/gpu-compute/gpu_static_inst.hh index 911e4f308..a73ec12e3 100644 --- a/src/gpu-compute/gpu_static_inst.hh +++ b/src/gpu-compute/gpu_static_inst.hh @@ -48,7 +48,7 @@ #include <cstdint> #include <string> -#include "enums/OpType.hh" +#include "enums/GPUStaticInstFlags.hh" #include "enums/StorageClassType.hh" #include "gpu-compute/gpu_dyn_inst.hh" #include "gpu-compute/misc.hh" @@ -57,7 +57,7 @@ class BaseOperand; class BaseRegOperand; class Wavefront; -class GPUStaticInst +class GPUStaticInst : public GPUStaticInstFlags { public: GPUStaticInst(const std::string &opcode); @@ -86,22 +86,110 @@ class GPUStaticInst virtual bool isValid() const = 0; - /* - * Most instructions (including all HSAIL instructions) - * are vector ops, so _scalarOp will be false by default. - * Derived instruction objects that are scalar ops must - * set _scalarOp to true in their constructors. - */ - bool scalarOp() const { return _scalarOp; } + bool isALU() const { return _flags[ALU]; } + bool isBranch() const { return _flags[Branch]; } + bool isNop() const { return _flags[Nop]; } + bool isReturn() const { return _flags[Return]; } + + bool + isUnconditionalJump() const + { + return _flags[UnconditionalJump]; + } + + bool isSpecialOp() const { return _flags[SpecialOp]; } + bool isWaitcnt() const { return _flags[Waitcnt]; } + + bool isBarrier() const { return _flags[MemBarrier]; } + bool isMemFence() const { return _flags[MemFence]; } + bool isMemRef() const { return _flags[MemoryRef]; } + bool isFlat() const { return _flags[Flat]; } + bool isLoad() const { return _flags[Load]; } + bool isStore() const { return _flags[Store]; } + + bool + isAtomic() const + { + return _flags[AtomicReturn] || _flags[AtomicNoReturn]; + } + + bool isAtomicNoRet() const { return _flags[AtomicNoReturn]; } + bool isAtomicRet() const { return _flags[AtomicReturn]; } + + bool isScalar() const { return _flags[Scalar]; } + bool readsSCC() const { return _flags[ReadsSCC]; } + bool writesSCC() const { return _flags[WritesSCC]; } + bool readsVCC() const { return _flags[ReadsVCC]; } + bool writesVCC() const { return _flags[WritesVCC]; } - virtual bool isLocalMem() const + bool isAtomicAnd() const { return _flags[AtomicAnd]; } + bool isAtomicOr() const { return _flags[AtomicOr]; } + bool isAtomicXor() const { return _flags[AtomicXor]; } + bool isAtomicCAS() const { return _flags[AtomicCAS]; } + bool isAtomicExch() const { return _flags[AtomicExch]; } + bool isAtomicAdd() const { return _flags[AtomicAdd]; } + bool isAtomicSub() const { return _flags[AtomicSub]; } + bool isAtomicInc() const { return _flags[AtomicInc]; } + bool isAtomicDec() const { return _flags[AtomicDec]; } + bool isAtomicMax() const { return _flags[AtomicMax]; } + bool isAtomicMin() const { return _flags[AtomicMin]; } + + bool + isArgLoad() const + { + return (_flags[KernArgSegment] || _flags[ArgSegment]) && _flags[Load]; + } + + bool + isGlobalMem() const { - fatal("calling 
isLocalMem() on non-memory instruction.\n"); + return _flags[MemoryRef] && (_flags[GlobalSegment] || + _flags[PrivateSegment] || _flags[ReadOnlySegment] || + _flags[SpillSegment]); + } - return false; + bool + isLocalMem() const + { + return _flags[MemoryRef] && _flags[GroupSegment]; } - bool isArgLoad() { return false; } + bool isArgSeg() const { return _flags[ArgSegment]; } + bool isGlobalSeg() const { return _flags[GlobalSegment]; } + bool isGroupSeg() const { return _flags[GroupSegment]; } + bool isKernArgSeg() const { return _flags[KernArgSegment]; } + bool isPrivateSeg() const { return _flags[PrivateSegment]; } + bool isReadOnlySeg() const { return _flags[ReadOnlySegment]; } + bool isSpillSeg() const { return _flags[SpillSegment]; } + + bool isWorkitemScope() const { return _flags[WorkitemScope]; } + bool isWavefrontScope() const { return _flags[WavefrontScope]; } + bool isWorkgroupScope() const { return _flags[WorkgroupScope]; } + bool isDeviceScope() const { return _flags[DeviceScope]; } + bool isSystemScope() const { return _flags[SystemScope]; } + bool isNoScope() const { return _flags[NoScope]; } + + bool isRelaxedOrder() const { return _flags[RelaxedOrder]; } + bool isAcquire() const { return _flags[Acquire]; } + bool isRelease() const { return _flags[Release]; } + bool isAcquireRelease() const { return _flags[AcquireRelease]; } + bool isNoOrder() const { return _flags[NoOrder]; } + + /** + * Coherence domain of a memory instruction. Only valid for + * machine ISA. The coherence domain specifies where it is + * possible to perform memory synchronization, e.g., acquire + * or release, from the shader kernel. + * + * isGloballyCoherent(): returns true if kernel is sharing memory + * with other work-items on the same device (GPU) + * + * isSystemCoherent(): returns true if kernel is sharing memory + * with other work-items on a different device (GPU) or the host (CPU) + */ + bool isGloballyCoherent() const { return _flags[GloballyCoherent]; } + bool isSystemCoherent() const { return _flags[SystemCoherent]; } + virtual uint32_t instSize() = 0; // only used for memory instructions @@ -120,22 +208,13 @@ class GPUStaticInst virtual uint32_t getTargetPc() { return 0; } - /** - * Query whether the instruction is an unconditional jump i.e., the jump - * is always executed because there is no condition to be evaluated. - * - * If the instruction is not of branch type, the result is always false. - * - * @return True if the instruction is an unconditional jump. 
- */ - virtual bool unconditionalJumpInstruction() { return false; } - static uint64_t dynamic_id_count; - Enums::OpType o_type; // For flat memory accesses Enums::StorageClassType executed_as; + void setFlag(Flags flag) { _flags[flag] = true; } + protected: virtual void execLdAcq(GPUDynInstPtr gpuDynInst) @@ -169,7 +248,45 @@ class GPUStaticInst */ int _ipdInstNum; - bool _scalarOp; + std::bitset<Num_Flags> _flags; +}; + +class KernelLaunchStaticInst : public GPUStaticInst +{ + public: + KernelLaunchStaticInst() : GPUStaticInst("kernel_launch") + { + setFlag(Nop); + setFlag(Scalar); + setFlag(Acquire); + setFlag(SystemScope); + setFlag(GlobalSegment); + } + + void + execute(GPUDynInstPtr gpuDynInst) + { + fatal("kernel launch instruction should not be executed\n"); + } + + void + generateDisassembly() + { + disassembly = opcode; + } + + int getNumOperands() { return 0; } + bool isCondRegister(int operandIndex) { return false; } + bool isScalarRegister(int operandIndex) { return false; } + bool isVectorRegister(int operandIndex) { return false; } + bool isSrcOperand(int operandIndex) { return false; } + bool isDstOperand(int operandIndex) { return false; } + int getOperandSize(int operandIndex) { return 0; } + int getRegisterIndex(int operandIndex) { return 0; } + int numDstRegOperands() { return 0; } + int numSrcRegOperands() { return 0; } + bool isValid() const { return true; } + uint32_t instSize() { return 0; } }; #endif // __GPU_STATIC_INST_HH__ diff --git a/src/gpu-compute/kernel_cfg.cc b/src/gpu-compute/kernel_cfg.cc index 10ded11b7..ac6a81b16 100644 --- a/src/gpu-compute/kernel_cfg.cc +++ b/src/gpu-compute/kernel_cfg.cc @@ -104,7 +104,7 @@ ControlFlowInfo::createBasicBlocks() leaders.insert(0); for (int i = 1; i < instructions.size(); i++) { GPUStaticInst* instruction = instructions[i]; - if (instruction->o_type == Enums::OT_BRANCH) { + if (instruction->isBranch()) { const int target_pc = instruction->getTargetPc(); leaders.insert(target_pc); leaders.insert(i + 1); @@ -137,18 +137,18 @@ ControlFlowInfo::connectBasicBlocks() break; } GPUStaticInst* last = lastInstruction(bb.get()); - if (last->o_type == Enums::OT_RET) { + if (last->isReturn()) { bb->successorIds.insert(exit_bb->id); continue; } - if (last->o_type == Enums::OT_BRANCH) { + if (last->isBranch()) { const uint32_t target_pc = last->getTargetPc(); BasicBlock* target_bb = basicBlock(target_pc); bb->successorIds.insert(target_bb->id); } // Unconditional jump instructions have a unique successor - if (!last->unconditionalJumpInstruction()) { + if (!last->isUnconditionalJump()) { BasicBlock* next_bb = basicBlock(last->instNum() + 1); bb->successorIds.insert(next_bb->id); } @@ -274,7 +274,7 @@ ControlFlowInfo::printBasicBlocks() const int inst_num = inst->instNum(); std::cout << inst_num << " [" << basicBlock(inst_num)->id << "]: " << inst->disassemble(); - if (inst->o_type == Enums::OT_BRANCH) { + if (inst->isBranch()) { std::cout << ", PC = " << inst->getTargetPc(); } std::cout << std::endl; diff --git a/src/gpu-compute/lds_state.cc b/src/gpu-compute/lds_state.cc index d4a27318a..fad98c886 100644 --- a/src/gpu-compute/lds_state.cc +++ b/src/gpu-compute/lds_state.cc @@ -141,8 +141,7 @@ LdsState::countBankConflicts(GPUDynInstPtr gpuDynInst, } } - if (gpuDynInst->m_op == Enums::MO_LD || - gpuDynInst->m_op == Enums::MO_ST) { + if (gpuDynInst->isLoad() || gpuDynInst->isStore()) { // mask identical addresses for (int j = 0; j < numBanks; ++j) { for (int j0 = 0; j0 < j; j0++) { @@ -208,8 +207,8 @@ LdsState::processPacket(PacketPtr 
packet) GPUDynInstPtr dynInst = getDynInstr(packet); // account for the LDS bank conflict overhead - int busLength = (dynInst->m_op == Enums::MO_LD) ? parent->loadBusLength() : - (dynInst->m_op == Enums::MO_ST) ? parent->storeBusLength() : + int busLength = (dynInst->isLoad()) ? parent->loadBusLength() : + (dynInst->isStore()) ? parent->storeBusLength() : parent->loadBusLength(); // delay for accessing the LDS Tick processingTime = diff --git a/src/gpu-compute/lds_state.hh b/src/gpu-compute/lds_state.hh index 58d109493..5fcbe82c0 100644 --- a/src/gpu-compute/lds_state.hh +++ b/src/gpu-compute/lds_state.hh @@ -43,7 +43,6 @@ #include <utility> #include <vector> -#include "enums/MemOpType.hh" #include "enums/MemType.hh" #include "gpu-compute/misc.hh" #include "mem/mem_object.hh" diff --git a/src/gpu-compute/local_memory_pipeline.cc b/src/gpu-compute/local_memory_pipeline.cc index e2238bf45..80dad6fcd 100644 --- a/src/gpu-compute/local_memory_pipeline.cc +++ b/src/gpu-compute/local_memory_pipeline.cc @@ -62,7 +62,7 @@ LocalMemPipeline::exec() lmReturnedRequests.front() : nullptr; bool accessVrf = true; - if ((m) && (m->m_op==Enums::MO_LD || MO_A(m->m_op))) { + if ((m) && (m->isLoad() || m->isAtomicRet())) { Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId]; accessVrf = @@ -137,7 +137,7 @@ LocalMemPipeline::doSmReturn(GPUDynInstPtr m) Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId]; // Return data to registers - if (m->m_op == Enums::MO_LD || MO_A(m->m_op)) { + if (m->isLoad() || m->isAtomicRet()) { std::vector<uint32_t> regVec; for (int k = 0; k < m->n_reg; ++k) { int dst = m->dst_reg+k; @@ -172,13 +172,12 @@ LocalMemPipeline::doSmReturn(GPUDynInstPtr m) // Decrement outstanding request count computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1); - if (m->m_op == Enums::MO_ST || MO_A(m->m_op) || MO_ANR(m->m_op) - || MO_H(m->m_op)) { + if (m->isStore() || m->isAtomic()) { computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrLm, m->time, -1); } - if (m->m_op == Enums::MO_LD || MO_A(m->m_op) || MO_ANR(m->m_op)) { + if (m->isLoad() || m->isAtomic()) { computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdLm, m->time, -1); } diff --git a/src/gpu-compute/shader.hh b/src/gpu-compute/shader.hh index c1f741d6a..13afab977 100644 --- a/src/gpu-compute/shader.hh +++ b/src/gpu-compute/shader.hh @@ -47,7 +47,6 @@ #include "cpu/simple_thread.hh" #include "cpu/thread_context.hh" #include "cpu/thread_state.hh" -#include "enums/MemOpType.hh" #include "enums/MemType.hh" #include "gpu-compute/compute_unit.hh" #include "gpu-compute/gpu_tlb.hh" diff --git a/src/gpu-compute/vector_register_file.cc b/src/gpu-compute/vector_register_file.cc index c43d765af..c50c06cc6 100644 --- a/src/gpu-compute/vector_register_file.cc +++ b/src/gpu-compute/vector_register_file.cc @@ -38,7 +38,6 @@ #include <string> #include "base/misc.hh" -#include "gpu-compute/code_enums.hh" #include "gpu-compute/compute_unit.hh" #include "gpu-compute/gpu_dyn_inst.hh" #include "gpu-compute/shader.hh" @@ -153,8 +152,8 @@ VectorRegisterFile::operandsReady(Wavefront *w, GPUDynInstPtr ii) const void VectorRegisterFile::exec(GPUDynInstPtr ii, Wavefront *w) { - bool loadInstr = IS_OT_READ(ii->opType()); - bool atomicInstr = IS_OT_ATOMIC(ii->opType()); + bool loadInstr = ii->isLoad(); + bool atomicInstr = ii->isAtomic() || ii->isMemFence(); bool loadNoArgInstr = loadInstr && !ii->isArgLoad(); diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc index c677cbe41..caeed85a7 100644 --- 
a/src/gpu-compute/wavefront.cc +++ b/src/gpu-compute/wavefront.cc @@ -37,7 +37,6 @@ #include "debug/GPUExec.hh" #include "debug/WavefrontStack.hh" -#include "gpu-compute/code_enums.hh" #include "gpu-compute/compute_unit.hh" #include "gpu-compute/gpu_dyn_inst.hh" #include "gpu-compute/shader.hh" @@ -165,19 +164,8 @@ Wavefront::start(uint64_t _wf_dyn_id,uint64_t _base_ptr) bool Wavefront::isGmInstruction(GPUDynInstPtr ii) { - if (IS_OT_READ_PM(ii->opType()) || IS_OT_WRITE_PM(ii->opType()) || - IS_OT_ATOMIC_PM(ii->opType())) { + if (ii->isGlobalMem() || ii->isFlat()) return true; - } - - if (IS_OT_READ_GM(ii->opType()) || IS_OT_WRITE_GM(ii->opType()) || - IS_OT_ATOMIC_GM(ii->opType())) { - return true; - } - - if (IS_OT_FLAT(ii->opType())) { - return true; - } return false; } @@ -185,8 +173,7 @@ Wavefront::isGmInstruction(GPUDynInstPtr ii) bool Wavefront::isLmInstruction(GPUDynInstPtr ii) { - if (IS_OT_READ_LM(ii->opType()) || IS_OT_WRITE_LM(ii->opType()) || - IS_OT_ATOMIC_LM(ii->opType())) { + if (ii->isLocalMem()) { return true; } @@ -199,10 +186,9 @@ Wavefront::isOldestInstALU() assert(!instructionBuffer.empty()); GPUDynInstPtr ii = instructionBuffer.front(); - if (status != S_STOPPED && (ii->opType() == Enums::OT_NOP || - ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH || - ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) || - ii->opType() == Enums::OT_KERN_READ)) { + if (status != S_STOPPED && (ii->isNop() || + ii->isReturn() || ii->isBranch() || + ii->isALU() || (ii->isKernArgSeg() && ii->isLoad()))) { return true; } @@ -215,7 +201,7 @@ Wavefront::isOldestInstBarrier() assert(!instructionBuffer.empty()); GPUDynInstPtr ii = instructionBuffer.front(); - if (status != S_STOPPED && ii->opType() == Enums::OT_BARRIER) { + if (status != S_STOPPED && ii->isBarrier()) { return true; } @@ -228,9 +214,7 @@ Wavefront::isOldestInstGMem() assert(!instructionBuffer.empty()); GPUDynInstPtr ii = instructionBuffer.front(); - if (status != S_STOPPED && (IS_OT_READ_GM(ii->opType()) || - IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()))) { - + if (status != S_STOPPED && ii->isGlobalMem()) { return true; } @@ -243,9 +227,7 @@ Wavefront::isOldestInstLMem() assert(!instructionBuffer.empty()); GPUDynInstPtr ii = instructionBuffer.front(); - if (status != S_STOPPED && (IS_OT_READ_LM(ii->opType()) || - IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()))) { - + if (status != S_STOPPED && ii->isLocalMem()) { return true; } @@ -258,9 +240,7 @@ Wavefront::isOldestInstPrivMem() assert(!instructionBuffer.empty()); GPUDynInstPtr ii = instructionBuffer.front(); - if (status != S_STOPPED && (IS_OT_READ_PM(ii->opType()) || - IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()))) { - + if (status != S_STOPPED && ii->isPrivateSeg()) { return true; } @@ -273,8 +253,7 @@ Wavefront::isOldestInstFlatMem() assert(!instructionBuffer.empty()); GPUDynInstPtr ii = instructionBuffer.front(); - if (status != S_STOPPED && IS_OT_FLAT(ii->opType())) { - + if (status != S_STOPPED && ii->isFlat()) { return true; } @@ -289,7 +268,7 @@ Wavefront::instructionBufferHasBranch() for (auto it : instructionBuffer) { GPUDynInstPtr ii = it; - if (ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH) { + if (ii->isReturn() || ii->isBranch()) { return true; } } @@ -371,23 +350,16 @@ Wavefront::ready(itype_e type) // checking readiness will be fixed eventually. 
In the meantime, let's // make sure that we do not silently let an instruction type slip // through this logic and always return not ready. - if (!(ii->opType() == Enums::OT_BARRIER || ii->opType() == Enums::OT_NOP || - ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH || - ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) || - ii->opType() == Enums::OT_KERN_READ || - ii->opType() == Enums::OT_ARG || - IS_OT_READ_GM(ii->opType()) || IS_OT_WRITE_GM(ii->opType()) || - IS_OT_ATOMIC_GM(ii->opType()) || IS_OT_READ_LM(ii->opType()) || - IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()) || - IS_OT_READ_PM(ii->opType()) || IS_OT_WRITE_PM(ii->opType()) || - IS_OT_ATOMIC_PM(ii->opType()) || IS_OT_FLAT(ii->opType()))) { + if (!(ii->isBarrier() || ii->isNop() || ii->isReturn() || ii->isBranch() || + ii->isALU() || ii->isLoad() || ii->isStore() || ii->isAtomic() || + ii->isMemFence() || ii->isFlat())) { panic("next instruction: %s is of unknown type\n", ii->disassemble()); } DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Checking Read for Inst : %s\n", computeUnit->cu_id, simdId, wfSlotId, ii->disassemble()); - if (type == I_ALU && ii->opType() == Enums::OT_BARRIER) { + if (type == I_ALU && ii->isBarrier()) { // Here for ALU instruction (barrier) if (!computeUnit->wfWait[simdId].prerdy()) { // Is wave slot free? @@ -400,7 +372,7 @@ Wavefront::ready(itype_e type) } ready_inst = true; - } else if (type == I_ALU && ii->opType() == Enums::OT_NOP) { + } else if (type == I_ALU && ii->isNop()) { // Here for ALU instruction (nop) if (!computeUnit->wfWait[simdId].prerdy()) { // Is wave slot free? @@ -408,7 +380,7 @@ Wavefront::ready(itype_e type) } ready_inst = true; - } else if (type == I_ALU && ii->opType() == Enums::OT_RET) { + } else if (type == I_ALU && ii->isReturn()) { // Here for ALU instruction (return) if (!computeUnit->wfWait[simdId].prerdy()) { // Is wave slot free? @@ -421,10 +393,10 @@ Wavefront::ready(itype_e type) } ready_inst = true; - } else if (type == I_ALU && (ii->opType() == Enums::OT_BRANCH || - ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) || - ii->opType() == Enums::OT_KERN_READ || - ii->opType() == Enums::OT_ARG)) { + } else if (type == I_ALU && (ii->isBranch() || + ii->isALU() || + (ii->isKernArgSeg() && ii->isLoad()) || + ii->isArgSeg())) { // Here for ALU instruction (all others) if (!computeUnit->wfWait[simdId].prerdy()) { // Is alu slot free? @@ -439,18 +411,16 @@ Wavefront::ready(itype_e type) return 0; } ready_inst = true; - } else if (type == I_GLOBAL && (IS_OT_READ_GM(ii->opType()) || - IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()))) { + } else if (type == I_GLOBAL && ii->isGlobalMem()) { // Here Global memory instruction - if (IS_OT_READ_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType())) { + if (ii->isLoad() || ii->isAtomic() || ii->isMemFence()) { // Are there in pipe or outstanding global memory write requests? if ((outstandingReqsWrGm + wrGmReqsInPipe) > 0) { return 0; } } - if (IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()) || - IS_OT_HIST_GM(ii->opType())) { + if (ii->isStore() || ii->isAtomic() || ii->isMemFence()) { // Are there in pipe or outstanding global memory read requests? 
if ((outstandingReqsRdGm + rdGmReqsInPipe) > 0) return 0; @@ -480,17 +450,15 @@ Wavefront::ready(itype_e type) return 0; } ready_inst = true; - } else if (type == I_SHARED && (IS_OT_READ_LM(ii->opType()) || - IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()))) { + } else if (type == I_SHARED && ii->isLocalMem()) { // Here for Shared memory instruction - if (IS_OT_READ_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType())) { + if (ii->isLoad() || ii->isAtomic() || ii->isMemFence()) { if ((outstandingReqsWrLm + wrLmReqsInPipe) > 0) { return 0; } } - if (IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()) || - IS_OT_HIST_LM(ii->opType())) { + if (ii->isStore() || ii->isAtomic() || ii->isMemFence()) { if ((outstandingReqsRdLm + rdLmReqsInPipe) > 0) { return 0; } @@ -519,47 +487,7 @@ Wavefront::ready(itype_e type) return 0; } ready_inst = true; - } else if (type == I_PRIVATE && (IS_OT_READ_PM(ii->opType()) || - IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()))) { - // Here for Private memory instruction ------------------------ // - if (IS_OT_READ_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType())) { - if ((outstandingReqsWrGm + wrGmReqsInPipe) > 0) { - return 0; - } - } - - if (IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()) || - IS_OT_HIST_PM(ii->opType())) { - if ((outstandingReqsRdGm + rdGmReqsInPipe) > 0) { - return 0; - } - } - - if (!glbMemBusRdy) { - // Is there an available VRF->Global memory read bus? - return 0; - } - - if (!glbMemIssueRdy) { - // Is wave slot free? - return 0; - } - - if (!computeUnit->globalMemoryPipe. - isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) { - // Can we insert a new request to the Global Mem Request FIFO? - return 0; - } - // can we schedule source & destination operands on the VRF? - if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii, - VrfAccessType::RD_WR)) { - return 0; - } - if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) { - return 0; - } - ready_inst = true; - } else if (type == I_FLAT && IS_OT_FLAT(ii->opType())) { + } else if (type == I_FLAT && ii->isFlat()) { if (!glbMemBusRdy) { // Is there an available VRF->Global memory read bus? return 0; @@ -618,23 +546,22 @@ Wavefront::updateResources() assert(ii); computeUnit->vrf[simdId]->updateResources(this, ii); // Single precision ALU or Branch or Return or Special instruction - if (ii->opType() == Enums::OT_ALU || ii->opType() == Enums::OT_SPECIAL || - ii->opType() == Enums::OT_BRANCH || IS_OT_LDAS(ii->opType()) || + if (ii->isALU() || ii->isSpecialOp() || + ii->isBranch() || // FIXME: Kernel argument loads are currently treated as ALU operations // since we don't send memory packets at execution. 
         // we should map them to one of the memory pipelines
-        ii->opType()==Enums::OT_KERN_READ ||
-        ii->opType()==Enums::OT_ARG ||
-        ii->opType()==Enums::OT_RET) {
+        (ii->isKernArgSeg() && ii->isLoad()) || ii->isArgSeg() ||
+        ii->isReturn()) {
         computeUnit->aluPipe[simdId].preset(computeUnit->shader->
             ticks(computeUnit->spBypassLength()));
         // this is to enforce a fixed number of cycles per issue slot per SIMD
         computeUnit->wfWait[simdId].preset(computeUnit->shader->
             ticks(computeUnit->issuePeriod));
-    } else if (ii->opType() == Enums::OT_BARRIER) {
+    } else if (ii->isBarrier()) {
         computeUnit->wfWait[simdId].preset(computeUnit->shader->
             ticks(computeUnit->issuePeriod));
-    } else if (ii->opType() == Enums::OT_FLAT_READ) {
+    } else if (ii->isLoad() && ii->isFlat()) {
         assert(Enums::SC_NONE != ii->executedAs());
         memReqsInPipe++;
         rdGmReqsInPipe++;
@@ -649,7 +576,7 @@ Wavefront::updateResources()
             computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                 preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
         }
-    } else if (ii->opType() == Enums::OT_FLAT_WRITE) {
+    } else if (ii->isStore() && ii->isFlat()) {
         assert(Enums::SC_NONE != ii->executedAs());
         memReqsInPipe++;
         wrGmReqsInPipe++;
@@ -664,21 +591,21 @@ Wavefront::updateResources()
             computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                 preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
         }
-    } else if (IS_OT_READ_GM(ii->opType())) {
+    } else if (ii->isLoad() && ii->isGlobalMem()) {
         memReqsInPipe++;
         rdGmReqsInPipe++;
         computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
             preset(computeUnit->shader->ticks(4));
         computeUnit->wfWait[computeUnit->GlbMemUnitId()].
             preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
-    } else if (IS_OT_WRITE_GM(ii->opType())) {
+    } else if (ii->isStore() && ii->isGlobalMem()) {
         memReqsInPipe++;
         wrGmReqsInPipe++;
         computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
             preset(computeUnit->shader->ticks(8));
         computeUnit->wfWait[computeUnit->GlbMemUnitId()].
             preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
-    } else if (IS_OT_ATOMIC_GM(ii->opType())) {
+    } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isGlobalMem()) {
         memReqsInPipe++;
         wrGmReqsInPipe++;
         rdGmReqsInPipe++;
@@ -686,21 +613,21 @@ Wavefront::updateResources()
             preset(computeUnit->shader->ticks(8));
         computeUnit->wfWait[computeUnit->GlbMemUnitId()].
             preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
-    } else if (IS_OT_READ_LM(ii->opType())) {
+    } else if (ii->isLoad() && ii->isLocalMem()) {
         memReqsInPipe++;
         rdLmReqsInPipe++;
         computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
             preset(computeUnit->shader->ticks(4));
         computeUnit->wfWait[computeUnit->ShrMemUnitId()].
             preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
-    } else if (IS_OT_WRITE_LM(ii->opType())) {
+    } else if (ii->isStore() && ii->isLocalMem()) {
         memReqsInPipe++;
         wrLmReqsInPipe++;
         computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
             preset(computeUnit->shader->ticks(8));
         computeUnit->wfWait[computeUnit->ShrMemUnitId()].
             preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
-    } else if (IS_OT_ATOMIC_LM(ii->opType())) {
+    } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isLocalMem()) {
         memReqsInPipe++;
         wrLmReqsInPipe++;
         rdLmReqsInPipe++;
@@ -708,28 +635,6 @@ Wavefront::updateResources()
             preset(computeUnit->shader->ticks(8));
         computeUnit->wfWait[computeUnit->ShrMemUnitId()].
             preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
-    } else if (IS_OT_READ_PM(ii->opType())) {
-        memReqsInPipe++;
-        rdGmReqsInPipe++;
-        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
-            preset(computeUnit->shader->ticks(4));
-        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
-            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
-    } else if (IS_OT_WRITE_PM(ii->opType())) {
-        memReqsInPipe++;
-        wrGmReqsInPipe++;
-        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
-            preset(computeUnit->shader->ticks(8));
-        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
-            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
-    } else if (IS_OT_ATOMIC_PM(ii->opType())) {
-        memReqsInPipe++;
-        wrGmReqsInPipe++;
-        rdGmReqsInPipe++;
-        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
-            preset(computeUnit->shader->ticks(8));
-        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
-            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
     }
 }
@@ -751,7 +656,7 @@ Wavefront::exec()
     DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
             "(pc: %i)\n", computeUnit->cu_id, simdId, wfSlotId, wfDynId,
             ii->disassemble(), old_pc);
-    ii->execute();
+    ii->execute(ii);
     // access the VRF
     computeUnit->vrf[simdId]->exec(ii, this);
     srcRegOpDist.sample(ii->numSrcRegOperands());
@@ -785,24 +690,24 @@ Wavefront::exec()
     // ---- Update Vector ALU pipeline and other resources ------------------ //
     // Single precision ALU or Branch or Return or Special instruction
-    if (ii->opType() == Enums::OT_ALU || ii->opType() == Enums::OT_SPECIAL ||
-        ii->opType() == Enums::OT_BRANCH || IS_OT_LDAS(ii->opType()) ||
+    if (ii->isALU() || ii->isSpecialOp() ||
+        ii->isBranch() ||
         // FIXME: Kernel argument loads are currently treated as ALU operations
         // since we don't send memory packets at execution. If we fix that then
         // we should map them to one of the memory pipelines
-        ii->opType() == Enums::OT_KERN_READ ||
-        ii->opType() == Enums::OT_ARG ||
-        ii->opType() == Enums::OT_RET) {
+        (ii->isKernArgSeg() && ii->isLoad()) ||
+        ii->isArgSeg() ||
+        ii->isReturn()) {
         computeUnit->aluPipe[simdId].set(computeUnit->shader->
             ticks(computeUnit->spBypassLength()));
         // this is to enforce a fixed number of cycles per issue slot per SIMD
         computeUnit->wfWait[simdId].set(computeUnit->shader->
             ticks(computeUnit->issuePeriod));
-    } else if (ii->opType() == Enums::OT_BARRIER) {
+    } else if (ii->isBarrier()) {
         computeUnit->wfWait[simdId].set(computeUnit->shader->
             ticks(computeUnit->issuePeriod));
-    } else if (ii->opType() == Enums::OT_FLAT_READ) {
+    } else if (ii->isLoad() && ii->isFlat()) {
         assert(Enums::SC_NONE != ii->executedAs());
         if (Enums::SC_SHARED == ii->executedAs()) {
@@ -816,7 +721,7 @@ Wavefront::exec()
             computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                 set(computeUnit->shader->ticks(computeUnit->issuePeriod));
         }
-    } else if (ii->opType() == Enums::OT_FLAT_WRITE) {
+    } else if (ii->isStore() && ii->isFlat()) {
         assert(Enums::SC_NONE != ii->executedAs());
         if (Enums::SC_SHARED == ii->executedAs()) {
             computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
@@ -829,32 +734,32 @@ Wavefront::exec()
             computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                 set(computeUnit->shader->ticks(computeUnit->issuePeriod));
         }
-    } else if (IS_OT_READ_GM(ii->opType())) {
+    } else if (ii->isLoad() && ii->isGlobalMem()) {
         computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
             set(computeUnit->shader->ticks(4));
         computeUnit->wfWait[computeUnit->GlbMemUnitId()].
             set(computeUnit->shader->ticks(computeUnit->issuePeriod));
-    } else if (IS_OT_WRITE_GM(ii->opType())) {
+    } else if (ii->isStore() && ii->isGlobalMem()) {
         computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
             set(computeUnit->shader->ticks(8));
         computeUnit->wfWait[computeUnit->GlbMemUnitId()].
             set(computeUnit->shader->ticks(computeUnit->issuePeriod));
-    } else if (IS_OT_ATOMIC_GM(ii->opType())) {
+    } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isGlobalMem()) {
         computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
             set(computeUnit->shader->ticks(8));
         computeUnit->wfWait[computeUnit->GlbMemUnitId()].
             set(computeUnit->shader->ticks(computeUnit->issuePeriod));
-    } else if (IS_OT_READ_LM(ii->opType())) {
+    } else if (ii->isLoad() && ii->isLocalMem()) {
         computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
             set(computeUnit->shader->ticks(4));
         computeUnit->wfWait[computeUnit->ShrMemUnitId()].
             set(computeUnit->shader->ticks(computeUnit->issuePeriod));
-    } else if (IS_OT_WRITE_LM(ii->opType())) {
+    } else if (ii->isStore() && ii->isLocalMem()) {
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
-    } else if (IS_OT_ATOMIC_LM(ii->opType())) {
+    } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isLocalMem()) {
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
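
The patch replaces the retired OpType/MemOpType categories with per-instruction flag predicates, so every check in wavefront.cc becomes a conjunction of an operation kind (isLoad(), isStore(), isAtomic(), isMemFence()) and a segment (isGlobalMem(), isLocalMem(), isFlat(), isKernArgSeg(), isArgSeg()). The sketch below is not from the gem5 tree: the FakeInst stub and the isGlobalRead() helper are invented for illustration, and only the predicate names themselves come from the diff above.

// Illustrative only: a stand-in instruction type exposing the flag-style
// predicates used in the diff. The real GPUStaticInst derives these from
// GPUStaticInstFlags; this stub hard-codes a few so the mapping can be
// shown in isolation.
#include <cassert>

struct FakeInst
{
    bool load = false, store = false, atomic = false, memfence = false;
    bool globalSeg = false, groupSeg = false, flat = false;

    bool isLoad() const { return load; }
    bool isStore() const { return store; }
    bool isAtomic() const { return atomic; }
    bool isMemFence() const { return memfence; }
    bool isGlobalMem() const { return globalSeg; }
    bool isLocalMem() const { return groupSeg; }
    bool isFlat() const { return flat; }
};

// Roughly how the retired categories map onto flag combinations:
//   IS_OT_READ_GM(op)   ->  isLoad()   && isGlobalMem()
//   IS_OT_WRITE_GM(op)  ->  isStore()  && isGlobalMem()
//   IS_OT_ATOMIC_GM(op) ->  isAtomic() && isGlobalMem()
//   IS_OT_READ_LM(op)   ->  isLoad()   && isLocalMem()
//   OT_FLAT_READ        ->  isLoad()   && isFlat()
bool
isGlobalRead(const FakeInst &ii)
{
    return ii.isLoad() && ii.isGlobalMem();
}

int
main()
{
    FakeInst ld;
    ld.load = true;
    ld.globalSeg = true;
    assert(isGlobalRead(ld));    // the legacy OT_GLOBAL_READ case
    return 0;
}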
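Before and after the rewrite, the global- and local-memory arms of Wavefront::ready() enforce the same ordering rule: a load (or atomic/fence) does not issue while writes to that segment are still outstanding or in the pipe, and a store (or atomic/fence) does not issue while reads are. A minimal stand-alone sketch of that gate follows; SegCounters and mayIssue() are placeholder names, and the per-segment counters are reduced to plain integers.

#include <iostream>

// Placeholder counters standing in for outstandingReqsRdGm/WrGm (or the
// Lm variants) and the matching *ReqsInPipe counters kept per wavefront.
struct SegCounters
{
    int outstandingRd = 0, rdInPipe = 0;
    int outstandingWr = 0, wrInPipe = 0;
};

// Returns true when a memory instruction with the given flags may issue to
// this segment, mirroring the read-after-write / write-after-read gating
// in the ready() hunks above.
bool
mayIssue(const SegCounters &c, bool isLoad, bool isStore, bool isAtomicOrFence)
{
    if ((isLoad || isAtomicOrFence) && (c.outstandingWr + c.wrInPipe) > 0) {
        return false;   // reads wait for pending writes to drain
    }
    if ((isStore || isAtomicOrFence) && (c.outstandingRd + c.rdInPipe) > 0) {
        return false;   // writes wait for pending reads to drain
    }
    return true;
}

int
main()
{
    SegCounters gm;
    gm.wrInPipe = 1;
    std::cout << mayIssue(gm, true, false, false) << '\n';   // 0: load blocked
    std::cout << mayIssue(gm, false, true, false) << '\n';   // 1: store may go
    return 0;
}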
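updateResources() and exec() reserve the same per-SIMD resources for a memory instruction, the first with preset() when the wave is scheduled and the second with set() when it actually issues. The latencies visible in the diff are uniform: 4 shader cycles on the VRF-to-memory bus for a load, 8 for a store or atomic/fence, and one issuePeriod on the memory unit's wave slot. The sketch below condenses only that latency choice; Timer, ticks(), and reserveMemPipes() are simplified stand-ins, not gem5's WaitClass interface.

#include <cstdint>
#include <iostream>

// Stand-in for a pipeline-stage timer (WaitClass in gem5), reduced to a
// single "reserved for this many ticks" value.
struct Timer
{
    std::uint64_t reservedTicks = 0;
    void reserve(std::uint64_t t) { reservedTicks = t; }
};

// Stand-in for Shader::ticks(): convert shader cycles to ticks.
inline std::uint64_t ticks(std::uint64_t cycles) { return cycles * 1000; }

struct MemInstFlags { bool load; bool store; bool atomicOrFence; };

// Mirror of the latency choices in the hunks above: loads hold the
// VRF->memory bus for 4 cycles, stores and atomics/fences for 8; the
// memory unit's issue slot is always held for one issuePeriod.
void
reserveMemPipes(const MemInstFlags &f, Timer &vrfToMemBus, Timer &memUnitSlot,
                std::uint64_t issuePeriod)
{
    const std::uint64_t busCycles = (f.store || f.atomicOrFence) ? 8 : 4;
    vrfToMemBus.reserve(ticks(busCycles));
    memUnitSlot.reserve(ticks(issuePeriod));
}

int
main()
{
    Timer bus, slot;
    reserveMemPipes({true, false, false}, bus, slot, 1);
    std::cout << bus.reservedTicks << ' ' << slot.reservedTicks << '\n'; // 4000 1000
    return 0;
}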
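Flat loads and stores are the one case where the flags alone do not pick a pipeline: the code asserts that executedAs() has been resolved (no longer Enums::SC_NONE) and then charges either the LDS path (SC_SHARED) or the global-memory path. The dispatch is summarized below; the enum is trimmed to the values referenced in the diff, and routeFlatAccess() is a placeholder for the real per-pipeline bookkeeping (rd/wrLmReqsInPipe versus rd/wrGmReqsInPipe and the matching wait slots).

#include <cassert>
#include <iostream>
#include <string>

// Reduced model of the storage classes referenced in the diff; only the
// members needed for the example are listed.
enum StorageClass { SC_NONE, SC_SHARED, SC_GLOBAL };

// Decide which pipeline a resolved flat access is charged to.
std::string
routeFlatAccess(StorageClass executedAs)
{
    // Flat accesses must already be resolved by the time resources are
    // reserved, hence the asserts in updateResources()/exec() above.
    assert(executedAs != SC_NONE);
    return executedAs == SC_SHARED ? "local (LDS) pipeline"
                                   : "global memory pipeline";
}

int
main()
{
    std::cout << routeFlatAccess(SC_SHARED) << '\n';
    std::cout << routeFlatAccess(SC_GLOBAL) << '\n';
    return 0;
}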