From 1a7d3f9fcb76a68540dd948f91413533a383bfde Mon Sep 17 00:00:00 2001 From: Tony Gutierrez Date: Tue, 19 Jan 2016 14:28:22 -0500 Subject: gpu-compute: AMD's baseline GPU model --- src/gpu-compute/gpu_dyn_inst.hh | 464 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 464 insertions(+) create mode 100644 src/gpu-compute/gpu_dyn_inst.hh (limited to 'src/gpu-compute/gpu_dyn_inst.hh') diff --git a/src/gpu-compute/gpu_dyn_inst.hh b/src/gpu-compute/gpu_dyn_inst.hh new file mode 100644 index 000000000..e44d8f80d --- /dev/null +++ b/src/gpu-compute/gpu_dyn_inst.hh @@ -0,0 +1,464 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#ifndef __GPU_DYN_INST_HH__ +#define __GPU_DYN_INST_HH__ + +#include +#include + +#include "enums/GenericMemoryOrder.hh" +#include "enums/GenericMemoryScope.hh" +#include "enums/MemOpType.hh" +#include "enums/MemType.hh" +#include "enums/OpType.hh" +#include "enums/StorageClassType.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_exec_context.hh" + +class GPUStaticInst; + +template +class AtomicOpAnd : public TypedAtomicOpFunctor +{ + public: + T a; + + AtomicOpAnd(T _a) : a(_a) { } + void execute(T *b) { *b &= a; } +}; + +template +class AtomicOpOr : public TypedAtomicOpFunctor +{ + public: + T a; + AtomicOpOr(T _a) : a(_a) { } + void execute(T *b) { *b |= a; } +}; + +template +class AtomicOpXor : public TypedAtomicOpFunctor +{ + public: + T a; + AtomicOpXor(T _a) : a(_a) {} + void execute(T *b) { *b ^= a; } +}; + +template +class AtomicOpCAS : public TypedAtomicOpFunctor +{ + public: + T c; + T s; + + ComputeUnit *computeUnit; + + AtomicOpCAS(T _c, T _s, ComputeUnit *compute_unit) + : c(_c), s(_s), computeUnit(compute_unit) { } + + void + execute(T *b) + { + computeUnit->numCASOps++; + + if (*b == c) { + *b = s; + } else { + computeUnit->numFailedCASOps++; + } + + if (computeUnit->xact_cas_mode) { + computeUnit->xactCasLoadMap.clear(); + } + } +}; + +template +class AtomicOpExch : public TypedAtomicOpFunctor +{ + public: + T a; + AtomicOpExch(T _a) : a(_a) { } + void 
execute(T *b) { *b = a; } +}; + +template +class AtomicOpAdd : public TypedAtomicOpFunctor +{ + public: + T a; + AtomicOpAdd(T _a) : a(_a) { } + void execute(T *b) { *b += a; } +}; + +template +class AtomicOpSub : public TypedAtomicOpFunctor +{ + public: + T a; + AtomicOpSub(T _a) : a(_a) { } + void execute(T *b) { *b -= a; } +}; + +template +class AtomicOpInc : public TypedAtomicOpFunctor +{ + public: + AtomicOpInc() { } + void execute(T *b) { *b += 1; } +}; + +template +class AtomicOpDec : public TypedAtomicOpFunctor +{ + public: + AtomicOpDec() {} + void execute(T *b) { *b -= 1; } +}; + +template +class AtomicOpMax : public TypedAtomicOpFunctor +{ + public: + T a; + AtomicOpMax(T _a) : a(_a) { } + + void + execute(T *b) + { + if (a > *b) + *b = a; + } +}; + +template +class AtomicOpMin : public TypedAtomicOpFunctor +{ + public: + T a; + AtomicOpMin(T _a) : a(_a) {} + + void + execute(T *b) + { + if (a < *b) + *b = a; + } +}; + +#define MO_A(a) ((a)>=Enums::MO_AAND && (a)<=Enums::MO_AMIN) +#define MO_ANR(a) ((a)>=Enums::MO_ANRAND && (a)<=Enums::MO_ANRMIN) +#define MO_H(a) ((a)>=Enums::MO_HAND && (a)<=Enums::MO_HMIN) + +typedef enum +{ + VT_32, + VT_64, +} vgpr_type; + +typedef enum +{ + SEG_PRIVATE, + SEG_SPILL, + SEG_GLOBAL, + SEG_SHARED, + SEG_READONLY, + SEG_FLAT +} seg_type; + +class GPUDynInst : public GPUExecContext +{ + public: + GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *_staticInst, + uint64_t instSeqNum); + + void execute(); + int numSrcRegOperands(); + int numDstRegOperands(); + int getNumOperands(); + bool isVectorRegister(int operandIdx); + bool isScalarRegister(int operandIdx); + int getRegisterIndex(int operandIdx); + int getOperandSize(int operandIdx); + bool isDstOperand(int operandIdx); + bool isSrcOperand(int operandIdx); + bool isArgLoad(); + + const std::string &disassemble() const; + + uint64_t seqNum() const; + + Enums::OpType opType(); + Enums::StorageClassType executedAs(); + + // The address of the memory operation + 
Addr addr[VSZ]; + Addr pAddr; + + // The data to get written + uint8_t d_data[VSZ * 16]; + // Additional data (for atomics) + uint8_t a_data[VSZ * 8]; + // Additional data (for atomics) + uint8_t x_data[VSZ * 8]; + // The execution mask + VectorMask exec_mask; + + // The memory type (M_U32, M_S32, ...) + Enums::MemType m_type; + // The memory operation (MO_LD, MO_ST, ...) + Enums::MemOpType m_op; + Enums::GenericMemoryOrder memoryOrder; + + // Scope of the request + Enums::GenericMemoryScope scope; + // The memory segment (SEG_SHARED, SEG_GLOBAL, ...) + seg_type s_type; + // The equivalency class + int equiv; + // The return VGPR type (VT_32 or VT_64) + vgpr_type v_type; + // Number of VGPR's accessed (1, 2, or 4) + int n_reg; + // The return VGPR index + int dst_reg; + // There can be max 4 dest regs + int dst_reg_vec[4]; + // SIMD where the WF of the memory instruction has been mapped to + int simdId; + // unique id of the WF where the memory instruction belongs to + int wfDynId; + // The kernel id of the requesting wf + int kern_id; + // The CU id of the requesting wf + int cu_id; + // HW slot id where the WF is mapped to inside a SIMD unit + int wfSlotId; + // execution pipeline id where the memory instruction has been scheduled + int pipeId; + // The execution time of this operation + Tick time; + // The latency of this operation + WaitClass latency; + // A list of bank conflicts for the 4 cycles. + uint32_t bc[4]; + + // A pointer to ROM + uint8_t *rom; + // The size of the READONLY segment + int sz_rom; + + // Initiate the specified memory operation, by creating a + // memory request and sending it off to the memory system. + void initiateAcc(GPUDynInstPtr gpuDynInst); + + void updateStats(); + + GPUStaticInst* staticInstruction() { return staticInst; } + + // Is the instruction a scalar or vector op? + bool scalarOp() const; + + /* + * Loads/stores/atomics may have acquire/release semantics associated + * with them. 
Some protocols want to see the acquire/release as separate + * requests from the load/store/atomic. We implement that separation + * using continuations (i.e., a function pointer with an object associated + * with it). When, for example, the front-end generates a store with + * release semantics, we will first issue a normal store and set the + * continuation in the GPUDynInst to a function that generate a + * release request. That continuation will be called when the normal + * store completes (in ComputeUnit::DataPort::recvTimingResponse). The + * continuation will be called in the context of the same GPUDynInst + * that generated the initial store. + */ + std::function execContinuation; + + // when true, call execContinuation when response arrives + bool useContinuation; + + // Allocate (with new) the functor implementing atomic operation op; the + // returning (MO_A*) and no-return (MO_ANR*) variants of an operation share + // one functor type. *reg0 is the operand (the compare value for CAS) and + // *reg1 is read only for CAS, as the value to swap in. Any other op + // panics. + template AtomicOpFunctor* + makeAtomicOpFunctor(c0 *reg0, c0 *reg1, Enums::MemOpType op) + { + using namespace Enums; + + switch(op) { + case MO_AAND: + case MO_ANRAND: + return new AtomicOpAnd(*reg0); + case MO_AOR: + case MO_ANROR: + return new AtomicOpOr(*reg0); + case MO_AXOR: + case MO_ANRXOR: + return new AtomicOpXor(*reg0); + case MO_ACAS: + case MO_ANRCAS: + return new AtomicOpCAS(*reg0, *reg1, cu); + case MO_AEXCH: + case MO_ANREXCH: + return new AtomicOpExch(*reg0); + case MO_AADD: + case MO_ANRADD: + return new AtomicOpAdd(*reg0); + case MO_ASUB: + case MO_ANRSUB: + return new AtomicOpSub(*reg0); + case MO_AINC: + case MO_ANRINC: + return new AtomicOpInc(); + case MO_ADEC: + case MO_ANRDEC: + return new AtomicOpDec(); + case MO_AMAX: + case MO_ANRMAX: + return new AtomicOpMax(*reg0); + case MO_AMIN: + case MO_ANRMIN: + return new AtomicOpMin(*reg0); + default: + panic("Unrecognized atomic operation"); + } + } + + // Set flags on req from the fields of this instruction: the segment + // (s_type), the scope, the acquire/release ordering (memoryOrder, applied + // only when setMemOrder is true) and the atomic return/no-return kind + // (m_op). + void + setRequestFlags(Request *req, bool setMemOrder=true) + { + // currently these are the easy scopes to deduce + switch (s_type) { + case SEG_PRIVATE: + req->setMemSpaceConfigFlags(Request::PRIVATE_SEGMENT); + break; + case SEG_SPILL: + 
req->setMemSpaceConfigFlags(Request::SPILL_SEGMENT); + break; + case SEG_GLOBAL: + req->setMemSpaceConfigFlags(Request::GLOBAL_SEGMENT); + break; + case SEG_READONLY: + req->setMemSpaceConfigFlags(Request::READONLY_SEGMENT); + break; + case SEG_SHARED: + req->setMemSpaceConfigFlags(Request::GROUP_SEGMENT); + break; + case SEG_FLAT: + // TODO: translate to correct scope + assert(false); + // no break: control falls through to the panic below + default: + panic("Bad segment type"); + break; + } + + switch (scope) { + case Enums::MEMORY_SCOPE_NONE: + case Enums::MEMORY_SCOPE_WORKITEM: + break; + case Enums::MEMORY_SCOPE_WAVEFRONT: + req->setMemSpaceConfigFlags(Request::SCOPE_VALID | + Request::WAVEFRONT_SCOPE); + break; + case Enums::MEMORY_SCOPE_WORKGROUP: + req->setMemSpaceConfigFlags(Request::SCOPE_VALID | + Request::WORKGROUP_SCOPE); + break; + case Enums::MEMORY_SCOPE_DEVICE: + req->setMemSpaceConfigFlags(Request::SCOPE_VALID | + Request::DEVICE_SCOPE); + break; + case Enums::MEMORY_SCOPE_SYSTEM: + req->setMemSpaceConfigFlags(Request::SCOPE_VALID | + Request::SYSTEM_SCOPE); + break; + default: + panic("Bad scope type"); + break; + } + + if (setMemOrder) { + // set acquire and release flags + switch (memoryOrder){ + case Enums::MEMORY_ORDER_SC_ACQUIRE: + req->setFlags(Request::ACQUIRE); + break; + case Enums::MEMORY_ORDER_SC_RELEASE: + req->setFlags(Request::RELEASE); + break; + case Enums::MEMORY_ORDER_SC_ACQUIRE_RELEASE: + req->setFlags(Request::ACQUIRE | Request::RELEASE); + break; + default: + break; + } + } + + // set atomic type + // currently, the instruction generator only produces atomic return + // but a magic instruction can produce atomic no return + if (m_op == Enums::MO_AADD || m_op == Enums::MO_ASUB || + m_op == Enums::MO_AAND || m_op == Enums::MO_AOR || + m_op == Enums::MO_AXOR || m_op == Enums::MO_AMAX || + m_op == Enums::MO_AMIN || m_op == Enums::MO_AINC || + m_op == Enums::MO_ADEC || m_op == Enums::MO_AEXCH || + m_op == Enums::MO_ACAS) { + req->setFlags(Request::ATOMIC_RETURN_OP); + } else if (m_op 
== Enums::MO_ANRADD || m_op == Enums::MO_ANRSUB || + m_op == Enums::MO_ANRAND || m_op == Enums::MO_ANROR || + m_op == Enums::MO_ANRXOR || m_op == Enums::MO_ANRMAX || + m_op == Enums::MO_ANRMIN || m_op == Enums::MO_ANRINC || + m_op == Enums::MO_ANRDEC || m_op == Enums::MO_ANREXCH || + m_op == Enums::MO_ANRCAS) { + req->setFlags(Request::ATOMIC_NO_RETURN_OP); + } + } + + // Map returned packets and the addresses they satisfy with which lane they + // were requested from + typedef std::unordered_map> StatusVector; + StatusVector memStatusVector; + + // Track the status of memory requests per lane, a bit per lane + VectorMask statusBitVector; + // for ld_v# or st_v# + std::vector statusVector; + std::vector tlbHitLevel; + + private: + GPUStaticInst *staticInst; + uint64_t _seqNum; +}; + +#endif // __GPU_DYN_INST_HH__ -- cgit v1.2.3