/*
 * Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Anthony Gutierrez
 */

#ifndef __GPU_DYN_INST_HH__
#define __GPU_DYN_INST_HH__

#include <cstdint>
#include <string>

#include "base/logging.hh"
#include "enums/MemType.hh"
#include "enums/StorageClassType.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_exec_context.hh"

class GPUStaticInst;

template<typename T>
class AtomicOpAnd : public TypedAtomicOpFunctor<T>
{
  public:
    T a;

    AtomicOpAnd(T _a) : a(_a) { }
    void execute(T *b) { *b &= a; }
    AtomicOpFunctor* clone () { return new AtomicOpAnd(a); }
};

template<typename T>
class AtomicOpOr : public TypedAtomicOpFunctor<T>
{
  public:
    T a;

    AtomicOpOr(T _a) : a(_a) { }
    void execute(T *b) { *b |= a; }
    AtomicOpFunctor* clone () { return new AtomicOpOr(a); }
};

template<typename T>
class AtomicOpXor : public TypedAtomicOpFunctor<T>
{
  public:
    T a;

    AtomicOpXor(T _a) : a(_a) {}
    void execute(T *b) { *b ^= a; }
    AtomicOpFunctor* clone () { return new AtomicOpXor(a); }
};

template<typename T>
class AtomicOpCAS : public TypedAtomicOpFunctor<T>
{
  public:
    T c;
    T s;

    ComputeUnit *computeUnit;

    AtomicOpCAS(T _c, T _s, ComputeUnit *compute_unit)
      : c(_c), s(_s), computeUnit(compute_unit) { }

    void
    execute(T *b)
    {
        computeUnit->numCASOps++;

        if (*b == c) {
            *b = s;
        } else {
            computeUnit->numFailedCASOps++;
        }

        if (computeUnit->xact_cas_mode) {
            computeUnit->xactCasLoadMap.clear();
        }
    }
    AtomicOpFunctor* clone () { return new AtomicOpCAS(c, s, computeUnit); }
};

template<typename T>
class AtomicOpExch : public TypedAtomicOpFunctor<T>
{
  public:
    T a;

    AtomicOpExch(T _a) : a(_a) { }
    void execute(T *b) { *b = a; }
    AtomicOpFunctor* clone () { return new AtomicOpExch(a); }
};

template<typename T>
class AtomicOpAdd : public TypedAtomicOpFunctor<T>
{
  public:
    T a;

    AtomicOpAdd(T _a) : a(_a) { }
    void execute(T *b) { *b += a; }
    AtomicOpFunctor* clone () { return new AtomicOpAdd(a); }
};

template<typename T>
class AtomicOpSub : public TypedAtomicOpFunctor<T>
{
  public:
    T a;

    AtomicOpSub(T _a) : a(_a) { }
    void execute(T *b) { *b -= a; }
    AtomicOpFunctor* clone () { return new AtomicOpSub(a); }
};

template<typename T>
class AtomicOpInc : public TypedAtomicOpFunctor<T>
{
  public:
    AtomicOpInc() { }
    void execute(T *b) { *b += 1; }
    AtomicOpFunctor* clone () { return new AtomicOpInc(); }
};

template<typename T>
class AtomicOpDec : public TypedAtomicOpFunctor<T>
{
  public:
    AtomicOpDec() {}
    void execute(T *b) { *b -= 1; }
    AtomicOpFunctor* clone () { return new AtomicOpDec(); }
};

template<typename T>
class AtomicOpMax : public TypedAtomicOpFunctor<T>
{
  public:
    T a;

    AtomicOpMax(T _a) : a(_a) { }

    void
    execute(T *b)
    {
        if (a > *b)
            *b = a;
    }
    AtomicOpFunctor* clone () { return new AtomicOpMax(a); }
};

template<typename T>
class AtomicOpMin : public TypedAtomicOpFunctor<T>
{
  public:
    T a;

    AtomicOpMin(T _a) : a(_a) {}

    void
    execute(T *b)
    {
        if (a < *b)
            *b = a;
    }
    AtomicOpFunctor* clone () { return new AtomicOpMin(a); }
};
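// Illustrative sketch (not part of the simulator): the memory system applies
// one of these functors to the value stored at the target location, so a
// 32-bit atomic add would behave roughly as follows:
//
//     uint32_t mem = 5;
//     AtomicOpAdd<uint32_t> add(3);
//     add.execute(&mem);            // mem == 8
//
// clone() lets a request carry its own copy of the functor; the exact call
// sites depend on the memory model in use.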
typedef enum
{
    VT_32,
    VT_64,
} vgpr_type;

class GPUDynInst : public GPUExecContext
{
  public:
    GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *static_inst,
               uint64_t instSeqNum);
    ~GPUDynInst();
    void execute(GPUDynInstPtr gpuDynInst);
    int numSrcRegOperands();
    int numDstRegOperands();
    int getNumOperands();
    bool isVectorRegister(int operandIdx);
    bool isScalarRegister(int operandIdx);
    bool isCondRegister(int operandIdx);
    int getRegisterIndex(int operandIdx, GPUDynInstPtr gpuDynInst);
    int getOperandSize(int operandIdx);
    bool isDstOperand(int operandIdx);
    bool isSrcOperand(int operandIdx);

    const std::string &disassemble() const;

    uint64_t seqNum() const;

    Enums::StorageClassType executedAs();

    // The address of the memory operation
    std::vector<Addr> addr;
    Addr pAddr;

    // The data to get written
    uint8_t *d_data;
    // Additional data (for atomics)
    uint8_t *a_data;
    // Additional data (for atomics)
    uint8_t *x_data;
    // The execution mask
    VectorMask exec_mask;

    // The memory type (M_U32, M_S32, ...)
    Enums::MemType m_type;

    // The equivalency class
    int equiv;
    // The return VGPR type (VT_32 or VT_64)
    vgpr_type v_type;
    // Number of VGPRs accessed (1, 2, or 4)
    int n_reg;
    // The return VGPR index
    int dst_reg;
    // There can be at most 4 dest regs
    int dst_reg_vec[4];
    // SIMD where the WF of the memory instruction has been mapped to
    int simdId;
    // unique id of the WF where the memory instruction belongs to
    int wfDynId;
    // The kernel id of the requesting wf
    int kern_id;
    // The CU id of the requesting wf
    int cu_id;
    // HW slot id where the WF is mapped to inside a SIMD unit
    int wfSlotId;
    // execution pipeline id where the memory instruction has been scheduled
    int pipeId;
    // The execution time of this operation
    Tick time;
    // The latency of this operation
    WaitClass latency;
    // A list of bank conflicts for the 4 cycles.
    uint32_t bc[4];

    // A pointer to ROM
    uint8_t *rom;
    // The size of the READONLY segment
    int sz_rom;

    // Initiate the specified memory operation, by creating a
    // memory request and sending it off to the memory system.
    void initiateAcc(GPUDynInstPtr gpuDynInst);
    // Complete the specified memory operation, by writing
    // value back to the RF in the case of a load or atomic
    // return or, in the case of a store, we do nothing
    void completeAcc(GPUDynInstPtr gpuDynInst);

    void updateStats();

    GPUStaticInst* staticInstruction() { return _staticInst; }

    bool isALU() const;
    bool isBranch() const;
    bool isNop() const;
    bool isReturn() const;
    bool isUnconditionalJump() const;
    bool isSpecialOp() const;
    bool isWaitcnt() const;

    bool isBarrier() const;
    bool isMemFence() const;
    bool isMemRef() const;
    bool isFlat() const;
    bool isLoad() const;
    bool isStore() const;

    bool isAtomic() const;
    bool isAtomicNoRet() const;
    bool isAtomicRet() const;

    bool isScalar() const;
    bool readsSCC() const;
    bool writesSCC() const;
    bool readsVCC() const;
    bool writesVCC() const;

    bool isAtomicAnd() const;
    bool isAtomicOr() const;
    bool isAtomicXor() const;
    bool isAtomicCAS() const;
    bool isAtomicExch() const;
    bool isAtomicAdd() const;
    bool isAtomicSub() const;
    bool isAtomicInc() const;
    bool isAtomicDec() const;
    bool isAtomicMax() const;
    bool isAtomicMin() const;

    bool isArgLoad() const;
    bool isGlobalMem() const;
    bool isLocalMem() const;

    bool isArgSeg() const;
    bool isGlobalSeg() const;
    bool isGroupSeg() const;
    bool isKernArgSeg() const;
    bool isPrivateSeg() const;
    bool isReadOnlySeg() const;
    bool isSpillSeg() const;

    bool isWorkitemScope() const;
    bool isWavefrontScope() const;
    bool isWorkgroupScope() const;
    bool isDeviceScope() const;
    bool isSystemScope() const;
    bool isNoScope() const;

    bool isRelaxedOrder() const;
    bool isAcquire() const;
    bool isRelease() const;
    bool isAcquireRelease() const;
    bool isNoOrder() const;

    bool isGloballyCoherent() const;
    bool isSystemCoherent() const;

    /*
     * Loads/stores/atomics may have acquire/release semantics associated
     * with them. Some protocols want to see the acquire/release as separate
     * requests from the load/store/atomic. We implement that separation
     * using continuations (i.e., a function pointer with an object associated
     * with it). When, for example, the front-end generates a store with
     * release semantics, we will first issue a normal store and set the
     * continuation in the GPUDynInst to a function that generates a
     * release request. That continuation will be called when the normal
     * store completes (in ComputeUnit::DataPort::recvTimingResponse). The
     * continuation will be called in the context of the same GPUDynInst
     * that generated the initial store.
     */
    std::function<void(GPUStaticInst*, GPUDynInstPtr)> execContinuation;
    // when true, call execContinuation when response arrives
    bool useContinuation;
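    // Illustrative sketch (not part of the simulator): a front-end that emits
    // a store with release semantics might attach the release as a
    // continuation roughly like so, where execRelease is a hypothetical
    // member of GPUStaticInst taking a GPUDynInstPtr:
    //
    //     gpuDynInst->useContinuation = true;
    //     gpuDynInst->execContinuation = &GPUStaticInst::execRelease;
    //
    // Once the store's response arrives, the data port can then invoke
    // execContinuation(staticInstruction(), gpuDynInst) to issue the
    // follow-on release request.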
    template<typename c0>
    AtomicOpFunctor*
    makeAtomicOpFunctor(c0 *reg0, c0 *reg1)
    {
        if (isAtomicAnd()) {
            return new AtomicOpAnd<c0>(*reg0);
        } else if (isAtomicOr()) {
            return new AtomicOpOr<c0>(*reg0);
        } else if (isAtomicXor()) {
            return new AtomicOpXor<c0>(*reg0);
        } else if (isAtomicCAS()) {
            return new AtomicOpCAS<c0>(*reg0, *reg1, cu);
        } else if (isAtomicExch()) {
            return new AtomicOpExch<c0>(*reg0);
        } else if (isAtomicAdd()) {
            return new AtomicOpAdd<c0>(*reg0);
        } else if (isAtomicSub()) {
            return new AtomicOpSub<c0>(*reg0);
        } else if (isAtomicInc()) {
            return new AtomicOpInc<c0>();
        } else if (isAtomicDec()) {
            return new AtomicOpDec<c0>();
        } else if (isAtomicMax()) {
            return new AtomicOpMax<c0>(*reg0);
        } else if (isAtomicMin()) {
            return new AtomicOpMin<c0>(*reg0);
        } else {
            fatal("Unrecognized atomic operation");
        }
    }
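    // Illustrative sketch (not part of the simulator): when issuing an
    // atomic, the memory pipeline can build the functor from the per-lane
    // operand values and hand it to the outgoing request, e.g. for a 32-bit
    // atomic (op0/op1 are hypothetical locals; op1 only matters for CAS):
    //
    //     uint32_t *op0 = ...;
    //     uint32_t *op1 = ...;
    //     AtomicOpFunctor *amo = gpuDynInst->makeAtomicOpFunctor(op0, op1);
    //
    // Ownership of the returned functor passes to whoever attaches it to the
    // memory request; how it is attached depends on the Request API in use.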
    void
    setRequestFlags(RequestPtr req, bool setMemOrder=true)
    {
        // currently these are the easy scopes to deduce
        if (isPrivateSeg()) {
            req->setMemSpaceConfigFlags(Request::PRIVATE_SEGMENT);
        } else if (isSpillSeg()) {
            req->setMemSpaceConfigFlags(Request::SPILL_SEGMENT);
        } else if (isGlobalSeg()) {
            req->setMemSpaceConfigFlags(Request::GLOBAL_SEGMENT);
        } else if (isReadOnlySeg()) {
            req->setMemSpaceConfigFlags(Request::READONLY_SEGMENT);
        } else if (isGroupSeg()) {
            req->setMemSpaceConfigFlags(Request::GROUP_SEGMENT);
        } else if (isFlat()) {
            panic("TODO: translate to correct scope");
        } else {
            fatal("%s has bad segment type\n", disassemble());
        }

        if (isWavefrontScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::WAVEFRONT_SCOPE);
        } else if (isWorkgroupScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::WORKGROUP_SCOPE);
        } else if (isDeviceScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::DEVICE_SCOPE);
        } else if (isSystemScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::SYSTEM_SCOPE);
        } else if (!isNoScope() && !isWorkitemScope()) {
            fatal("%s has bad scope type\n", disassemble());
        }

        if (setMemOrder) {
            // set acquire and release flags
            if (isAcquire()) {
                req->setFlags(Request::ACQUIRE);
            } else if (isRelease()) {
                req->setFlags(Request::RELEASE);
            } else if (isAcquireRelease()) {
                req->setFlags(Request::ACQUIRE | Request::RELEASE);
            } else if (!isNoOrder()) {
                fatal("%s has bad memory order\n", disassemble());
            }
        }

        // set atomic type
        // currently, the instruction generator only produces atomic return
        // but a magic instruction can produce atomic no return
        if (isAtomicRet()) {
            req->setFlags(Request::ATOMIC_RETURN_OP);
        } else if (isAtomicNoRet()) {
            req->setFlags(Request::ATOMIC_NO_RETURN_OP);
        }
    }

    // Map returned packets and the addresses they satisfy with which lane
    // they were requested from
    typedef std::unordered_map<Addr, std::vector<int>> StatusVector;
    StatusVector memStatusVector;

    // Track the status of memory requests per lane, a bit per lane
    VectorMask statusBitVector;
    // for ld_v# or st_v#
    std::vector<int> statusVector;
    std::vector<int> tlbHitLevel;

  private:
    GPUStaticInst *_staticInst;
    uint64_t _seqNum;
};

#endif // __GPU_DYN_INST_HH__