/*
 * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Steve Reinhardt
 */

#ifndef __ARCH_HSAIL_INSTS_MEM_HH__
#define __ARCH_HSAIL_INSTS_MEM_HH__

#include <type_traits>

#include "arch/hsail/insts/decl.hh"
#include "arch/hsail/insts/gpu_static_inst.hh"
#include "arch/hsail/operand.hh"
#include "gpu-compute/compute_unit.hh"

namespace HsailISA
{
    class MemInst
    {
      public:
        MemInst() : size(0), addr_operand(nullptr) { }

        MemInst(Enums::MemType m_type)
        {
            if (m_type == Enums::M_U64 ||
                m_type == Enums::M_S64 ||
                m_type == Enums::M_F64) {
                size = 8;
            } else if (m_type == Enums::M_U32 ||
                       m_type == Enums::M_S32 ||
                       m_type == Enums::M_F32) {
                size = 4;
            } else if (m_type == Enums::M_U16 ||
                       m_type == Enums::M_S16 ||
                       m_type == Enums::M_F16) {
                size = 2;
            } else {
                size = 1;
            }

            addr_operand = nullptr;
        }

        void
        init_addr(AddrOperandBase *_addr_operand)
        {
            addr_operand = _addr_operand;
        }

      private:
        int size;
        AddrOperandBase *addr_operand;

      public:
        int getMemOperandSize() { return size; }
        AddrOperandBase *getAddressOperand() { return addr_operand; }
    };

    template<typename DestOperandType, typename AddrOperandType>
    class LdaInstBase : public HsailGPUStaticInst
    {
      public:
        typename DestOperandType::DestOperand dest;
        AddrOperandType addr;

        LdaInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
                    const char *_opcode)
           : HsailGPUStaticInst(obj, _opcode)
        {
            using namespace Brig;

            setFlag(ALU);

            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
            dest.init(op_offs, obj);
            op_offs = obj->getOperandPtr(ib->operands, 1);
            addr.init(op_offs, obj);
        }

        int numSrcRegOperands() override
        { return(this->addr.isVectorRegister()); }
        int numDstRegOperands() override { return dest.isVectorRegister(); }
        bool isVectorRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.isVectorRegister() :
                   this->addr.isVectorRegister());
        }
        bool isCondRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ?
                   dest.isCondRegister() : this->addr.isCondRegister());
        }
        bool isScalarRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.isScalarRegister() :
                   this->addr.isScalarRegister());
        }
        bool isSrcOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex > 0)
                return(this->addr.isVectorRegister());
            return false;
        }
        bool isDstOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return(operandIndex == 0);
        }
        int getOperandSize(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.opSize() : this->addr.opSize());
        }
        int
        getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.regIndex() :
                   this->addr.regIndex());
        }
        int getNumOperands() override
        {
            if (this->addr.isVectorRegister())
                return 2;
            return 1;
        }
    };

    template<typename DestDataType, typename AddrOperandType>
    class LdaInst :
        public LdaInstBase<typename DestDataType::OperandType,
                           AddrOperandType>,
        public MemInst
    {
      public:
        void generateDisassembly();

        LdaInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
                const char *_opcode)
            : LdaInstBase<typename DestDataType::OperandType,
                          AddrOperandType>(ib, obj, _opcode)
        {
            init_addr(&this->addr);
        }

        void execute(GPUDynInstPtr gpuDynInst);
    };

    template<typename DataType>
    GPUStaticInst*
    decodeLda(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        unsigned op_offs = obj->getOperandPtr(ib->operands, 1);
        BrigRegOperandInfo regDataType = findRegDataType(op_offs, obj);

        if (regDataType.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
            return new LdaInst<DataType, NoRegAddrOperand>(ib, obj, "ldas");
        } else if (regDataType.kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
            // V2/V4 not allowed
            switch (regDataType.regKind) {
              case Brig::BRIG_REGISTER_KIND_SINGLE:
                return new LdaInst<DataType, SRegAddrOperand>(ib, obj, "ldas");
              case Brig::BRIG_REGISTER_KIND_DOUBLE:
                return new LdaInst<DataType, DRegAddrOperand>(ib, obj, "ldas");
              default:
                fatal("Bad ldas register operand type %d\n",
                      regDataType.type);
            }
        } else {
            fatal("Bad ldas register operand kind %d\n", regDataType.kind);
        }
    }

    template<typename DestOperandType, typename AddrOperandType>
    class LdInstBase : public HsailGPUStaticInst
    {
      public:
        Brig::BrigWidth8_t width;
        typename DestOperandType::DestOperand dest;
        AddrOperandType addr;

        Brig::BrigSegment segment;
        Brig::BrigMemoryOrder memoryOrder;
        Brig::BrigMemoryScope memoryScope;
        unsigned int equivClass;

        LdInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
                   const char *_opcode)
           : HsailGPUStaticInst(obj, _opcode)
        {
            using namespace Brig;

            setFlag(MemoryRef);
            setFlag(Load);

            if (ib->opcode == BRIG_OPCODE_LD) {
                const BrigInstMem *ldst = (const BrigInstMem*)ib;

                segment = (BrigSegment)ldst->segment;
                memoryOrder = BRIG_MEMORY_ORDER_NONE;
                memoryScope = BRIG_MEMORY_SCOPE_NONE;
                equivClass = ldst->equivClass;
                width = ldst->width;

                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
                const Brig::BrigOperand *brigOp = obj->getOperand(op_offs);

                if (brigOp->kind == BRIG_KIND_OPERAND_REGISTER)
                    dest.init(op_offs, obj);

                op_offs = obj->getOperandPtr(ib->operands, 1);
                addr.init(op_offs, obj);
            } else {
                const BrigInstAtomic *at = (const BrigInstAtomic*)ib;

                segment = (BrigSegment)at->segment;
                memoryOrder = (BrigMemoryOrder)at->memoryOrder;
                memoryScope = (BrigMemoryScope)at->memoryScope;
                equivClass = 0;
                width = BRIG_WIDTH_1;

                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
                const Brig::BrigOperand *brigOp = obj->getOperand(op_offs);

                if (brigOp->kind == BRIG_KIND_OPERAND_REGISTER)
                    dest.init(op_offs, obj);

                op_offs = obj->getOperandPtr(ib->operands,1);
                addr.init(op_offs,
obj); } switch (memoryOrder) { case BRIG_MEMORY_ORDER_NONE: setFlag(NoOrder); break; case BRIG_MEMORY_ORDER_RELAXED: setFlag(RelaxedOrder); break; case BRIG_MEMORY_ORDER_SC_ACQUIRE: setFlag(Acquire); break; case BRIG_MEMORY_ORDER_SC_RELEASE: setFlag(Release); break; case BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE: setFlag(AcquireRelease); break; default: fatal("LdInst has bad memory order type\n"); } switch (memoryScope) { case BRIG_MEMORY_SCOPE_NONE: setFlag(NoScope); break; case BRIG_MEMORY_SCOPE_WORKITEM: setFlag(WorkitemScope); break; case BRIG_MEMORY_SCOPE_WORKGROUP: setFlag(WorkgroupScope); break; case BRIG_MEMORY_SCOPE_AGENT: setFlag(DeviceScope); break; case BRIG_MEMORY_SCOPE_SYSTEM: setFlag(SystemScope); break; default: fatal("LdInst has bad memory scope type\n"); } switch (segment) { case BRIG_SEGMENT_GLOBAL: setFlag(GlobalSegment); break; case BRIG_SEGMENT_GROUP: setFlag(GroupSegment); break; case BRIG_SEGMENT_PRIVATE: setFlag(PrivateSegment); break; case BRIG_SEGMENT_READONLY: setFlag(ReadOnlySegment); break; case BRIG_SEGMENT_SPILL: setFlag(SpillSegment); break; case BRIG_SEGMENT_FLAT: setFlag(Flat); break; case BRIG_SEGMENT_KERNARG: setFlag(KernArgSegment); break; case BRIG_SEGMENT_ARG: setFlag(ArgSegment); break; default: panic("Ld: segment %d not supported\n", segment); } } int numSrcRegOperands() override { return(this->addr.isVectorRegister()); } int numDstRegOperands() override { return dest.isVectorRegister(); } int getNumOperands() override { if (this->addr.isVectorRegister()) return 2; else return 1; } bool isVectorRegister(int operandIndex) override { assert((operandIndex >= 0) && (operandIndex < getNumOperands())); return((operandIndex == 0) ? dest.isVectorRegister() : this->addr.isVectorRegister()); } bool isCondRegister(int operandIndex) override { assert((operandIndex >= 0) && (operandIndex < getNumOperands())); return((operandIndex == 0) ? dest.isCondRegister() : this->addr.isCondRegister()); } bool isScalarRegister(int operandIndex) override { assert((operandIndex >= 0) && (operandIndex < getNumOperands())); return((operandIndex == 0) ? dest.isScalarRegister() : this->addr.isScalarRegister()); } bool isSrcOperand(int operandIndex) override { assert((operandIndex >= 0) && (operandIndex < getNumOperands())); if (operandIndex > 0) return(this->addr.isVectorRegister()); return false; } bool isDstOperand(int operandIndex) override { assert((operandIndex >= 0) && (operandIndex < getNumOperands())); return(operandIndex == 0); } int getOperandSize(int operandIndex) override { assert((operandIndex >= 0) && (operandIndex < getNumOperands())); return((operandIndex == 0) ? dest.opSize() : this->addr.opSize()); } int getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst) override { assert((operandIndex >= 0) && (operandIndex < getNumOperands())); return((operandIndex == 0) ? 
dest.regIndex() : this->addr.regIndex()); } }; template class LdInst : public LdInstBase, public MemInst { typename DestDataType::OperandType::DestOperand dest_vect[4]; uint16_t num_dest_operands; void generateDisassembly() override; public: LdInst(const Brig::BrigInstBase *ib, const BrigObject *obj, const char *_opcode) : LdInstBase(ib, obj, _opcode), MemInst(MemDataType::memType) { init_addr(&this->addr); unsigned op_offs = obj->getOperandPtr(ib->operands,0); const Brig::BrigOperand *brigOp = obj->getOperand(op_offs); if (brigOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) { const Brig::BrigOperandOperandList *brigRegVecOp = (const Brig::BrigOperandOperandList*)brigOp; num_dest_operands = *((unsigned*)obj->getData(brigRegVecOp->elements)) / 4; assert(num_dest_operands <= 4); } else { num_dest_operands = 1; } if (num_dest_operands > 1) { assert(brigOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST); for (int i = 0; i < num_dest_operands; ++i) { dest_vect[i].init_from_vect(op_offs, obj, i); } } } void initiateAcc(GPUDynInstPtr gpuDynInst) override { typedef typename MemDataType::CType c0; gpuDynInst->statusBitVector = gpuDynInst->exec_mask; if (num_dest_operands > 1) { for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) if (gpuDynInst->exec_mask[i]) gpuDynInst->statusVector.push_back(num_dest_operands); else gpuDynInst->statusVector.push_back(0); } for (int k = 0; k < num_dest_operands; ++k) { c0 *d = &((c0*)gpuDynInst->d_data) [k * gpuDynInst->computeUnit()->wfSize()]; for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) { if (gpuDynInst->exec_mask[i]) { Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0); if (this->isLocalMem()) { // load from shared memory *d = gpuDynInst->wavefront()->ldsChunk-> read(vaddr); } else { RequestPtr req = std::make_shared(0, vaddr, sizeof(c0), 0, gpuDynInst->computeUnit()->masterId(), 0, gpuDynInst->wfDynId); gpuDynInst->setRequestFlags(req); PacketPtr pkt = new Packet(req, MemCmd::ReadReq); pkt->dataStatic(d); if (gpuDynInst->computeUnit()->shader-> separate_acquire_release && gpuDynInst->isAcquire()) { // if this load has acquire semantics, // set the response continuation function // to perform an Acquire request gpuDynInst->execContinuation = &GPUStaticInst::execLdAcq; gpuDynInst->useContinuation = true; } else { // the request will be finished when // the load completes gpuDynInst->useContinuation = false; } // translation is performed in sendRequest() gpuDynInst->computeUnit()->sendRequest(gpuDynInst, i, pkt); } } ++d; } } gpuDynInst->updateStats(); } void completeAcc(GPUDynInstPtr gpuDynInst) override { typedef typename MemDataType::CType c1; constexpr bool is_vt_32 = DestDataType::vgprType == VT_32; /** * this code essentially replaces the long if-else chain * that was in used GlobalMemPipeline::exec() to infer the * size (single/double) and type (floating point/integer) of * the destination register. 
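             * (for example, an ld_global_u8 loads a single byte per
             * work-item but writes a full 32-bit destination register,
             * so the byte has to be widened to a 32-bit unsigned value
             * on writeback)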
             * this is needed for load
             * instructions because the loaded value and the
             * destination type can be of different sizes, and we also
             * need to know if the value we're writing back is floating
             * point and signed/unsigned, so we can properly cast the
             * writeback value
             */
            typedef typename std::conditional<is_vt_32,
                typename std::conditional<std::is_floating_point<c1>::value,
                    float,
                    typename std::conditional<std::is_signed<c1>::value,
                        int32_t, uint32_t>::type>::type,
                typename std::conditional<std::is_floating_point<c1>::value,
                    double,
                    typename std::conditional<std::is_signed<c1>::value,
                        int64_t, uint64_t>::type>::type>::type c0;

            Wavefront *w = gpuDynInst->wavefront();

            std::vector<int> regVec;
            // iterate over number of destination register operands since
            // this is a load
            for (int k = 0; k < num_dest_operands; ++k) {
                assert((sizeof(c1) * num_dest_operands)
                       <= MAX_WIDTH_FOR_MEM_INST);

                int dst = this->dest.regIndex() + k;
                if (num_dest_operands > MAX_REGS_FOR_NON_VEC_MEM_INST)
                    dst = dest_vect[k].regIndex();
                // virtual->physical VGPR mapping
                int physVgpr = w->remap(dst, sizeof(c0), 1);
                // save the physical VGPR index
                regVec.push_back(physVgpr);

                c1 *p1 =
                    &((c1*)gpuDynInst->d_data)[k * w->computeUnit->wfSize()];

                for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
                    if (gpuDynInst->exec_mask[i]) {
                        DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: "
                                "$%s%d <- %d global ld done (src = wavefront "
                                "ld inst)\n", w->computeUnit->cu_id, w->simdId,
                                w->wfSlotId, i, sizeof(c0) == 4 ? "s" : "d",
                                dst, *p1);
                        // write the value into the physical VGPR. This is a
                        // purely functional operation. No timing is modeled.
                        w->computeUnit->vrf[w->simdId]->write(physVgpr,
                                                              *p1, i);
                    }
                    ++p1;
                }
            }

            // Schedule the write operation of the load data on the VRF.
            // This simply models the timing aspect of the VRF write
            // operation. It does not modify the physical VGPR.
            int loadVrfBankConflictCycles = gpuDynInst->computeUnit()->
                vrf[w->simdId]->exec(gpuDynInst->seqNum(), w, regVec,
                                     sizeof(c0), gpuDynInst->time);

            if (this->isGlobalMem()) {
                gpuDynInst->computeUnit()->globalMemoryPipe
                    .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
            } else {
                assert(this->isLocalMem());
                gpuDynInst->computeUnit()->localMemoryPipe
                    .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
            }
        }

      private:
        void
        execLdAcq(GPUDynInstPtr gpuDynInst) override
        {
            // after the load has completed and if the load has acquire
            // semantics, issue an acquire request
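            // note: the acquire is modeled as a zero-sized fence request
            // handed to injectGlobalMemFence() below; it is only issued
            // for non-LDS accesses, and only when the shader models
            // acquire/release as separate operations
            // (separate_acquire_release)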
if (!this->isLocalMem()) { if (gpuDynInst->computeUnit()->shader->separate_acquire_release && gpuDynInst->isAcquire()) { gpuDynInst->statusBitVector = VectorMask(1); gpuDynInst->useContinuation = false; // create request RequestPtr req = std::make_shared(0, 0, 0, 0, gpuDynInst->computeUnit()->masterId(), 0, gpuDynInst->wfDynId); req->setFlags(Request::ACQUIRE); gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req); } } } public: bool isVectorRegister(int operandIndex) override { assert((operandIndex >= 0) && (operandIndex < getNumOperands())); if ((num_dest_operands != getNumOperands()) && (operandIndex == (getNumOperands()-1))) return(this->addr.isVectorRegister()); if (num_dest_operands > 1) { return dest_vect[operandIndex].isVectorRegister(); } else if (num_dest_operands == 1) { return LdInstBase::dest.isVectorRegister(); } return false; } bool isCondRegister(int operandIndex) override { assert((operandIndex >= 0) && (operandIndex < getNumOperands())); if ((num_dest_operands != getNumOperands()) && (operandIndex == (getNumOperands()-1))) return(this->addr.isCondRegister()); if (num_dest_operands > 1) return dest_vect[operandIndex].isCondRegister(); else if (num_dest_operands == 1) return LdInstBase::dest.isCondRegister(); return false; } bool isScalarRegister(int operandIndex) override { assert((operandIndex >= 0) && (operandIndex < getNumOperands())); if ((num_dest_operands != getNumOperands()) && (operandIndex == (getNumOperands()-1))) return(this->addr.isScalarRegister()); if (num_dest_operands > 1) return dest_vect[operandIndex].isScalarRegister(); else if (num_dest_operands == 1) return LdInstBase::dest.isScalarRegister(); return false; } bool isSrcOperand(int operandIndex) override { assert((operandIndex >= 0) && (operandIndex < getNumOperands())); if ((num_dest_operands != getNumOperands()) && (operandIndex == (getNumOperands()-1))) return(this->addr.isVectorRegister()); return false; } bool isDstOperand(int operandIndex) override { assert((operandIndex >= 0) && (operandIndex < getNumOperands())); if ((num_dest_operands != getNumOperands()) && (operandIndex == (getNumOperands()-1))) return false; return true; } int getOperandSize(int operandIndex) override { assert((operandIndex >= 0) && (operandIndex < getNumOperands())); if ((num_dest_operands != getNumOperands()) && (operandIndex == (getNumOperands()-1))) return(this->addr.opSize()); if (num_dest_operands > 1) return(dest_vect[operandIndex].opSize()); else if (num_dest_operands == 1) return(LdInstBase::dest.opSize()); return 0; } int getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst) override { assert((operandIndex >= 0) && (operandIndex < getNumOperands())); if ((num_dest_operands != getNumOperands()) && (operandIndex == (getNumOperands()-1))) return(this->addr.regIndex()); if (num_dest_operands > 1) return(dest_vect[operandIndex].regIndex()); else if (num_dest_operands == 1) return(LdInstBase::dest.regIndex()); return -1; } int getNumOperands() override { if (this->addr.isVectorRegister() || this->addr.isScalarRegister()) return(num_dest_operands+1); else return(num_dest_operands); } void execute(GPUDynInstPtr gpuDynInst) override; }; template GPUStaticInst* decodeLd2(const Brig::BrigInstBase *ib, const BrigObject *obj) { unsigned op_offs = obj->getOperandPtr(ib->operands,1); BrigRegOperandInfo tmp = findRegDataType(op_offs, obj); if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) { return new LdInst(ib, obj, "ld"); } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER || tmp.kind == 
Brig::BRIG_KIND_OPERAND_OPERAND_LIST) { switch (tmp.regKind) { case Brig::BRIG_REGISTER_KIND_SINGLE: return new LdInst(ib, obj, "ld"); case Brig::BRIG_REGISTER_KIND_DOUBLE: return new LdInst(ib, obj, "ld"); default: fatal("Bad ld register operand type %d\n", tmp.regKind); } } else { fatal("Bad ld register operand kind %d\n", tmp.kind); } } template GPUStaticInst* decodeLd(const Brig::BrigInstBase *ib, const BrigObject *obj) { unsigned op_offs = obj->getOperandPtr(ib->operands,0); BrigRegOperandInfo dest = findRegDataType(op_offs, obj); assert(dest.kind == Brig::BRIG_KIND_OPERAND_REGISTER || dest.kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST); switch(dest.regKind) { case Brig::BRIG_REGISTER_KIND_SINGLE: switch (ib->type) { case Brig::BRIG_TYPE_B8: case Brig::BRIG_TYPE_B16: case Brig::BRIG_TYPE_B32: return decodeLd2(ib, obj); case Brig::BRIG_TYPE_U8: case Brig::BRIG_TYPE_U16: case Brig::BRIG_TYPE_U32: return decodeLd2(ib, obj); case Brig::BRIG_TYPE_S8: case Brig::BRIG_TYPE_S16: case Brig::BRIG_TYPE_S32: return decodeLd2(ib, obj); case Brig::BRIG_TYPE_F16: case Brig::BRIG_TYPE_F32: return decodeLd2(ib, obj); default: fatal("Bad ld register operand type %d, %d\n", dest.regKind, ib->type); }; case Brig::BRIG_REGISTER_KIND_DOUBLE: switch (ib->type) { case Brig::BRIG_TYPE_B64: return decodeLd2(ib, obj); case Brig::BRIG_TYPE_U64: return decodeLd2(ib, obj); case Brig::BRIG_TYPE_S64: return decodeLd2(ib, obj); case Brig::BRIG_TYPE_F64: return decodeLd2(ib, obj); default: fatal("Bad ld register operand type %d, %d\n", dest.regKind, ib->type); }; default: fatal("Bad ld register operand type %d, %d\n", dest.regKind, ib->type); } } template class StInstBase : public HsailGPUStaticInst { public: typename SrcOperandType::SrcOperand src; AddrOperandType addr; Brig::BrigSegment segment; Brig::BrigMemoryScope memoryScope; Brig::BrigMemoryOrder memoryOrder; unsigned int equivClass; StInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj, const char *_opcode) : HsailGPUStaticInst(obj, _opcode) { using namespace Brig; setFlag(MemoryRef); setFlag(Store); if (ib->opcode == BRIG_OPCODE_ST) { const BrigInstMem *ldst = (const BrigInstMem*)ib; segment = (BrigSegment)ldst->segment; memoryOrder = BRIG_MEMORY_ORDER_NONE; memoryScope = BRIG_MEMORY_SCOPE_NONE; equivClass = ldst->equivClass; unsigned op_offs = obj->getOperandPtr(ib->operands, 0); const BrigOperand *baseOp = obj->getOperand(op_offs); if ((baseOp->kind == BRIG_KIND_OPERAND_CONSTANT_BYTES) || (baseOp->kind == BRIG_KIND_OPERAND_REGISTER)) { src.init(op_offs, obj); } op_offs = obj->getOperandPtr(ib->operands, 1); addr.init(op_offs, obj); } else { const BrigInstAtomic *at = (const BrigInstAtomic*)ib; segment = (BrigSegment)at->segment; memoryScope = (BrigMemoryScope)at->memoryScope; memoryOrder = (BrigMemoryOrder)at->memoryOrder; equivClass = 0; unsigned op_offs = obj->getOperandPtr(ib->operands, 0); addr.init(op_offs, obj); op_offs = obj->getOperandPtr(ib->operands, 1); src.init(op_offs, obj); } switch (memoryOrder) { case BRIG_MEMORY_ORDER_NONE: setFlag(NoOrder); break; case BRIG_MEMORY_ORDER_RELAXED: setFlag(RelaxedOrder); break; case BRIG_MEMORY_ORDER_SC_ACQUIRE: setFlag(Acquire); break; case BRIG_MEMORY_ORDER_SC_RELEASE: setFlag(Release); break; case BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE: setFlag(AcquireRelease); break; default: fatal("StInst has bad memory order type\n"); } switch (memoryScope) { case BRIG_MEMORY_SCOPE_NONE: setFlag(NoScope); break; case BRIG_MEMORY_SCOPE_WORKITEM: setFlag(WorkitemScope); break; case BRIG_MEMORY_SCOPE_WORKGROUP: 
setFlag(WorkgroupScope); break; case BRIG_MEMORY_SCOPE_AGENT: setFlag(DeviceScope); break; case BRIG_MEMORY_SCOPE_SYSTEM: setFlag(SystemScope); break; default: fatal("StInst has bad memory scope type\n"); } switch (segment) { case BRIG_SEGMENT_GLOBAL: setFlag(GlobalSegment); break; case BRIG_SEGMENT_GROUP: setFlag(GroupSegment); break; case BRIG_SEGMENT_PRIVATE: setFlag(PrivateSegment); break; case BRIG_SEGMENT_READONLY: setFlag(ReadOnlySegment); break; case BRIG_SEGMENT_SPILL: setFlag(SpillSegment); break; case BRIG_SEGMENT_FLAT: setFlag(Flat); break; case BRIG_SEGMENT_ARG: setFlag(ArgSegment); break; default: panic("St: segment %d not supported\n", segment); } } int numDstRegOperands() override { return 0; } int numSrcRegOperands() override { return src.isVectorRegister() + this->addr.isVectorRegister(); } int getNumOperands() override { if (this->addr.isVectorRegister() || this->addr.isScalarRegister()) return 2; else return 1; } bool isVectorRegister(int operandIndex) override { assert(operandIndex >= 0 && operandIndex < getNumOperands()); return !operandIndex ? src.isVectorRegister() : this->addr.isVectorRegister(); } bool isCondRegister(int operandIndex) override { assert(operandIndex >= 0 && operandIndex < getNumOperands()); return !operandIndex ? src.isCondRegister() : this->addr.isCondRegister(); } bool isScalarRegister(int operandIndex) override { assert(operandIndex >= 0 && operandIndex < getNumOperands()); return !operandIndex ? src.isScalarRegister() : this->addr.isScalarRegister(); } bool isSrcOperand(int operandIndex) override { assert((operandIndex >= 0) && (operandIndex < getNumOperands())); return true; } bool isDstOperand(int operandIndex) override { return false; } int getOperandSize(int operandIndex) override { assert(operandIndex >= 0 && operandIndex < getNumOperands()); return !operandIndex ? src.opSize() : this->addr.opSize(); } int getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst) override { assert(operandIndex >= 0 && operandIndex < getNumOperands()); return !operandIndex ? 
src.regIndex() : this->addr.regIndex(); } }; template class StInst : public StInstBase, public MemInst { public: typename SrcDataType::OperandType::SrcOperand src_vect[4]; uint16_t num_src_operands; void generateDisassembly() override; StInst(const Brig::BrigInstBase *ib, const BrigObject *obj, const char *_opcode, int srcIdx) : StInstBase(ib, obj, _opcode), MemInst(SrcDataType::memType) { init_addr(&this->addr); BrigRegOperandInfo rinfo; unsigned op_offs = obj->getOperandPtr(ib->operands,srcIdx); const Brig::BrigOperand *baseOp = obj->getOperand(op_offs); if (baseOp->kind == Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES) { const Brig::BrigOperandConstantBytes *op = (Brig::BrigOperandConstantBytes*)baseOp; rinfo = BrigRegOperandInfo((Brig::BrigKind16_t)op->base.kind, Brig::BRIG_TYPE_NONE); } else { rinfo = findRegDataType(op_offs, obj); } if (baseOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) { const Brig::BrigOperandOperandList *brigRegVecOp = (const Brig::BrigOperandOperandList*)baseOp; num_src_operands = *((unsigned*)obj->getData(brigRegVecOp->elements)) / 4; assert(num_src_operands <= 4); } else { num_src_operands = 1; } if (num_src_operands > 1) { assert(baseOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST); for (int i = 0; i < num_src_operands; ++i) { src_vect[i].init_from_vect(op_offs, obj, i); } } } void initiateAcc(GPUDynInstPtr gpuDynInst) override { // before performing a store, check if this store has // release semantics, and if so issue a release first if (!this->isLocalMem()) { if (gpuDynInst->computeUnit()->shader->separate_acquire_release && gpuDynInst->isRelease()) { gpuDynInst->statusBitVector = VectorMask(1); gpuDynInst->execContinuation = &GPUStaticInst::execSt; gpuDynInst->useContinuation = true; // create request RequestPtr req = std::make_shared(0, 0, 0, 0, gpuDynInst->computeUnit()->masterId(), 0, gpuDynInst->wfDynId); req->setFlags(Request::RELEASE); gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req); return; } } // if there is no release semantic, perform stores immediately execSt(gpuDynInst); } // stores don't write anything back, so there is nothing // to do here. we only override this method to avoid the // fatal in the base class implementation void completeAcc(GPUDynInstPtr gpuDynInst) override { } private: // execSt may be called through a continuation // if the store had release semantics. 
see comment for // execSt in gpu_static_inst.hh void execSt(GPUDynInstPtr gpuDynInst) override { typedef typename MemDataType::CType c0; gpuDynInst->statusBitVector = gpuDynInst->exec_mask; if (num_src_operands > 1) { for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) if (gpuDynInst->exec_mask[i]) gpuDynInst->statusVector.push_back(num_src_operands); else gpuDynInst->statusVector.push_back(0); } for (int k = 0; k < num_src_operands; ++k) { c0 *d = &((c0*)gpuDynInst->d_data) [k * gpuDynInst->computeUnit()->wfSize()]; for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) { if (gpuDynInst->exec_mask[i]) { Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0); if (this->isLocalMem()) { //store to shared memory gpuDynInst->wavefront()->ldsChunk->write(vaddr, *d); } else { RequestPtr req = std::make_shared( 0, vaddr, sizeof(c0), 0, gpuDynInst->computeUnit()->masterId(), 0, gpuDynInst->wfDynId); gpuDynInst->setRequestFlags(req); PacketPtr pkt = new Packet(req, MemCmd::WriteReq); pkt->dataStatic(d); // translation is performed in sendRequest() // the request will be finished when the store completes gpuDynInst->useContinuation = false; gpuDynInst->computeUnit()->sendRequest(gpuDynInst, i, pkt); } } ++d; } } gpuDynInst->updateStats(); } public: bool isVectorRegister(int operandIndex) override { assert((operandIndex >= 0) && (operandIndex < getNumOperands())); if (operandIndex == num_src_operands) return this->addr.isVectorRegister(); if (num_src_operands > 1) return src_vect[operandIndex].isVectorRegister(); else if (num_src_operands == 1) return StInstBase::src.isVectorRegister(); return false; } bool isCondRegister(int operandIndex) override { assert((operandIndex >= 0) && (operandIndex < getNumOperands())); if (operandIndex == num_src_operands) return this->addr.isCondRegister(); if (num_src_operands > 1) return src_vect[operandIndex].isCondRegister(); else if (num_src_operands == 1) return StInstBase::src.isCondRegister(); return false; } bool isScalarRegister(int operandIndex) override { assert((operandIndex >= 0) && (operandIndex < getNumOperands())); if (operandIndex == num_src_operands) return this->addr.isScalarRegister(); if (num_src_operands > 1) return src_vect[operandIndex].isScalarRegister(); else if (num_src_operands == 1) return StInstBase::src.isScalarRegister(); return false; } bool isSrcOperand(int operandIndex) override { assert((operandIndex >= 0) && (operandIndex < getNumOperands())); return true; } bool isDstOperand(int operandIndex) override { return false; } int getOperandSize(int operandIndex) override { assert((operandIndex >= 0) && (operandIndex < getNumOperands())); if (operandIndex == num_src_operands) return this->addr.opSize(); if (num_src_operands > 1) return src_vect[operandIndex].opSize(); else if (num_src_operands == 1) return StInstBase::src.opSize(); return 0; } int getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst) override { assert((operandIndex >= 0) && (operandIndex < getNumOperands())); if (operandIndex == num_src_operands) return this->addr.regIndex(); if (num_src_operands > 1) return src_vect[operandIndex].regIndex(); else if (num_src_operands == 1) return StInstBase::src.regIndex(); return -1; } int getNumOperands() override { if (this->addr.isVectorRegister() || this->addr.isScalarRegister()) return num_src_operands + 1; else return num_src_operands; } void execute(GPUDynInstPtr gpuDynInst) override; }; template GPUStaticInst* decodeSt(const Brig::BrigInstBase *ib, const BrigObject *obj) { int srcIdx = 0; int destIdx = 1; if 
(ib->opcode == Brig::BRIG_OPCODE_ATOMIC || ib->opcode == Brig::BRIG_OPCODE_ATOMICNORET) { srcIdx = 1; destIdx = 0; } unsigned op_offs = obj->getOperandPtr(ib->operands,destIdx); BrigRegOperandInfo tmp = findRegDataType(op_offs, obj); if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) { return new StInst(ib, obj, "st", srcIdx); } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER) { // V2/V4 not allowed switch (tmp.regKind) { case Brig::BRIG_REGISTER_KIND_SINGLE: return new StInst(ib, obj, "st", srcIdx); case Brig::BRIG_REGISTER_KIND_DOUBLE: return new StInst(ib, obj, "st", srcIdx); default: fatal("Bad st register operand type %d\n", tmp.type); } } else { fatal("Bad st register operand kind %d\n", tmp.kind); } } template class AtomicInstBase : public HsailGPUStaticInst { public: typename OperandType::DestOperand dest; typename OperandType::SrcOperand src[NumSrcOperands]; AddrOperandType addr; Brig::BrigSegment segment; Brig::BrigMemoryOrder memoryOrder; Brig::BrigAtomicOperation atomicOperation; Brig::BrigMemoryScope memoryScope; Brig::BrigOpcode opcode; AtomicInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj, const char *_opcode) : HsailGPUStaticInst(obj, _opcode) { using namespace Brig; const BrigInstAtomic *at = (const BrigInstAtomic*)ib; segment = (BrigSegment)at->segment; memoryScope = (BrigMemoryScope)at->memoryScope; memoryOrder = (BrigMemoryOrder)at->memoryOrder; atomicOperation = (BrigAtomicOperation)at->atomicOperation; opcode = (BrigOpcode)ib->opcode; assert(opcode == Brig::BRIG_OPCODE_ATOMICNORET || opcode == Brig::BRIG_OPCODE_ATOMIC); setFlag(MemoryRef); if (opcode == Brig::BRIG_OPCODE_ATOMIC) { setFlag(AtomicReturn); } else { setFlag(AtomicNoReturn); } switch (memoryOrder) { case BRIG_MEMORY_ORDER_NONE: setFlag(NoOrder); break; case BRIG_MEMORY_ORDER_RELAXED: setFlag(RelaxedOrder); break; case BRIG_MEMORY_ORDER_SC_ACQUIRE: setFlag(Acquire); break; case BRIG_MEMORY_ORDER_SC_RELEASE: setFlag(Release); break; case BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE: setFlag(AcquireRelease); break; default: fatal("AtomicInst has bad memory order type\n"); } switch (memoryScope) { case BRIG_MEMORY_SCOPE_NONE: setFlag(NoScope); break; case BRIG_MEMORY_SCOPE_WORKITEM: setFlag(WorkitemScope); break; case BRIG_MEMORY_SCOPE_WORKGROUP: setFlag(WorkgroupScope); break; case BRIG_MEMORY_SCOPE_AGENT: setFlag(DeviceScope); break; case BRIG_MEMORY_SCOPE_SYSTEM: setFlag(SystemScope); break; default: fatal("AtomicInst has bad memory scope type\n"); } switch (atomicOperation) { case Brig::BRIG_ATOMIC_AND: setFlag(AtomicAnd); break; case Brig::BRIG_ATOMIC_OR: setFlag(AtomicOr); break; case Brig::BRIG_ATOMIC_XOR: setFlag(AtomicXor); break; case Brig::BRIG_ATOMIC_CAS: setFlag(AtomicCAS); break; case Brig::BRIG_ATOMIC_EXCH: setFlag(AtomicExch); break; case Brig::BRIG_ATOMIC_ADD: setFlag(AtomicAdd); break; case Brig::BRIG_ATOMIC_WRAPINC: setFlag(AtomicInc); break; case Brig::BRIG_ATOMIC_WRAPDEC: setFlag(AtomicDec); break; case Brig::BRIG_ATOMIC_MIN: setFlag(AtomicMin); break; case Brig::BRIG_ATOMIC_MAX: setFlag(AtomicMax); break; case Brig::BRIG_ATOMIC_SUB: setFlag(AtomicSub); break; default: fatal("Bad BrigAtomicOperation code %d\n", atomicOperation); } switch (segment) { case BRIG_SEGMENT_GLOBAL: setFlag(GlobalSegment); break; case BRIG_SEGMENT_GROUP: setFlag(GroupSegment); break; case BRIG_SEGMENT_FLAT: setFlag(Flat); break; default: panic("Atomic: segment %d not supported\n", segment); } if (HasDst) { unsigned op_offs = obj->getOperandPtr(ib->operands, 0); dest.init(op_offs, obj); op_offs = 
obj->getOperandPtr(ib->operands, 1); addr.init(op_offs, obj); for (int i = 0; i < NumSrcOperands; ++i) { op_offs = obj->getOperandPtr(ib->operands, i + 2); src[i].init(op_offs, obj); } } else { unsigned op_offs = obj->getOperandPtr(ib->operands, 0); addr.init(op_offs, obj); for (int i = 0; i < NumSrcOperands; ++i) { op_offs = obj->getOperandPtr(ib->operands, i + 1); src[i].init(op_offs, obj); } } } int numSrcRegOperands() { int operands = 0; for (int i = 0; i < NumSrcOperands; i++) { if (src[i].isVectorRegister()) { operands++; } } if (addr.isVectorRegister()) operands++; return operands; } int numDstRegOperands() { return dest.isVectorRegister(); } int getNumOperands() { if (addr.isVectorRegister()) return(NumSrcOperands + 2); return(NumSrcOperands + 1); } bool isVectorRegister(int operandIndex) { assert((operandIndex >= 0) && (operandIndex < getNumOperands())); if (operandIndex < NumSrcOperands) return src[operandIndex].isVectorRegister(); else if (operandIndex == NumSrcOperands) return(addr.isVectorRegister()); else return dest.isVectorRegister(); } bool isCondRegister(int operandIndex) { assert((operandIndex >= 0) && (operandIndex < getNumOperands())); if (operandIndex < NumSrcOperands) return src[operandIndex].isCondRegister(); else if (operandIndex == NumSrcOperands) return(addr.isCondRegister()); else return dest.isCondRegister(); } bool isScalarRegister(int operandIndex) { assert((operandIndex >= 0) && (operandIndex < getNumOperands())); if (operandIndex < NumSrcOperands) return src[operandIndex].isScalarRegister(); else if (operandIndex == NumSrcOperands) return(addr.isScalarRegister()); else return dest.isScalarRegister(); } bool isSrcOperand(int operandIndex) { assert((operandIndex >= 0) && (operandIndex < getNumOperands())); if (operandIndex < NumSrcOperands) return true; else if (operandIndex == NumSrcOperands) return(addr.isVectorRegister()); else return false; } bool isDstOperand(int operandIndex) { if (operandIndex <= NumSrcOperands) return false; else return true; } int getOperandSize(int operandIndex) { assert((operandIndex >= 0) && (operandIndex < getNumOperands())); if (operandIndex < NumSrcOperands) return(src[operandIndex].opSize()); else if (operandIndex == NumSrcOperands) return(addr.opSize()); else return(dest.opSize()); } int getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst) { assert((operandIndex >= 0) && (operandIndex < getNumOperands())); if (operandIndex < NumSrcOperands) return(src[operandIndex].regIndex()); else if (operandIndex == NumSrcOperands) return(addr.regIndex()); else return(dest.regIndex()); return -1; } }; template class AtomicInst : public AtomicInstBase, public MemInst { public: void generateDisassembly() override; AtomicInst(const Brig::BrigInstBase *ib, const BrigObject *obj, const char *_opcode) : AtomicInstBase (ib, obj, _opcode), MemInst(MemDataType::memType) { init_addr(&this->addr); } void initiateAcc(GPUDynInstPtr gpuDynInst) override { // before doing the RMW, check if this atomic has // release semantics, and if so issue a release first if (!this->isLocalMem()) { if (gpuDynInst->computeUnit()->shader->separate_acquire_release && (gpuDynInst->isRelease() || gpuDynInst->isAcquireRelease())) { gpuDynInst->statusBitVector = VectorMask(1); gpuDynInst->execContinuation = &GPUStaticInst::execAtomic; gpuDynInst->useContinuation = true; // create request RequestPtr req = std::make_shared(0, 0, 0, 0, gpuDynInst->computeUnit()->masterId(), 0, gpuDynInst->wfDynId); req->setFlags(Request::RELEASE); 
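                    // the release is modeled as a zero-sized fence request
                    // sent ahead of the RMW; when that fence completes,
                    // execAtomic runs as the continuation and issues the
                    // actual atomic operation to memory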
gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req); return; } } // if there is no release semantic, execute the RMW immediately execAtomic(gpuDynInst); } void completeAcc(GPUDynInstPtr gpuDynInst) override { // if this is not an atomic return op, then we // have nothing more to do. if (this->isAtomicRet()) { // the size of the src operands and the // memory being operated on must match // for HSAIL atomics - this assumption may // not apply to all ISAs typedef typename MemDataType::CType CType; Wavefront *w = gpuDynInst->wavefront(); int dst = this->dest.regIndex(); std::vector regVec; // virtual->physical VGPR mapping int physVgpr = w->remap(dst, sizeof(CType), 1); regVec.push_back(physVgpr); CType *p1 = &((CType*)gpuDynInst->d_data)[0]; for (int i = 0; i < w->computeUnit->wfSize(); ++i) { if (gpuDynInst->exec_mask[i]) { DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: " "$%s%d <- %d global ld done (src = wavefront " "ld inst)\n", w->computeUnit->cu_id, w->simdId, w->wfSlotId, i, sizeof(CType) == 4 ? "s" : "d", dst, *p1); // write the value into the physical VGPR. This is a // purely functional operation. No timing is modeled. w->computeUnit->vrf[w->simdId]->write(physVgpr, *p1, i); } ++p1; } // Schedule the write operation of the load data on the VRF. // This simply models the timing aspect of the VRF write operation. // It does not modify the physical VGPR. int loadVrfBankConflictCycles = gpuDynInst->computeUnit()-> vrf[w->simdId]->exec(gpuDynInst->seqNum(), w, regVec, sizeof(CType), gpuDynInst->time); if (this->isGlobalMem()) { gpuDynInst->computeUnit()->globalMemoryPipe .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles); } else { assert(this->isLocalMem()); gpuDynInst->computeUnit()->localMemoryPipe .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles); } } } void execute(GPUDynInstPtr gpuDynInst) override; private: // execAtomic may be called through a continuation // if the RMW had release semantics. 
see comment for // execContinuation in gpu_dyn_inst.hh void execAtomic(GPUDynInstPtr gpuDynInst) override { gpuDynInst->statusBitVector = gpuDynInst->exec_mask; typedef typename MemDataType::CType c0; c0 *d = &((c0*) gpuDynInst->d_data)[0]; c0 *e = &((c0*) gpuDynInst->a_data)[0]; c0 *f = &((c0*) gpuDynInst->x_data)[0]; for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) { if (gpuDynInst->exec_mask[i]) { Addr vaddr = gpuDynInst->addr[i]; if (this->isLocalMem()) { Wavefront *wavefront = gpuDynInst->wavefront(); *d = wavefront->ldsChunk->read(vaddr); if (this->isAtomicAdd()) { wavefront->ldsChunk->write(vaddr, wavefront->ldsChunk->read(vaddr) + (*e)); } else if (this->isAtomicSub()) { wavefront->ldsChunk->write(vaddr, wavefront->ldsChunk->read(vaddr) - (*e)); } else if (this->isAtomicMax()) { wavefront->ldsChunk->write(vaddr, std::max(wavefront->ldsChunk->read(vaddr), (*e))); } else if (this->isAtomicMin()) { wavefront->ldsChunk->write(vaddr, std::min(wavefront->ldsChunk->read(vaddr), (*e))); } else if (this->isAtomicAnd()) { wavefront->ldsChunk->write(vaddr, wavefront->ldsChunk->read(vaddr) & (*e)); } else if (this->isAtomicOr()) { wavefront->ldsChunk->write(vaddr, wavefront->ldsChunk->read(vaddr) | (*e)); } else if (this->isAtomicXor()) { wavefront->ldsChunk->write(vaddr, wavefront->ldsChunk->read(vaddr) ^ (*e)); } else if (this->isAtomicInc()) { wavefront->ldsChunk->write(vaddr, wavefront->ldsChunk->read(vaddr) + 1); } else if (this->isAtomicDec()) { wavefront->ldsChunk->write(vaddr, wavefront->ldsChunk->read(vaddr) - 1); } else if (this->isAtomicExch()) { wavefront->ldsChunk->write(vaddr, (*e)); } else if (this->isAtomicCAS()) { wavefront->ldsChunk->write(vaddr, (wavefront->ldsChunk->read(vaddr) == (*e)) ? (*f) : wavefront->ldsChunk->read(vaddr)); } else { fatal("Unrecognized or invalid HSAIL atomic op " "type.\n"); } } else { RequestPtr req = std::make_shared(0, vaddr, sizeof(c0), 0, gpuDynInst->computeUnit()->masterId(), 0, gpuDynInst->wfDynId, gpuDynInst->makeAtomicOpFunctor(e, f)); gpuDynInst->setRequestFlags(req); PacketPtr pkt = new Packet(req, MemCmd::SwapReq); pkt->dataStatic(d); if (gpuDynInst->computeUnit()->shader-> separate_acquire_release && (gpuDynInst->isAcquire())) { // if this atomic has acquire semantics, // schedule the continuation to perform an // acquire after the RMW completes gpuDynInst->execContinuation = &GPUStaticInst::execAtomicAcq; gpuDynInst->useContinuation = true; } else { // the request will be finished when the RMW completes gpuDynInst->useContinuation = false; } // translation is performed in sendRequest() gpuDynInst->computeUnit()->sendRequest(gpuDynInst, i, pkt); } } ++d; ++e; ++f; } gpuDynInst->updateStats(); } // execAtomicACq will always be called through a continuation. 
// see comment for execContinuation in gpu_dyn_inst.hh void execAtomicAcq(GPUDynInstPtr gpuDynInst) override { // after performing the RMW, check to see if this instruction // has acquire semantics, and if so, issue an acquire if (!this->isLocalMem()) { if (gpuDynInst->computeUnit()->shader->separate_acquire_release && gpuDynInst->isAcquire()) { gpuDynInst->statusBitVector = VectorMask(1); // the request will be finished when // the acquire completes gpuDynInst->useContinuation = false; // create request RequestPtr req = std::make_shared(0, 0, 0, 0, gpuDynInst->computeUnit()->masterId(), 0, gpuDynInst->wfDynId); req->setFlags(Request::ACQUIRE); gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req); } } } }; template GPUStaticInst* constructAtomic(const Brig::BrigInstBase *ib, const BrigObject *obj) { const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib; if (at->atomicOperation == Brig::BRIG_ATOMIC_LD) { return decodeLd(ib, obj); } else if (at->atomicOperation == Brig::BRIG_ATOMIC_ST) { switch (ib->type) { case Brig::BRIG_TYPE_B8: return decodeSt(ib, obj); case Brig::BRIG_TYPE_B16: return decodeSt(ib, obj); case Brig::BRIG_TYPE_B32: return decodeSt(ib, obj); case Brig::BRIG_TYPE_B64: return decodeSt(ib, obj); default: fatal("AtomicSt: Operand type mismatch %d\n", ib->type); } } else { if ((Brig::BrigOpcode)ib->opcode == Brig::BRIG_OPCODE_ATOMICNORET) return new AtomicInst(ib, obj, "atomicnoret"); else return new AtomicInst(ib, obj, "atomic"); } } template GPUStaticInst* decodeAtomicHelper(const Brig::BrigInstBase *ib, const BrigObject *obj) { unsigned addrIndex = (Brig::BrigOpcode)ib->opcode == Brig::BRIG_OPCODE_ATOMICNORET ? 0 : 1; unsigned op_offs = obj->getOperandPtr(ib->operands,addrIndex); BrigRegOperandInfo tmp = findRegDataType(op_offs, obj); if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) { return constructAtomic(ib, obj); } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER) { // V2/V4 not allowed switch (tmp.regKind) { case Brig::BRIG_REGISTER_KIND_SINGLE: return constructAtomic(ib, obj); case Brig::BRIG_REGISTER_KIND_DOUBLE: return constructAtomic(ib, obj); default: fatal("Bad atomic register operand type %d\n", tmp.type); } } else { fatal("Bad atomic register operand kind %d\n", tmp.kind); } } template GPUStaticInst* decodeAtomic(const Brig::BrigInstBase *ib, const BrigObject *obj) { const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib; if (at->atomicOperation == Brig::BRIG_ATOMIC_CAS) { return decodeAtomicHelper(ib, obj); } else { return decodeAtomicHelper(ib, obj); } } template GPUStaticInst* decodeAtomicNoRet(const Brig::BrigInstBase *ib, const BrigObject *obj) { const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib; if (at->atomicOperation == Brig::BRIG_ATOMIC_CAS) { return decodeAtomicHelper(ib, obj); } else { return decodeAtomicHelper(ib, obj); } } } // namespace HsailISA #endif // __ARCH_HSAIL_INSTS_MEM_HH__